In [1]:
import gzip
import shutil
import os
import json
import csv

In [None]:
input_dir = 'abo-listings/listings/metadata/'
output_dir = 'abo-listings/listings/extracted_metadata/'

# The destination must exist before any archive is unpacked into it.
os.makedirs(output_dir, exist_ok=True)

# Decompress every '*.json.gz' file from input_dir into output_dir,
# keeping the same name minus the trailing '.gz'.
for archive_name in os.listdir(input_dir):
    if not archive_name.endswith('.json.gz'):
        continue  # not a gzipped JSON archive

    extracted_name = archive_name[:-3]  # drop the 3-character '.gz' suffix
    source_path = os.path.join(input_dir, archive_name)
    target_path = os.path.join(output_dir, extracted_name)

    # gzip.open yields decompressed bytes; copyfileobj pipes them to disk.
    with gzip.open(source_path, 'rb') as compressed, open(target_path, 'wb') as extracted:
        shutil.copyfileobj(compressed, extracted)

    print(f"Extracted {archive_name} → {extracted_name}")

print("All extractions complete.")

In [None]:
json_dir = 'abo-listings/listings/extracted_metadata/'

# For each extracted file, report whether it parses as one JSON array or as
# newline-delimited JSON objects. Failures are printed, never raised.
for filename in os.listdir(json_dir):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(json_dir, filename)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()

        if content.startswith('['):
            # A leading '[' is taken to mean the whole file is one JSON array.
            data = json.loads(content)
            print(f"{filename} loaded as JSON array. Length: {len(data)}")
        else:
            # Otherwise every non-blank line is parsed as its own document.
            stripped_lines = (raw_line.strip() for raw_line in content.splitlines())
            items = [json.loads(text) for text in stripped_lines if text]
            print(f"{filename} loaded as {len(items)} separate JSON objects (line by line).")

    except json.JSONDecodeError as e:
        print(f"{filename} failed to load: {e}")
    except Exception as e:
        print(f"{filename} encountered an error: {e}")

print("JSON structure check complete.")

In [None]:
file_path = 'abo-listings/listings/extracted_metadata/listings_3.json'

print(f"Inspecting {file_path}")

# Read the raw content
with open(file_path, 'r', encoding='utf-8') as f:
    raw = f.read()

# Show first 500 characters for manual inspection
print("\n--- First 500 characters ---")
print(raw[:500])
print("\n---------------------------")

# First attempt: the whole file is a single JSON document.
try:
    data = json.loads(raw)
    print(" Loaded as single JSON object")
    print(f"Type: {type(data)}")
    if isinstance(data, dict):
        print(f"Top-level keys: {list(data.keys())}")
    elif isinstance(data, list):
        print(f"List length: {len(data)}")
        # Only peek at the first element when there is one; an empty array
        # would otherwise raise IndexError on data[0].
        if data:
            print(f"First item type: {type(data[0])}")
            if isinstance(data[0], dict):
                print(f"First item keys: {list(data[0].keys())}")
except json.JSONDecodeError as e:
    print(f"Failed to load as single JSON: {e}")

    # Second attempt: newline-delimited JSON, one document per non-blank line.
    print("\nTrying to parse line by line...")
    items = []
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            items.append(json.loads(line))
        except Exception as sub_e:
            # Stop at the first bad line; what was parsed so far is still reported.
            print(f"Failed to parse line: {sub_e}")
            break

    print(f"Parsed {len(items)} JSON objects (line by line)")
    if items:
        print(f"First item type: {type(items[0])}")
        if isinstance(items[0], dict):
            print(f"First item keys: {list(items[0].keys())}")


In [None]:
# Where the extracted JSON files are read from and where the CSVs are written.
input_dir = 'abo-listings/listings/extracted_metadata'
output_dir = 'abo-listings/listings/filtered_metadata'

os.makedirs(output_dir, exist_ok=True)

# Column order of every CSV produced by this cell.
header = [
    'main_image_id',
    'overall_description',
    'colour_description',
    'other_description',
    'material_description',
]

# Function to filter 'value' fields by language_tag (or accept if no language_tag)
def get_filtered_values(entries):
    """Return the 'value' of every entry that is untagged or tagged as English.

    An entry contributes its 'value' when that value is truthy and its
    'language_tag' is either absent/None or one of 'en_IN' / 'en_US'.

    Args:
        entries: iterable of mappings supporting ``.get``.

    Returns:
        list: the selected values, in the order the entries were given.
    """
    value_and_tag = ((item.get('value'), item.get('language_tag')) for item in entries)
    return [
        text
        for text, tag in value_and_tag
        if text and (tag is None or tag in ['en_IN', 'en_US'])
    ]

# Function to filter 'standardized_values' by language_tag (or accept if no language_tag)
def get_filtered_standardized_values(color_entries):
    """Flatten the 'standardized_values' of untagged or English-tagged entries.

    Entries whose 'language_tag' is something other than None, 'en_IN' or
    'en_US' are ignored; an accepted entry without 'standardized_values'
    contributes nothing.

    Args:
        color_entries: iterable of mappings supporting ``.get``.

    Returns:
        list: all standardized values from the accepted entries, in order.
    """
    collected = []
    for item in color_entries:
        tag = item.get('language_tag')
        if tag is not None and tag not in ['en_IN', 'en_US']:
            continue  # tagged with a language we do not keep
        collected += item.get('standardized_values', [])
    return collected

# Process each JSON file in the input directory
# Convert every line-delimited JSON file in input_dir into a filtered CSV.
# sorted() makes the processing order (and the log) reproducible between runs.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith('.json'):
        continue

    input_file = os.path.join(input_dir, filename)
    # splitext swaps only the trailing extension; str.replace would also
    # rewrite any earlier '.json' that happens to appear inside the name.
    output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + '.csv')

    print(f"Processing {input_file} → {output_file}")

    # Load line-delimited JSON; undecodable lines are reported and skipped.
    records = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank lines are not records, so do not report them as failures
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Failed to load a line in {filename}: {e}")

    required_keys = ['brand', 'bullet_point', 'color', 'model_name', 'item_name',
                     'product_type', 'main_image_id', 'item_keywords', 'country']

    # Keep only JSON objects that carry every required key and are sold in IN/US.
    # The isinstance check stops a stray non-object line (e.g. `null`) from
    # raising TypeError in the key test below.
    filtered_records = [
        record for record in records
        if isinstance(record, dict)
        and all(key in record for key in required_keys)
        and record.get('country') in ['IN', 'US']
        # and 'item_dimensions' not in record
    ]

    print(f" → Total matching records: {len(filtered_records)}")

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for record in filtered_records:
            overall_description = get_filtered_values(record.get('bullet_point', []))

            # Standardized colour names first, then the free-text colour values.
            colour_entries = record.get('color', [])
            colour_description = (
                get_filtered_standardized_values(colour_entries)
                + get_filtered_values(colour_entries)
            )

            other_description = []
            for field in ['product_type', 'item_keywords']:
                other_description.extend(get_filtered_values(record.get(field, [])))

            # 'material' is optional; a missing key yields an empty description.
            material_description = get_filtered_values(record.get('material', []))

            writer.writerow([
                record.get('main_image_id'),
                '; '.join(overall_description),
                '; '.join(colour_description),
                '; '.join(other_description),
                '; '.join(material_description),
            ])

    print(f" → Saved {len(filtered_records)} records to {output_file}\n")


In [None]:
!pip install --upgrade pip
!pip install google-genai

In [None]:
from google import genai
from google.genai import Client, types
import time

# Exploratory check: print the attribute names of the imported SDK objects.
for sdk_object in (genai, Client, types):
    print(dir(sdk_object))

In [None]:
# Read the API key from the GEMINI_API_KEY environment variable so it never
# has to be typed into (and saved with) the notebook. When the variable is
# unset this passes an empty string, exactly like the original placeholder.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", ""))

# Request budget for one run and the pause (in seconds) between two requests.
MAX_DAILY_REQUESTS = 1500
DELAY_BETWEEN_REQUESTS = 60

# Number of API requests issued so far; process_records updates it via `global`.
requests_made = 0

def load_progress(progress_file):
    """Return the row index stored in ``progress_file``.

    Args:
        progress_file: path of a text file holding a single integer.

    Returns:
        int: the stored index, or 0 when the file is missing or empty.

    Raises:
        ValueError: if the file contains text that is not an integer.
    """
    if not os.path.exists(progress_file):
        return 0

    with open(progress_file, 'r') as f:
        stored = f.read().strip()

    if not stored:
        # An empty file (e.g. a write that was cut short) carries no progress;
        # treat it like a missing file instead of failing on int('').
        return 0

    try:
        return int(stored)
    except ValueError:
        # Garbage is not silently reset to 0: that would redo already-paid-for work.
        raise ValueError(
            f"Progress file {progress_file!r} does not contain an integer: {stored!r}"
        ) from None

def save_progress(progress_file, index):
    """Persist ``index`` as the only content of ``progress_file``.

    The value is first written to a sibling '<progress_file>.tmp' file and
    then moved over the real file with os.replace, so an interruption during
    the write cannot leave a truncated checkpoint behind.

    Args:
        progress_file: path of the checkpoint file to (over)write.
        index: value to store; it is written as ``str(index)``.
    """
    tmp_path = f"{progress_file}.tmp"
    with open(tmp_path, 'w') as f:
        f.write(str(index))
    os.replace(tmp_path, progress_file)

def query_gemini_api(image_bytes, combined_description):
    """Ask the Gemini model for five visual question/answer pairs about an image.

    Args:
        image_bytes: raw image content; it is sent with MIME type 'image/jpeg'.
        combined_description: product description text embedded in the prompt.

    Returns:
        The ``text`` of the model response, or None if the request raised
        any exception (the error is printed, not re-raised).
    """
    # The prompt is assembled from individual lines joined with newlines.
    prompt_lines = [
        "You will receive an image along with a short product description.",
        f"Refer to this product description for context: {combined_description}",
        "Create exactly 5 visually-based questions that increase in difficulty and are varied in nature.",
        "Each question must be answerable *solely* through visual inspection of the image — do not use external knowledge or assumptions.",
        "Incorporate a mix of visual features across questions, such as: color, number of elements, shapes, positioning, relative size, and any visible text.",
        "Ensure a balance in difficulty:",
        "- 2 questions should be easy (e.g., identify a color or count elements)",
        "- 2 should be of medium complexity (e.g., spatial arrangement, size comparisons)",
        "- 1 should be more difficult, requiring close observation or visual reasoning (e.g., identifying a main feature or deducing purpose from form)",
        "Avoid asking about non-visible attributes like materials or internal functions.",
        "Each answer must be a *single word* and answers should not all be 'yes' or 'no'.",
        "Format your output exactly like this — do not include any extra comments or explanations:",
        "Question 1: <your question>",
        "Answer 1: <your one-word answer>",
    ]
    prompt_text = "\n".join(prompt_lines)

    try:
        image_part = types.Part.from_bytes(data=image_bytes, mime_type='image/jpeg')
        reply = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[image_part, prompt_text],
        )
        return reply.text
    except Exception as e:
        print(f"Error querying Gemini API: {e}")
        return None

def _parse_question_answer_pairs(generated_text, expected_pairs=5):
    """Return [(question, answer), ...] parsed from the model reply, or None if malformed."""
    lines = [line.strip() for line in generated_text.strip().split('\n') if line.strip()]
    question_lines = [line for line in lines if line.lower().startswith('question')]
    answer_lines = [line for line in lines if line.lower().startswith('answer')]

    if len(question_lines) != expected_pairs or len(answer_lines) != expected_pairs:
        return None

    pairs = []
    for q_line, a_line in zip(question_lines, answer_lines):
        _, q_sep, question = q_line.partition(':')
        _, a_sep, answer = a_line.partition(':')
        if not q_sep or not a_sep:
            # A "Question ..."/"Answer ..." line without a colon has no payload
            # to extract; reject the whole reply rather than crash.
            return None
        pairs.append((question.strip(), answer.strip()))
    return pairs


def process_records(listings_csv_path, images_csv_path, images_base_path, output_file, progress_file):
    """Generate question/answer rows for listing images and append them to a CSV.

    The listings CSV is walked row by row, resuming at the index returned by
    ``load_progress(progress_file)``. For each row whose image can be found and
    read, one request is sent through ``query_gemini_api``; a well-formed reply
    (five questions and five answers) is written as five rows to ``output_file``.

    Args:
        listings_csv_path: CSV with the columns 'main_image_id',
            'overall_description', 'colour_description',
            'material_description' and 'other_description'.
        images_csv_path: CSV with the columns 'image_id' and 'path'.
        images_base_path: directory that the 'path' values are joined onto.
        output_file: CSV opened in append mode; a header row is written only
            when the file is empty.
        progress_file: checkpoint file read at start and rewritten after
            every request.

    Side effects:
        Increments the module-level ``requests_made`` once per request and
        stops when it reaches ``MAX_DAILY_REQUESTS``; sleeps
        ``DELAY_BETWEEN_REQUESTS`` seconds after each request while the
        budget is not exhausted.
    """
    global requests_made

    with open(images_csv_path, 'r', encoding='utf-8') as f:
        image_path_map = {row['image_id']: row['path'] for row in csv.DictReader(f)}

    print("Loaded image metadata successfully.")

    # dirname is '' for a bare file name, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Rows with an index below this one were handled in an earlier run.
    start_index = load_progress(progress_file)

    with open(output_file, 'a', newline='', encoding='utf-8') as f_out, \
            open(listings_csv_path, 'r', encoding='utf-8') as f_in:
        writer = csv.writer(f_out)
        if os.stat(output_file).st_size == 0:
            writer.writerow(['image_id', 'full_image_path', 'question', 'answer'])

        for current_index, row in enumerate(csv.DictReader(f_in)):
            if current_index < start_index:
                continue  # skip already processed

            if requests_made >= MAX_DAILY_REQUESTS:
                print("Reached daily request limit. Stopping.")
                break

            image_id = row['main_image_id']
            image_filename = image_path_map.get(image_id)
            if not image_filename:
                print(f"Image path not found for image_id: {image_id}")
                continue

            full_image_path = os.path.join(images_base_path, image_filename)
            if not os.path.exists(full_image_path):
                print(f"Image file does not exist: {full_image_path}")
                continue

            try:
                with open(full_image_path, "rb") as img_file:
                    image_bytes = img_file.read()
            except Exception as e:
                print(f"Failed to read image {full_image_path}: {e}")
                continue

            # Every part is separated by "; " (the Material part used to run
            # straight into "Other:" without a separator).
            combined_description = (
                f"Overall: {row['overall_description']}; "
                f"Color: {row['colour_description']}; "
                f"Material: {row['material_description']}; "
                f"Other: {row['other_description']}"
            )

            print(f"Sending request for image_id: {image_id}")

            generated_text = query_gemini_api(image_bytes, combined_description)

            if generated_text:
                # Parse the complete reply before writing anything, so a
                # malformed reply never leaves a partial set of rows behind.
                pairs = _parse_question_answer_pairs(generated_text)
                if pairs is not None:
                    writer.writerows(
                        [image_id, full_image_path, question, answer]
                        for question, answer in pairs
                    )
                    f_out.flush()
                    print(f"Processed image_id: {image_id}")
                else:
                    print(f"Unexpected format or count in response for image_id: {image_id}")
            else:
                print(f"Failed to generate questions for image_id: {image_id}")

            # The request counts against the budget and the row is checkpointed
            # as done whether or not usable questions came back.
            requests_made += 1
            save_progress(progress_file, current_index + 1)

            if requests_made < MAX_DAILY_REQUESTS:
                print(f"Sleeping {DELAY_BETWEEN_REQUESTS} seconds to respect rate limits...")
                time.sleep(DELAY_BETWEEN_REQUESTS)

import os

# Which filtered listings file to process and which question set this run produces.
current_working_filename = 'listings_3'
question_set_number = 'set_4'

# Inputs: the filtered listings CSV plus the image metadata and image files.
listings_csv_path = f'abo-listings/listings/filtered_metadata/{current_working_filename}.csv'
images_csv_path = 'abo-images-small/images/metadata/images.csv'
images_base_path = 'abo-images-small/images/small'

# Outputs: generated questions and the resume checkpoint, each in its own folder.
output_dir = 'generated_questions'
progress_dir = 'progress'
os.makedirs(output_dir, exist_ok=True)
os.makedirs(progress_dir, exist_ok=True)

output_file = os.path.join(
    output_dir,
    f'questions_{current_working_filename}_{question_set_number}.csv',
)
progress_file = os.path.join(
    progress_dir,
    f'progress_{current_working_filename}.txt',
)

process_records(
    listings_csv_path,
    images_csv_path,
    images_base_path,
    output_file,
    progress_file,
)