In [1]:
import gzip
import shutil
import os
import json
import csv

## Data extraction

In [11]:
# Decompress every *.json.gz archive found in source_dir into destination_dir.
source_dir = '/kaggle/input/abo-listings/'
destination_dir = '/kaggle/working/abo-listings/listings/extracted_metadata'

os.makedirs(destination_dir, exist_ok=True)

# Only gzip-compressed JSON archives are extracted; other files are ignored
compressed_files = [file for file in os.listdir(source_dir) if file.endswith('.json.gz')]

for compressed_file in compressed_files:
    src_path = os.path.join(source_dir, compressed_file)
    # Drop just the trailing '.gz'; str.replace would also delete a '.gz' occurring inside the name
    dest_filename = compressed_file[:-len('.gz')]
    dest_path = os.path.join(destination_dir, dest_filename)

    # copyfileobj streams the data in chunks instead of reading the whole archive at once
    with gzip.open(src_path, 'rb') as source_file, open(dest_path, 'wb') as target_file:
        shutil.copyfileobj(source_file, target_file)

    print(f"Decompressed: {compressed_file} → {dest_filename}")

print("Extraction process finished.")

Extraction process finished.


## Filtering data

In [12]:
# Input folder holding the JSON-lines listings and output folder for the filtered CSVs.
# NOTE(review): source_dir points at the input dataset, not at the extraction cell's
# destination_dir — confirm the .json files are expected to live there.
source_dir = '/kaggle/input/abo-listings'
target_dir = '/kaggle/working/abo-listings/listings/filtered_metadata'

# Create the output folder up front (no error if it already exists)
os.makedirs(target_dir, exist_ok=True)

# Header row written at the top of every filtered CSV
csv_columns = [
    'main_image_id',
    'overall_description',
    'colour_description',
    'other_description',
    'material_description',
]

# Utility: Extract 'value' fields filtered by language
def extract_values_by_language(entries):
    """Collect the 'value' of every entry that is untagged or tagged en_US / en_IN.

    Entries without a 'value' key are skipped. The input order is preserved.
    """
    accepted_tags = {'en_US', 'en_IN'}
    values = []
    for item in entries:
        if 'value' not in item:
            continue
        # An entry with no language tag is kept; a tagged one must be English
        if 'language_tag' in item and item['language_tag'] not in accepted_tags:
            continue
        values.append(item['value'])
    return values

# Utility: Extract 'standardized_values' from color fields filtered by language
def extract_standardized_colors(entries):
    """Flatten the 'standardized_values' lists of untagged or en_US / en_IN entries.

    Entries lacking 'standardized_values' contribute nothing. The order of
    entries, and of the values within each entry, is preserved.
    """
    accepted_tags = {'en_US', 'en_IN'}
    return [
        color
        for item in entries
        if 'language_tag' not in item or item['language_tag'] in accepted_tags
        for color in item.get('standardized_values', [])
    ]

# Fields a record must carry to be kept (hoisted: the set is identical for every file)
required_fields = {
    'brand', 'bullet_point', 'color', 'model_name',
    'item_name', 'product_type', 'main_image_id',
    'item_keywords', 'country'
}

# Convert each JSON-lines file in source_dir into a filtered CSV in target_dir
for file_name in os.listdir(source_dir):
    if not file_name.endswith('.json'):
        continue

    json_path = os.path.join(source_dir, file_name)
    # Swap only the trailing extension; str.replace would also rewrite a '.json' inside the name
    csv_path = os.path.join(target_dir, os.path.splitext(file_name)[0] + '.csv')

    print(f"Processing file: {json_path} → {csv_path}")

    # Load JSON lines; a line that fails to parse is reported and skipped
    records = []
    with open(json_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as error:
                    print(f"Error parsing line in {file_name}: {error}")

    # Keep records that have every required field and are listed for IN or US
    valid_records = [
        rec for rec in records
        if required_fields.issubset(rec.keys()) and rec.get('country') in {'IN', 'US'}
    ]

    print(f" → Valid entries found: {len(valid_records)}")

    # Write filtered records to CSV
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(csv_columns)

        for rec in valid_records:
            overall_desc = extract_values_by_language(rec.get('bullet_point', []))

            # Standardized color names first, then the free-text color values
            color_desc = []
            color_desc += extract_standardized_colors(rec.get('color', []))
            color_desc += extract_values_by_language(rec.get('color', []))

            other_desc = []
            for key in ['product_type', 'item_keywords']:
                other_desc += extract_values_by_language(rec.get(key, []))

            # 'material' is optional; the .get default already covers a missing key
            material_desc = extract_values_by_language(rec.get('material', []))

            row = [
                rec.get('main_image_id', ''),
                '; '.join(overall_desc),
                '; '.join(color_desc),
                '; '.join(other_desc),
                '; '.join(material_desc)
            ]
            writer.writerow(row)

    print(f" → Output written: {csv_path} ({len(valid_records)} rows)\n")


Processing file: /kaggle/input/abo-listings/listings_3.json → /kaggle/working/abo-listings/listings/filtered_metadata/listings_3.csv
 → Valid entries found: 4193
 → Output written: /kaggle/working/abo-listings/listings/filtered_metadata/listings_3.csv (4193 rows)

Processing file: /kaggle/input/abo-listings/listings_d.json → /kaggle/working/abo-listings/listings/filtered_metadata/listings_d.csv
 → Valid entries found: 4178
 → Output written: /kaggle/working/abo-listings/listings/filtered_metadata/listings_d.csv (4178 rows)

Processing file: /kaggle/input/abo-listings/listings_b.json → /kaggle/working/abo-listings/listings/filtered_metadata/listings_b.csv
 → Valid entries found: 4279
 → Output written: /kaggle/working/abo-listings/listings/filtered_metadata/listings_b.csv (4279 rows)

Processing file: /kaggle/input/abo-listings/listings_a.json → /kaggle/working/abo-listings/listings/filtered_metadata/listings_a.csv
 → Valid entries found: 4232
 → Output written: /kaggle/working/abo-list

In [4]:
!pip install --upgrade pip
!pip install google-genai

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1


In [30]:
from google import genai
from google.genai import Client, types
import time

# Exploratory introspection: print the attribute names of the SDK module, the Client
# class and the types module. The printed lists are informational only.
print(dir(genai))
print(dir(Client))
print(dir(types))

['Client', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_api_client', '_api_module', '_common', '_extra_utils', '_replay_api_client', '_transformers', 'batches', 'caches', 'chats', 'client', 'errors', 'files', 'live', 'models', 'pagers', 'tunings', 'types', 'version']
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_get_api_client', 'aio', 'batches', 'caches', 'chats', 'files', 'models', 'tunings', 'vertexai']


In [None]:
# Constants
MAX_DAILY_REQUESTS = 1500     # cap on the requests_made counter checked in process_records
DELAY_BETWEEN_REQUESTS = 60   # seconds passed to time.sleep after each processed record

# Initialize API client.
# The key is read from the GEMINI_API_KEY environment variable so it never has to be
# typed into (and saved with) the notebook. When the variable is unset, the same
# empty key as before is passed.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", ""))
requests_made = 0

def load_progress(progress_file):
    """Load the last processed index from a progress file.

    Returns:
        The stored integer, or 0 when the file does not exist or is empty
        (an empty file carries no progress, e.g. after an interrupted write).

    Raises:
        ValueError: if the file holds text that is not an integer. The message
            names the file so the checkpoint can be repaired by hand.
    """
    if not os.path.exists(progress_file):
        return 0

    with open(progress_file, 'r') as f:
        content = f.read().strip()

    if not content:
        return 0

    try:
        return int(content)
    except ValueError:
        raise ValueError(
            f"Progress file {progress_file!r} does not contain an integer: {content!r}"
        ) from None

def save_progress(progress_file, index):
    """Save the current processed index to a progress file.

    The value is first written to a sibling '<progress_file>.tmp' file, which then
    replaces the target via os.replace. Opening the target directly with 'w' would
    truncate it immediately, so an interrupt between truncation and write could
    leave an empty checkpoint behind.
    """
    temp_path = f"{progress_file}.tmp"
    with open(temp_path, 'w') as f:
        f.write(str(index))
    os.replace(temp_path, progress_file)

def query_gemini_api(image_bytes, combined_description):
    """Ask the Gemini model for five question/answer pairs about one product image.

    Args:
        image_bytes: Raw image bytes, sent as an 'image/jpeg' part.
        combined_description: Product description text embedded in the prompt.

    Returns:
        The text of the model response, or None if anything in the request raised
        (the error is printed, not re-raised).
    """
    instruction_parts = [
        "You are given an image and a brief product description.\n",
        f"Use the product description context: {combined_description}\n",
        "Generate exactly 5 diverse, visually clear, and progressively challenging questions.\n",
        "Each question must be answerable by only looking at the image — do NOT rely on external or assumed knowledge.\n",
        "Ensure variation in the *type* of visual cues used: color, shape, count, spatial relationship, relative size, and visible text (if any).\n",
        "Ensure variation in *difficulty level*:\n",
        "- At least 2 simple questions (e.g., color, count)\n",
        "- At least 2 moderately difficult questions (e.g., spatial relations, comparisons)\n",
        "- 1 challenging question requiring closer inspection or subtle visual reasoning (e.g., most prominent item, inferred use from shape)\n",
        "Do NOT ask about materials or properties that are not visually obvious (e.g., plastic, flexible, metal).\n",
        "Answers must be a single word — not all of them 'yes' or 'no'.\n",
        "Strictly use this format without extra text:\n",
        "Question 1: <question>\n",
        "Answer 1: <answer>\n",
        "Do not include any explanations or extra text.",
    ]
    prompt = "".join(instruction_parts)

    try:
        # Image part first, then the textual instructions
        api_response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[
                types.Part.from_bytes(data=image_bytes, mime_type='image/jpeg'),
                prompt,
            ],
        )
        return api_response.text
    except Exception as e:
        print(f"Error querying Gemini API: {e}")
        return None

def get_image_path_map(images_csv_path):
    """Build an image_id -> path lookup from the image metadata CSV.

    The CSV must have 'image_id' and 'path' columns. If an image_id appears more
    than once, the last row wins.
    """
    with open(images_csv_path, 'r', encoding='utf-8') as csv_file:
        lookup = {
            record['image_id']: record['path']
            for record in csv.DictReader(csv_file)
        }
    print("Loaded image metadata successfully.")
    return lookup

def generate_questions_for_image(image_id, row, image_path_map, images_base_path):
    """Generate questions and answers for a given image.

    Args:
        image_id: Key looked up in image_path_map.
        row: Mapping with 'overall_description', 'colour_description',
            'material_description' and 'other_description' entries.
        image_path_map: Mapping of image_id to a path relative to images_base_path.
        images_base_path: Directory the relative image paths are joined onto.

    Returns:
        Whatever query_gemini_api returns for the image, or None when the image is
        not in the map, the file does not exist, or the file cannot be read.
    """
    image_filename = image_path_map.get(image_id)
    if not image_filename:
        print(f"Image path not found for image_id: {image_id}")
        return None

    full_image_path = os.path.join(images_base_path, image_filename)
    if not os.path.exists(full_image_path):
        print(f"Image file does not exist: {full_image_path}")
        return None

    try:
        with open(full_image_path, "rb") as img_file:
            image_bytes = img_file.read()
    except Exception as e:
        print(f"Failed to read image {full_image_path}: {e}")
        return None

    # Every segment ends with "; " so neighbouring fields stay separated in the prompt
    # (the Material segment used to run straight into "Other:").
    combined_description = (
        f"Overall: {row['overall_description']}; "
        f"Color: {row['colour_description']}; "
        f"Material: {row['material_description']}; "
        f"Other: {row['other_description']}; "
    )

    print(f"Sending request for image_id: {image_id}")
    return query_gemini_api(image_bytes, combined_description)

def write_generated_data(writer, image_id, full_image_path, generated_text):
    """Write the generated questions and answers to the output CSV file.

    The response must contain exactly five lines starting with "question" and five
    starting with "answer" (case-insensitive), each in "<label>: <text>" form.
    Question i is paired with answer i in order of appearance. A response that does
    not match is rejected as a whole, with a printed message, so no partial set of
    rows is written.

    Args:
        writer: Object with a writerow(row) method, e.g. a csv.writer.
        image_id: Value for the first column of every row.
        full_image_path: Value for the second column of every row.
        generated_text: Raw model response; None or empty is treated as malformed.

    Returns:
        The number of rows written: 5 on success, 0 when the response was rejected.
    """
    lines = [line.strip() for line in (generated_text or '').split('\n') if line.strip()]
    question_lines = [line for line in lines if line.lower().startswith('question')]
    answer_lines = [line for line in lines if line.lower().startswith('answer')]

    if len(question_lines) != 5 or len(answer_lines) != 5:
        print(
            f"Skipping malformed response for image_id {image_id}: "
            f"{len(question_lines)} question lines, {len(answer_lines)} answer lines"
        )
        return 0

    rows = []
    for q_line, a_line in zip(question_lines, answer_lines):
        # partition reports a missing ':' via an empty separator instead of raising,
        # unlike split(':', 1)[1]
        _, q_sep, question = q_line.partition(':')
        _, a_sep, answer = a_line.partition(':')
        if not q_sep or not a_sep:
            print(f"Skipping malformed response for image_id {image_id}: line without ':' separator")
            return 0
        rows.append([image_id, full_image_path, question.strip(), answer.strip()])

    # Write only after every pair parsed, so a bad line cannot leave a partial set behind
    for row in rows:
        writer.writerow(row)
    return len(rows)

def process_records(listings_csv_path, images_csv_path, images_base_path, output_file, progress_file):
    """Main function to process records, generate questions, and save results.

    Reads the listings CSV row by row, skips the rows already covered by the index
    stored in progress_file, and for each remaining row requests questions for its
    'main_image_id' and appends the parsed pairs to output_file. Processing stops
    once the module-level requests_made counter reaches MAX_DAILY_REQUESTS.

    Args:
        listings_csv_path: Filtered listings CSV with a 'main_image_id' column.
        images_csv_path: Image metadata CSV passed to get_image_path_map.
        images_base_path: Directory the mapped image paths are relative to.
        output_file: CSV opened in append mode; a header is written if it is empty.
        progress_file: Checkpoint file holding the index of the next row to process.
    """
    global requests_made
    image_path_map = get_image_path_map(images_csv_path)

    # dirname is '' for a bare filename, and os.makedirs('') raises FileNotFoundError
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    start_index = load_progress(progress_file)

    with open(output_file, 'a', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)
        if os.stat(output_file).st_size == 0:
            writer.writerow(['image_id', 'full_image_path', 'question', 'answer'])

        with open(listings_csv_path, 'r', encoding='utf-8') as f_in:
            reader = csv.DictReader(f_in)
            for current_index, row in enumerate(reader):
                # Rows before the checkpoint were handled by an earlier run
                if current_index < start_index:
                    continue

                if requests_made >= MAX_DAILY_REQUESTS:
                    print("Reached daily request limit. Stopping.")
                    break

                image_id = row['main_image_id']
                generated_text = generate_questions_for_image(image_id, row, image_path_map, images_base_path)

                if generated_text:
                    write_generated_data(writer, image_id, os.path.join(images_base_path, image_path_map.get(image_id)), generated_text)
                    print(f"Processed image_id: {image_id}")
                else:
                    print(f"Failed to generate questions for image_id: {image_id}")

                # NOTE(review): the counter and the delay below also apply to rows that
                # failed before any API call (e.g. missing image) — confirm intended.
                requests_made += 1

                # Push buffered rows to disk before advancing the checkpoint, so the
                # progress file never points past rows that are still only in memory.
                f_out.flush()
                save_progress(progress_file, current_index + 1)

                if requests_made < MAX_DAILY_REQUESTS:
                    print(f"Sleeping {DELAY_BETWEEN_REQUESTS} seconds to respect rate limits...")
                    time.sleep(DELAY_BETWEEN_REQUESTS)

# Run the processing
current_working_filename = 'listings_3'
question_set_number = 'set_4'

listings_csv_path = f'/kaggle/working/abo-listings/listings/filtered_metadata/{current_working_filename}.csv'
images_csv_path = '/kaggle/input/vrdatasets/abo-images-small/abo-images-small/images/metadata/images.csv'
images_base_path = '/kaggle/input/vrdatasets/abo-images-small/abo-images-small/images/small'

output_dir = 'generated_questions'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f'questions_{current_working_filename}_{question_set_number}.csv')

progress_dir = 'progress'
os.makedirs(progress_dir, exist_ok=True)
progress_file = os.path.join(progress_dir, f'progress_{current_working_filename}.txt')

process_records(listings_csv_path, images_csv_path, images_base_path, output_file, progress_file)


Loaded image metadata successfully.
Sending request for image_id: 717LpuXhzkL
Processed image_id: 717LpuXhzkL
Sleeping 60 seconds to respect rate limits...


KeyboardInterrupt: 