In [28]:
import pandas as pd
from pathlib import Path

In [29]:

# The template CSV's header row defines the column order of every generated CSV.
template_csv = Path("/Users/prakharjain/code/aglio_ai/backend/raw_data/template.csv")
df_template = pd.read_csv(template_csv)

# Plain Python list of the header names; echoed as the cell's output.
columns = list(df_template.columns)
columns

['id',
 'name',
 'category',
 'group_category',
 'category_brief',
 'description',
 'price',
 'is_veg',
 'image_path',
 'is_bestseller',
 'is_recommended']

In [30]:
# One conversion job per menu: the raw JSON input, the CSV to write, and the
# directory holding that menu's images all share the menu name as their stem.
jsons_to_parse = [
    {
        "raw_json": f"/Users/prakharjain/code/aglio_ai/backend/raw_data/{menu}.json",
        "output_csv": f"/Users/prakharjain/code/aglio_ai/backend/raw_data/{menu}.csv",
        "image_dir": f"/Users/prakharjain/code/aglio_ai/backend/raw_data/{menu}",
    }
    for menu in ("handcrafted", "chianti")
]


In [31]:
import json
import os
import requests
from urllib.parse import urlparse

def _usable_image_ref(value):
    """Return ``value`` if it is a non-empty string other than ``"null"``, else ``None``.

    Non-string JSON values (numbers, objects, lists) are treated as "no image"
    so that the later ``startswith`` check cannot raise ``AttributeError``.
    """
    if isinstance(value, str) and value and value != "null":
        return value
    return None


def _download_image(url, image_dir, filename, item_name):
    """Download ``url`` to ``image_dir/filename``; return True on success, False on failure.

    The bytes are first written to a ``.part`` file and then moved into place,
    so an interrupted write never leaves a truncated image under the final
    name (which later runs would otherwise treat as already downloaded).
    """
    final_path = os.path.join(image_dir, filename)
    partial_path = final_path + '.part'
    try:
        print(f"Downloading image for '{item_name}' from {url}")
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, final_path)
        print(f"✓ Downloaded: {filename}")
        return True
    except Exception as e:
        # Deliberately broad: one bad image must not abort the whole conversion.
        print(f"✗ Failed to download {url}: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written, or it cannot be removed; nothing more to do
        return False


def _resolve_image_path(item, image_dir):
    """Work out the ``image_path`` value for one item, downloading the image if needed.

    ``item['image']`` takes precedence over ``item['image_url']``. The stored
    value is a filename derived from the item name (spaces and ``/`` replaced
    by ``_``, ``.png`` appended), or ``None`` when no image is available.
    """
    name = item.get('name')
    # str() keeps a numeric name from crashing on .replace
    filename = str(name).replace(' ', '_').replace('/', '_') + '.png' if name else None

    source = _usable_image_ref(item.get('image')) or _usable_image_ref(item.get('image_url'))
    if source is None:
        return None

    if not source.startswith(('http://', 'https://')):
        # Not a URL - use the name-derived filename, or the raw value if the item is unnamed
        return filename if filename else source

    if not filename:
        # A URL but no item name to build a local filename from
        return None

    if os.path.exists(os.path.join(image_dir, filename)):
        # Already downloaded on an earlier run
        return filename

    return filename if _download_image(source, image_dir, filename, name) else None


def json_to_csv_with_images(json_file_path, output_csv_path, columns, image_dir):
    """
    Convert JSON file to CSV with proper column mapping and handle image downloading

    The JSON file is expected to hold a list of item objects. Each item becomes
    one CSV row; images referenced by URL are downloaded into ``image_dir``
    unless a file with the derived name already exists there.

    Note: no value is ever produced for 'category_brief'; if it is listed in
    ``columns`` it is left empty in the output.

    Args:
        json_file_path (str): Path to the JSON file
        output_csv_path (str): Path where CSV will be saved
        columns (list): List of expected columns from template
        image_dir (str): Directory path where images will be stored

    Returns:
        pandas.DataFrame: The rows that were written, restricted to ``columns``.
    """

    # Create image directory if it doesn't exist
    os.makedirs(image_dir, exist_ok=True)

    # Read JSON file (explicit UTF-8 so non-ASCII item names do not depend on the locale)
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Process each item in the JSON
    processed_data = []

    for idx, item in enumerate(data):
        processed_data.append({
            # Map basic fields
            'id': item.get('id', idx + 1),  # Use 1-based position if no id
            'name': item.get('name', ''),
            'category': item.get('category', ''),
            # Use category_2 or fallback to category
            'group_category': item.get('category_2', item.get('category', '')),
            'description': item.get('description', ''),
            'price': item.get('price', 0),
            'image_path': _resolve_image_path(item, image_dir),
            # Handle boolean fields - convert to 1 or 0
            'is_veg': 1 if item.get('is_vegetarian', 0) else 0,
            'is_bestseller': 1 if item.get('is_bestseller', False) else 0,
            'is_recommended': 1 if item.get('is_recommended', False) else 0,
        })

    # Create DataFrame
    df = pd.DataFrame(processed_data, columns=columns)

    # Save to CSV
    df.to_csv(output_csv_path, index=False)
    print(f"Converted {len(processed_data)} items from {json_file_path} to {output_csv_path}")

    return df

# Convert every configured menu and print a short report for each one
print("Running updated function with image downloading...")
for data in jsons_to_parse:
    print(f"\nProcessing {data['raw_json']}...")
    df_result = json_to_csv_with_images(
        json_file_path=data["raw_json"],
        output_csv_path=data["output_csv"],
        columns=columns,
        image_dir=data["image_dir"],
    )
    print(f"Created {data['output_csv']}")
    print(f"DataFrame shape: {df_result.shape}")
    print("Sample image_path values (relative filenames):")
    print(df_result['image_path'].iloc[:5].tolist())

    # Tally the rows that ended up with an image
    non_null_images = df_result['image_path'].notna().sum()
    print(f"Images available: {non_null_images}/{df_result.shape[0]}")
    print("-" * 50)


Running updated function with image downloading...

Processing /Users/prakharjain/code/aglio_ai/backend/raw_data/handcrafted.json...
Converted 83 items from /Users/prakharjain/code/aglio_ai/backend/raw_data/handcrafted.json to /Users/prakharjain/code/aglio_ai/backend/raw_data/handcrafted.csv
Created /Users/prakharjain/code/aglio_ai/backend/raw_data/handcrafted.csv
DataFrame shape: (83, 11)
Sample image_path values (relative filenames):
['Coffee_Cran.png', "Grown-ups'_Cafe_Mocha.png", None, None, 'Moroccan_Mocha.png']
Images available: 26/83
--------------------------------------------------

Processing /Users/prakharjain/code/aglio_ai/backend/raw_data/chianti.json...
Converted 128 items from /Users/prakharjain/code/aglio_ai/backend/raw_data/chianti.json to /Users/prakharjain/code/aglio_ai/backend/raw_data/chianti.csv
Created /Users/prakharjain/code/aglio_ai/backend/raw_data/chianti.csv
DataFrame shape: (128, 11)
Sample image_path values (relative filenames):
['Spaghetti_In_Neapolitan_S