# Preprocess Datasets

## Epicurious

In [3]:
import json
import csv

# Parse the raw Epicurious dump; `data` is reused by the export cells below.
with open("raw/full_format_recipes.json", mode="r", encoding="utf-8") as src:
    data = json.load(src)

# Export one row per titled recipe to recipes.csv.
with open("recipes.csv", "w", newline="", encoding="utf-8") as out:
    writer = csv.writer(out)
    writer.writerow(["title", "rating", "calories", "protein", "fat", "sodium", "desc", "directions"])

    for recipe in data:
        # Records whose title is missing, empty, or whitespace-only are dropped.
        name = (recipe.get("title") or "").strip()
        if not name:
            continue

        # Numeric fields are passed through untouched; absent keys become "".
        numbers = [
            recipe.get(key, "")
            for key in ("rating", "calories", "protein", "fat", "sodium")
        ]

        summary = recipe.get("desc", "")

        # Directions are flattened into one newline-separated string;
        # an explicit null is treated the same as an empty list.
        steps = recipe.get("directions", [])
        steps_text = "\n".join([] if steps is None else steps)

        writer.writerow([name, *numbers, summary, steps_text])

In [4]:
# === 2. CATEGORIES CSV ===
# Writes one (title, category) row per category of every titled recipe.
with open("recipe_categories.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["title", "category"])
    for r in data:
        # Skip records with a missing or blank title, matching the filter
        # used when writing recipes.csv; this also avoids a KeyError on
        # records that have categories but no "title" key.
        title = (r.get("title") or "").strip()
        if not title:
            continue
        # "categories" may be absent or null; both mean "no categories".
        for c in r.get("categories") or []:
            w.writerow([title, c.strip()])

In [5]:
# === 3. INGREDIENTS CSV (raw string mode) ===
# Writes one (title, ingredient_raw) row per ingredient line of every titled
# recipe; ingredient text is only stripped, not parsed.
with open("recipe_ingredients.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["title", "ingredient_raw"])
    for r in data:
        # Skip records with a missing or blank title, matching the filter
        # used when writing recipes.csv; this also avoids a KeyError on
        # records that have ingredients but no "title" key.
        title = (r.get("title") or "").strip()
        if not title:
            continue
        # "ingredients" may be absent or null; both mean "no ingredients".
        for ing in r.get("ingredients") or []:
            w.writerow([title, ing.strip()])
