In [1]:
# -*- coding: utf-8 -*-
# Prediction / Inference pipeline:
# - Input: user meal plan text or list of dishes
# - Output: top-K matching recipes, merged shopping list
# - Optional: price simulation & cost optimization (single-store vs multi-store)

# If needed once:
# !pip install pandas numpy scikit-learn tqdm pulp

import re, ast, math, random
from pathlib import Path
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# PuLP is an optional dependency: it is only used by the multi-store
# optimizer, which checks HAS_PULP before touching the module.
try:
    import pulp  # for LP optimization
except Exception:
    # Any failure while importing simply disables the LP-based path.
    HAS_PULP = False
else:
    HAS_PULP = True

# =========================
# CONFIG
# =========================
# Project root (machine-specific path).
BASE_DIR    = r"C:\Users\sagni\Downloads\Smart Grocery List Optimizer"
# Sub-folder of BASE_DIR that holds RAW_recipes.csv.
ARCHIVE     = str(Path(BASE_DIR, "archive"))
# Food.com raw recipes
RAW_RECIPES = str(Path(ARCHIVE, "RAW_recipes.csv"))
# how many recipes to return
TOP_K       = 10
# Default seed for the simulated price table.
SEED        = 7

# =========================
# Helpers
# =========================
def clean_text(s: str) -> str:
    """Normalise free text: lower-case, trim, collapse whitespace runs.

    ``None`` and float NaN map to the empty string; any other value is
    converted with ``str`` before normalisation.
    """
    is_missing = s is None or (isinstance(s, float) and math.isnan(s))
    if is_missing:
        return ""
    lowered = str(s).lower().strip()
    # Any run of whitespace (spaces, tabs, newlines) becomes one space.
    return re.sub(r"\s+", " ", lowered)

def row_text(row) -> str:
    """Build the retrieval text of one recipe row.

    The cleaned ``name``, ``ingredients`` and ``steps`` fields are joined
    with single spaces, in that order. ``row`` only needs a
    ``.get(key, default)`` lookup; absent fields count as empty text.
    """
    fields = ("name", "ingredients", "steps")
    return " ".join(clean_text(row.get(field, "")) for field in fields)

def safe_list_parse(x):
    """Parse a list-like field into a Python list.

    Food.com fields often look like "['onions', 'salt']".

    Args:
        x: A list (returned unchanged), a string holding a Python list
            literal, a loosely comma-separated string, or a missing value.

    Returns:
        A list of strings. Missing values (``None``, float NaN) and blank
        strings yield ``[]``. A literal that is valid but not a list is
        returned as a one-element list holding its original text.
    """
    if isinstance(x, list):
        return x
    # Missing cells carry no items; without this guard they would come back
    # as the bogus entries ["None"] / ["nan"].
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return []
    s = str(x)
    if not s.strip():
        return []
    try:
        obj = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal -> fallback: split on commas.
        inner = s.strip("[]")
        parts = [p.strip().strip("'").strip('"') for p in inner.split(",") if p.strip()]
        return parts if parts else [inner]
    if isinstance(obj, list):
        return [str(i) for i in obj]
    return [s]

# Minimal unit vocabulary: canonical unit -> spellings that normalise to it.
UNIT_ALIASES = {
    "g":     ["g", "gram", "grams"],
    "kg":    ["kg", "kilogram", "kilograms"],
    "ml":    ["ml", "milliliter", "milliliters"],
    "l":     ["l", "liter", "liters"],
    "tsp":   ["tsp", "teaspoon", "teaspoons"],
    "tbsp":  ["tbsp", "tablespoon", "tablespoons"],
    "cup":   ["cup", "cups"],
    "piece": ["piece", "pieces", "pc", "pcs"],
}
# Reverse index: spelling -> canonical unit.
UNIT_LOOKUP = dict(
    (spelling, canonical)
    for canonical, spellings in UNIT_ALIASES.items()
    for spelling in spellings
)

def extract_qty_unit(item: str):
    """
    Very lightweight quantity/unit extractor.

    Returns a ``(qty, unit, name)`` tuple. ``qty`` is a float or ``None``,
    ``unit`` is a canonical key of UNIT_ALIASES or ``None``, and ``name`` is
    the remaining (cleaned, lower-cased) text.

    Examples:
        "2 cups rice" -> 2.0, "cup", "rice"
        "200 g chicken breast" -> 200.0, "g", "chicken breast"
        "1 onion" -> 1.0, "piece", "onion"
        "2 chicken breasts" -> 2.0, "piece", "chicken breasts"
        "2 heaping cups flour" -> 2.0, "cup", "flour"
        "salt to taste" -> None, None, "salt to taste"
    """
    s = clean_text(item)
    # Leading number, optionally followed by one alphabetic word, then the rest.
    m = re.match(r"^(\d+(?:\.\d+)?)(?:\s*([a-zA-Z]+))?\s+(.*)$", s)
    if m is None:
        # if no leading number: assume name only
        return None, None, s

    qty = float(m.group(1))
    word = (m.group(2) or "").lower()
    rest = m.group(3).strip()
    unit = UNIT_LOOKUP.get(word) if word else None
    name = rest

    if unit is None:
        # The word after the number was not a unit; the unit may be the
        # following word instead (the modifier in between is dropped).
        m2 = re.match(r"^([a-zA-Z]+)\s+(.*)$", rest)
        if m2 and m2.group(1).lower() in UNIT_LOOKUP:
            unit = UNIT_LOOKUP[m2.group(1).lower()]
            name = m2.group(2).strip()
        elif word:
            # No unit anywhere: the captured word belongs to the ingredient
            # name ("2 chicken breasts"), so put it back instead of losing it.
            name = f"{word} {rest}".strip()

    if unit is None and qty.is_integer():  # default to piece for integers
        unit = "piece"
    return qty, unit, name

def merge_ingredient_line(item: str):
    """Canonicalise one ingredient line into ``(name, qty, unit)``.

    Quantity and unit come from ``extract_qty_unit``. The name then has a
    fixed set of descriptor words removed, every character other than
    a-z, 0-9, whitespace and '-' dropped, and whitespace collapsed. If
    nothing is left, the original ``item`` is used as the name.
    """
    qty, unit, raw_name = extract_qty_unit(item)
    # strip descriptors from name
    descriptors = r"\b(chopped|diced|minced|sliced|fresh|ground|to taste|large|small|medium|boneless|skinless)\b"
    cleaned = re.sub(descriptors, "", raw_name)
    cleaned = re.sub(r"[^a-z0-9\s\-]", "", cleaned).strip()
    cleaned = re.sub(r"\s+", " ", cleaned)
    return (cleaned or item), qty, unit

def merge_ingredients(list_of_lists):
    """
    Merge several ingredient lists into one aggregated shopping list.

    Input: list of ingredient lists (strings)
    Output: list of dicts sorted by name, each
        {"name": str, "qty": float or None, "unit": str ("" when unknown),
         "examples": up to three raw lines joined by "; "}
    - Quantities are summed per (name, unit); the same name with a different
      unit stays a separate entry.
    - "qty" is None when no line of the entry carried a quantity.
    """
    merged = {}
    for ing_list in list_of_lists:
        for raw in ing_list:
            name, qty, unit = merge_ingredient_line(raw)
            key = (name, unit or "unitless")
            entry = merged.setdefault(
                key, {"name": name, "unit": unit, "qty": None, "lines": []}
            )
            entry["lines"].append(raw)
            # Every line contributes its quantity, including the first one
            # seen for the key (which was previously left out of the total).
            if qty is not None:
                entry["qty"] = qty if entry["qty"] is None else entry["qty"] + qty

    # convert to list for display
    out = [
        {
            "name": entry["name"],
            "qty": round(entry["qty"], 2) if isinstance(entry["qty"], (int, float)) else None,
            "unit": entry["unit"] if entry["unit"] else "",
            "examples": "; ".join(entry["lines"][:3]),
        }
        for entry in merged.values()
    ]
    # sort alphabetically
    out.sort(key=lambda rec: rec["name"])
    return out

# =========================
# Load recipes and build vectorizer (fit once; reuse for predictions)
# =========================
print("[INFO] Loading recipes...")
df = pd.read_csv(RAW_RECIPES)
keep = [c for c in ["id","name","ingredients","steps"] if c in df.columns]
df = df[keep].copy()
# Guarantee every column used downstream exists, so a trimmed CSV does not
# fail later with a KeyError.
if "id" not in df.columns: df["id"] = df.index
for _col in ("name", "ingredients", "steps"):
    if _col not in df.columns: df[_col] = ""

# One retrieval document per recipe: name + ingredients + steps.
# Missing cells are blanked first; otherwise astype(str) would turn NaN into
# the literal token "nan" and add it to the TF-IDF vocabulary.
df["doc_text"] = (
    df["name"].fillna("").astype(str)
    + " " + df["ingredients"].fillna("").astype(str)
    + " " + df["steps"].fillna("").astype(str)
).map(clean_text)

print("[INFO] Fitting TF-IDF (first time in this session)...")
# Uni- and bi-grams, English stop words removed, vocabulary capped at 50k.
vec = TfidfVectorizer(max_features=50000, ngram_range=(1,2), stop_words="english")
# X: sparse TF-IDF matrix with one row per recipe, aligned with df's rows.
X  = vec.fit_transform(df["doc_text"].tolist())

# =========================
# Prediction functions
# =========================
def score_recipes(meal_plan_text: str, top_k: int = TOP_K) -> pd.DataFrame:
    """
    Rank recipes by cosine similarity to the user's meal plan text.

    The text is cleaned, projected with the fitted module-level vectorizer
    ``vec`` and compared against every row of ``X``.

    Returns a DataFrame with columns id, name, ingredients, score holding
    the ``top_k`` highest-scoring recipes, best first, with a fresh index.
    """
    query_vec = vec.transform([clean_text(meal_plan_text)])
    ranked = df[["id","name","ingredients"]].copy()
    ranked["score"] = cosine_similarity(X, query_vec).ravel()
    ranked = ranked.sort_values("score", ascending=False)
    return ranked.head(top_k).reset_index(drop=True)

def build_shopping_list(top_df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn the selected recipes into one merged shopping list.

    Each row's ``ingredients`` field is parsed into a list, all lists are
    merged, and the result is returned as a DataFrame with the columns
    name, qty, unit, examples.
    """
    ingredient_lists = [
        safe_list_parse(recipe.get("ingredients", "[]"))
        for _, recipe in top_df.iterrows()
    ]
    merged_rows = merge_ingredients(ingredient_lists)
    return pd.DataFrame(merged_rows, columns=["name","qty","unit","examples"])

# =========================
# (Optional) Price simulation + cost optimization
# =========================
def simulate_store_prices(items_df: pd.DataFrame, stores=("BigBasket","Blinkit","Walmart"), seed=SEED):
    """
    Build a fake price table: name -> {store: unit_price}.

    For every row of ``items_df`` a base unit price is drawn uniformly from
    0.5-5.0; each store's price is that base times a uniform factor in
    0.85-1.20, rounded to 2 decimals. Draws come from a private
    ``random.Random(seed)``, so a given seed reproduces the same table.
    Real integration would fetch actual SKU prices & map units.
    """
    rng = random.Random(seed)
    price_table = {}
    for _, item in items_df.iterrows():
        base_price = rng.uniform(0.5, 5.0)  # base unit price
        # add small noise per store
        price_table[item["name"]] = {
            store: round(base_price * rng.uniform(0.85, 1.20), 2)
            for store in stores
        }
    return price_table

def optimize_cart_single_store(items_df: pd.DataFrame, store_prices: dict):
    """
    Choose the one store that minimizes the total cost of the whole list.

    Args:
        items_df: DataFrame with at least the columns ``name`` and ``qty``.
            A missing or non-positive quantity is counted as 1.
        store_prices: Mapping name -> {store: unit_price}. The candidate
            stores are taken from the first entry; a price missing for an
            (item, store) pair is charged at the penalty value 9999.

    Returns:
        (best_store, total_cost, totals) where ``totals`` maps every store
        to its rounded total. With no price data at all the result is
        ``(None, 0.0, {})``.
    """
    if not store_prices:
        return None, 0.0, {}
    stores = list(next(iter(store_prices.values())))
    if not stores:
        return None, 0.0, {}

    # Resolve (name, effective quantity) once instead of once per store.
    wanted = []
    for _, row in items_df.iterrows():
        qty = row["qty"]
        # None, NaN, zero and negative quantities all fall back to 1.
        qty = float(qty) if pd.notna(qty) and qty > 0 else 1.0
        wanted.append((row["name"], qty))

    totals = {}
    for store in stores:
        total = 0.0
        for name, qty in wanted:
            total += qty * store_prices.get(name, {}).get(store, 9999)
        totals[store] = round(total, 2)

    best_store = min(totals, key=totals.get)
    return best_store, totals[best_store], totals

def optimize_cart_multi_store(items_df: pd.DataFrame, store_prices: dict):
    """
    Pick, for every item, exactly one store so that the total cost is minimal.

    Formulated as a binary program (one 0/1 variable per item/store pair)
    and solved with PuLP's CBC solver.

    Args:
        items_df: DataFrame with at least the columns ``name`` and ``qty``.
            A missing or non-positive quantity is counted as 1.
        store_prices: Mapping name -> {store: unit_price}. The candidate
            stores are taken from the first entry; a price missing for an
            (item, store) pair is charged at the penalty value 9999.

    Returns:
        (selection_df, total_cost, None) on success, where ``selection_df``
        has the columns name, store, unit_price, qty, cost.
        (None, None, message) when the optimization is skipped or fails
        (PuLP missing, no price data, solver not optimal).
    """
    if not HAS_PULP:
        return None, None, "PuLP not installed; skipping multi-store optimization."
    if not store_prices:
        return None, None, "No store prices available; skipping multi-store optimization."
    stores = list(next(iter(store_prices.values())))
    if not stores:
        return None, None, "No stores in price table; skipping multi-store optimization."

    columns = ["name", "store", "unit_price", "qty", "cost"]
    if items_df.empty:
        return pd.DataFrame(columns=columns), 0.0, None

    # Resolve every item once. Items are addressed by position k rather than
    # by DataFrame index label, so duplicate labels cannot make two items
    # share decision variables.
    items = []
    for _, row in items_df.iterrows():
        qty = row["qty"]
        qty = float(qty) if pd.notna(qty) and qty > 0 else 1.0
        item_prices = store_prices.get(row["name"], {})
        # Same penalty default for the objective and for the reported price.
        items.append((row["name"], qty, {s: item_prices.get(s, 9999) for s in stores}))

    prob = pulp.LpProblem("GroceryMinCost", pulp.LpMinimize)

    # Decision vars x[k, s] in {0,1}: item k is bought at store s.
    x = {
        (k, s): pulp.LpVariable(f"x_{k}_{s}", cat="Binary")
        for k in range(len(items))
        for s in stores
    }

    # Objective: total cost of the chosen (item, store) pairs.
    prob += pulp.lpSum(
        qty * prices[s] * x[(k, s)]
        for k, (_, qty, prices) in enumerate(items)
        for s in stores
    )

    # Constraints: each item picked exactly once
    for k in range(len(items)):
        prob += pulp.lpSum(x[(k, s)] for s in stores) == 1

    prob.solve(pulp.PULP_CBC_CMD(msg=False))
    if prob.status != pulp.LpStatusOptimal:
        status = pulp.LpStatus.get(prob.status, prob.status)
        return None, None, f"Solver did not reach an optimal solution (status: {status})."

    selection = []
    total = 0.0
    for k, (name, qty, prices) in enumerate(items):
        # value() can be None for an unsolved variable; treat that as 0.
        chosen = next((s for s in stores if (pulp.value(x[(k, s)]) or 0) > 0.5), None)
        if chosen is None:
            return None, None, f"Solver returned no store for item '{name}'."
        price = prices[chosen]
        cost = qty * price
        selection.append({"name": name, "store": chosen, "unit_price": price, "qty": qty, "cost": round(cost, 2)})
        total += cost
    sel_df = pd.DataFrame(selection, columns=columns)
    return sel_df, round(total, 2), None

# =========================
# EXAMPLE USAGE
# =========================
# Free-text meal plan that drives the demo run below.
meal_plan = """
chicken curry, vegetable fried rice, dal tadka, grilled chicken salad, lemon rice,
quick stir-fry veggies, lentil soup, paneer wraps
"""

# Step 1: retrieval - show only the identifying columns and the score.
print("\n[STEP 1] Retrieve top recipes for the meal plan...")
top_df = score_recipes(meal_plan, top_k=TOP_K)
display_cols = ["id","name","score"]
print(top_df.loc[:, display_cols].to_string(index=False))

# Step 2: aggregate the ingredients of the retrieved recipes.
print("\n[STEP 2] Build merged shopping list...")
shopping_df = build_shopping_list(top_df)
print(shopping_df.head(20).to_string(index=False))

# Step 3: price the list with simulated data and compare strategies.
print("\n[STEP 3] (Optional) Simulate prices & optimize cart...")
price_table = simulate_store_prices(shopping_df)
best_store, single_cost, all_totals = optimize_cart_single_store(shopping_df, price_table)
print(f"Single-store best: {best_store} → ₹{single_cost:.2f} (simulated). Breakdown: {all_totals}")

multi_sel, multi_cost, msg = optimize_cart_multi_store(shopping_df, price_table)
if not msg:
    print(f"[Multi-store] Optimal split cost → ₹{multi_cost:.2f} (simulated)")
    print(multi_sel.head(20).to_string(index=False))
else:
    # A non-empty message means the multi-store step produced no result.
    print("[Multi-store] Skipped:", msg)

# Hints for integrating into an API/UI:
# - Wrap score_recipes() + build_shopping_list() in a FastAPI endpoint: POST /plan -> returns top recipes + shopping list.
# - Replace simulate_store_prices() with real price lookups (BigBasket/Blinkit/Walmart) and proper unit mapping.
# - Persist results as JSON for the frontend to render.


[INFO] Loading recipes...
[INFO] Fitting TF-IDF (first time in this session)...

[STEP 1] Retrieve top recipes for the meal plan...
    id                                                     name    score
106830                        special fried rice  rachael ray s 0.249239
143501                                             paneer tikka 0.246493
253298                      quick fix layered chicken casserole 0.208443
351616                      lentil soup w spicy tomatoes   rice 0.206918
137200                                  fried rice chicken soup 0.203894
101505                                    just another stir fry 0.194482
103894                    dal and rice with spicy fried cabbage 0.193244
417193             healthiest stir fry with veggies and chicken 0.186683
162093                                  chicken and salad pitas 0.186408
448061 grilled lime chicken  sandwich  salad  wrap or main dish 0.185574

[STEP 2] Build merged shopping list...
                  name  q