In [29]:
# ==== CONFIG ====
# Credentials are read from environment variables so that no secret is stored
# in (or committed with) the notebook. Set GOOGLE_API_KEY, GOOGLE_CX and
# USDA_API_KEY in the environment before starting the kernel.
# NOTE: keys that were previously pasted into this cell should be treated as
# exposed and rotated.
import os

# Missing variables fall back to "" so the notebook still imports; the API
# helpers further down will then report an error instead of returning data.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
GOOGLE_CX      = os.environ.get("GOOGLE_CX", "")
USDA_API_KEY   = os.environ.get("USDA_API_KEY", "")

# Store domains searched, in priority order, when looking up a product page.
PREFERRED_SITES = ["walmart.com", "target.com"]

# ==== IMPORTS ====
import re, json, time, requests
from urllib.parse import quote_plus
from bs4 import BeautifulSoup


In [30]:
!pip install transformers



In [31]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint identifier handed to both from_pretrained() calls below.
model_name = "MBZUAI/LaMini-Flan-T5-783M"
# Module-level tokenizer/model pair; generate_meal_plan() reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



In [32]:
def generate_meal_plan(ingredients, max_length=2048):
    """
    Ask the module-level `model` for a vegan recipe built from `ingredients`.

    Args:
        ingredients: iterable of ingredient strings; joined with ", " into the prompt.
        max_length: forwarded to `model.generate` as `max_length`.

    Returns:
        The decoded model output after `normalize_recipe_text` and
        `strip_instructions` have been applied. Generation uses sampling
        (`do_sample=True`), so repeated calls may return different text.
    """
    ingredient_csv = ', '.join(ingredients)
    prompt_parts = [
        "You are a helpful vegan chef assistant. ",
        "Write a beginner-friendly recipe using ONLY these ingredients when possible: ",
        f"{ingredient_csv}. ",
        "Give the meal a title and an 'Ingredients:' section with one ingredient per line. ",
        "Do NOT include any instructions.",
    ]
    prompt = "".join(prompt_parts)

    encoded = tokenizer(prompt, return_tensors="pt")
    sampling_options = dict(
        max_length=max_length,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    generated = model.generate(**encoded, **sampling_options)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Reformat into one-item-per-line text, then cut off any instructions
    # section the model produced despite the prompt.
    return strip_instructions(normalize_recipe_text(decoded))


In [33]:
BULLET_PAT = re.compile(r"^\s*([\-–•]|\d+\.)\s*")  # -, –, •, or "1."

def extract_ingredient_lines(text: str):
    """
    Return the raw bullet lines that follow the first "Ingredients" header.

    Scanning rules:
    - The header must sit alone on its line ("Ingredients", "Ingredients:",
      "Ingredient -", any case). No header -> empty list.
    - Lines before the first bullet (blank or not) are skipped.
    - Collection ends at the first non-bullet line after a bullet, or at a
      line that is only a section header (instructions / method / directions /
      step(s)), whichever comes first.
    """
    header_re = re.compile(r"^\s*ingredients?\s*[:\-]?\s*$", re.I)
    stop_re = re.compile(r"^\s*(instructions|method|directions|steps?)\s*[:\-]?\s*$", re.I)

    rows = text.splitlines()
    header_pos = next(
        (pos for pos, row in enumerate(rows) if header_re.match(row.strip())),
        None,
    )
    if header_pos is None:
        return []

    bullets = []
    for row in rows[header_pos + 1:]:
        if stop_re.match(row.strip()):
            break
        if BULLET_PAT.search(row):
            bullets.append(row)
        elif bullets:
            # the bullet run has ended (blank or ordinary text line)
            break
    return bullets

UNITS = r"cup|cups|tbsp|tablespoon|tablespoons|tsp|teaspoon|teaspoons|oz|ounce|ounces|lb|pound|pounds|g|kg|can|cans|clove|cloves|bag|bags|pinch"
PREP_WORDS = r"chopped|minced|sliced|diced|fresh|drained|rinsed|to\s+taste|optional|ground|grated|julienned|crushed|peeled|seeded|cooked|uncooked|firm|extra|frozen|of"


def normalize_name(s: str):
    """
    Reduce an ingredient phrase to a bare lowercase name.

    Applied in order, each match replaced by a space: fractions ("1/2"),
    whole-number tokens, whole-word UNITS, parentheses, whole-word PREP_WORDS,
    then every character outside [a-z0-9 ]. Whitespace is collapsed and
    trimmed at the end, so the result may be an empty string.
    """
    cleaned = s.lower()
    # (pattern, flags) pairs; order matters because later steps see the
    # output of earlier ones.
    scrub_steps = (
        (r"\b\d+\/\d+\b", 0),
        (r"\b\d+\b", 0),
        (rf"\b({UNITS})\b", re.I),
        (r"[()]", 0),
        (rf"\b({PREP_WORDS})\b", re.I),
        (r"[^a-z0-9 ]+", 0),
    )
    for pattern, flags in scrub_steps:
        cleaned = re.sub(pattern, " ", cleaned, flags=flags)
    return re.sub(r"\s+", " ", cleaned).strip()
import re

def normalize_recipe_text(raw: str) -> str:
    """
    Reformat free-form LLM output so it can be parsed line by line.

    Steps, in order:
    - drop carriage returns;
    - put every occurrence of the words "Ingredients" / "Instructions"
      (any case, optional colon) on its own line as "Ingredients:" /
      "Instructions:" -- the match is not anchored, so the words are also
      rewritten when they appear mid-sentence;
    - start a new line at every "-" that is followed by whitespace;
    - start a new line at every numbered step ("1.", "10.") that is not
      already at the start of a line;
    - collapse runs of three or more newlines to two and trim the result.

    Args:
        raw: text as produced by the model.

    Returns:
        The reformatted text.
    """
    t = raw.replace("\r", "")

    # Force headers onto their own lines
    t = re.sub(r'\s*Ingredients\s*:?', '\nIngredients:\n', t, flags=re.I)
    t = re.sub(r'\s*Instructions\s*:?', '\nInstructions:\n', t, flags=re.I)

    # Ensure each bullet starts a new line
    t = re.sub(r'\s*-\s+', '\n- ', t)

    # Ensure numbered steps start a new line (1., 2., 10., etc.).
    # The (?!\d) guard keeps decimal quantities intact: without it "1.5 cups"
    # was rewritten to "\n1. 5 cups".
    t = re.sub(r'(?<!\n)(\b\d+\.)(?!\d)\s*', r'\n\1 ', t)

    # Collapse double/triple newlines
    t = re.sub(r'\n{3,}', '\n\n', t)

    return t.strip()

def strip_instructions(text: str) -> str:
    """
    Return `text` truncated just before the first line that begins with
    "Instructions" (any case, optional leading whitespace), then trimmed.

    The marker must be preceded by a newline; text without such a marker is
    returned unchanged apart from trimming.
    """
    marker = re.search(r'\n\s*Instructions\s*:?', text, flags=re.I)
    kept = text if marker is None else text[:marker.start()]
    return kept.strip()

def parse_recipe_ingredients(recipe_text: str):
    """
    Return the normalized, de-duplicated ingredient names of a recipe.

    The bullet lines are taken from `extract_ingredient_lines`; each one is
    cleaned and passed through the module-level `normalize_name`. That is the
    same normalizer `split_have_missing` applies to pantry items, so both
    sides of the comparison follow identical rules (this function used to
    carry its own near-duplicate copy).

    Args:
        recipe_text: recipe text containing an "Ingredients" section.

    Returns:
        List of non-empty names in first-seen order, without duplicates.
    """
    def clean_ingredient_name(line: str):
        # drop the leading bullet / step number
        line = BULLET_PAT.sub("", line).strip()
        # treat an en dash like a hyphen, then keep only the part before a
        # " - " separator (e.g. "tofu - pressed")
        line = line.replace("–", "-")
        line = line.split(" - ")[0]
        line = line.replace(",", " ")
        return normalize_name(line)

    raw_lines = extract_ingredient_lines(recipe_text)
    names = [clean_ingredient_name(ln) for ln in raw_lines]

    # drop empties and dedupe, preserving order
    seen, uniq = set(), []
    for n in names:
        if n and n not in seen:
            uniq.append(n)
            seen.add(n)
    return uniq


In [34]:
def jaccard(a: str, b: str):
    """
    Jaccard similarity of the whitespace-separated token sets of `a` and `b`.

    Returns 0.0 when either string has no tokens; otherwise
    |intersection| / |union| as a float in (0, 1] or 0.0 for disjoint sets.
    """
    left_tokens, right_tokens = set(a.split()), set(b.split())
    if not (left_tokens and right_tokens):
        return 0.0
    shared = left_tokens.intersection(right_tokens)
    combined = left_tokens.union(right_tokens)
    return len(shared) / len(combined)

def split_have_missing(pantry_list, recipe_ings, threshold=0.5):
    """
    Partition recipe ingredients into those covered by the pantry and those not.

    Args:
        pantry_list: list of strings user has (e.g., ["rice", "beans", "tofu"]);
            each is passed through `normalize_name` before comparison.
        recipe_ings: normalized names from recipe.
        threshold: minimum `jaccard` token overlap to consider a match.

    Returns:
        (have, missing): two lists that together contain every entry of
        `recipe_ings`, each in the original order.

    An ingredient counts as "have" when, for some pantry item, the token
    overlap reaches `threshold` or one normalized string is a substring of
    the other. The substring test is character-level, so "rice" also matches
    "licorice".
    """
    # Discard pantry entries that normalize to "" (e.g. "2 cups"): the empty
    # string is a substring of every string, so a single such entry would
    # mark every ingredient as already owned.
    pantry_norm = [p for p in (normalize_name(x) for x in pantry_list) if p]

    have, missing = [], []
    for ing in recipe_ings:
        matched = any(jaccard(ing, p) >= threshold or ing in p or p in ing
                      for p in pantry_norm)
        (have if matched else missing).append(ing)
    return have, missing


In [35]:
def cse_first_product_url(query, site):
    """
    Look up `query` on `site` through the Google Custom Search JSON API.

    Args:
        query: search text (an ingredient name).
        site: domain used for the "site:" restriction, e.g. "walmart.com".

    Returns:
        (url, None) on success, or (None, reason) on failure. `reason` is one
        of "cse_request_error:<ExceptionName>", "cse_http_<status>",
        "cse_bad_json", "cse_no_items" or "cse_no_link".

    Among the returned items, the first link containing a product-style path
    segment ("/ip/", "/p/", "/product/", "/gp/") wins; otherwise the first
    item's link is used.
    """
    endpoint = "https://www.googleapis.com/customsearch/v1"
    # Let requests build and encode the query string instead of formatting
    # the URL by hand.
    params = {"key": GOOGLE_API_KEY, "cx": GOOGLE_CX, "q": f"site:{site} {query}"}
    try:
        r = requests.get(endpoint, params=params, timeout=12)
    except requests.RequestException as e:
        # Report only the exception type: the full message can contain the
        # request URL, which includes the API key.
        return None, f"cse_request_error:{type(e).__name__}"
    if r.status_code != 200:
        return None, f"cse_http_{r.status_code}"
    try:
        data = r.json()
    except ValueError:
        return None, "cse_bad_json"

    items = data.get("items") or []
    if not items:
        return None, "cse_no_items"
    # prefer product-looking links
    for it in items:
        link = it.get("link", "")
        if any(p in link for p in ["/ip/", "/p/", "/product/", "/gp/"]):
            return link, None
    first_link = items[0].get("link")
    if not first_link:
        return None, "cse_no_link"
    return first_link, None

def best_store_url(item, sites=None, delay=0.2):
    """
    Try each store in turn and return the first product URL found for `item`.

    Args:
        item: search text passed to `cse_first_product_url`.
        sites: iterable of store domains to try in order. Defaults to the
            current value of PREFERRED_SITES, resolved at call time so that
            re-running the config cell takes effect without redefining this
            function.
        delay: seconds to sleep between consecutive lookups.

    Returns:
        {"store": site, "url": url, "reason": None} for the first hit, or
        {"store": None, "url": None, "reason": <last error>} when no store
        produced a URL. With an empty `sites` the reason is "no_sites"
        (previously this case raised UnboundLocalError).
    """
    if sites is None:
        sites = PREFERRED_SITES

    last_err = "no_sites"
    for attempt, site in enumerate(sites):
        if attempt:
            # pause only between lookups, not after the final one
            time.sleep(delay)
        url, err = cse_first_product_url(item, site)
        if url:
            return {"store": site, "url": url, "reason": None}
        last_err = err
    return {"store": None, "url": None, "reason": last_err}


In [36]:
# Maps the nutrientName values read from the USDA response to the labels
# used in this notebook's output.
MAIN_MACROS = {
    "Energy": "Calories",
    "Protein": "Protein (g)",
    "Total lipid (fat)": "Fat (g)",
    "Carbohydrate, by difference": "Carbs (g)",
    "Fiber, total dietary": "Fiber (g)",
    "Sugars, total including NLEA": "Sugars (g)",
}

def usda_macros(query):
    """
    Fetch the main macro-nutrients of the top USDA FoodData Central match.

    Args:
        query: food name to search for.

    Returns:
        ({"macros": {...}, "meta": {...}}, None) on success, where "macros"
        holds every label in MAIN_MACROS (None when the nutrient is absent)
        and "meta" holds "description", "brand" and "fdcId";
        (None, reason) on failure, with reason one of "usda_no_match",
        "usda_bad_json" or "usda_http_error:<ExceptionName>[_<status>]".
    """
    url = "https://api.nal.usda.gov/fdc/v1/foods/search"
    params = {"query": query, "pageSize": 1, "api_key": USDA_API_KEY}
    try:
        r = requests.get(url, params=params, timeout=12)
        r.raise_for_status()
        data = r.json()
    except requests.RequestException as e:
        # Do not embed str(e): for HTTP errors it contains the request URL,
        # and the URL carries the api_key query parameter.
        status = getattr(getattr(e, "response", None), "status_code", None)
        detail = type(e).__name__ + (f"_{status}" if status else "")
        return None, f"usda_http_error:{detail}"
    except ValueError:
        # body was not valid JSON (raised as plain ValueError by some
        # requests versions)
        return None, "usda_bad_json"

    foods = data.get("foods", [])
    if not foods:
        return None, "usda_no_match"
    f = foods[0]

    out = {}
    # NOTE(review): if a food lists the same nutrientName more than once
    # (e.g. "Energy" in different units), the last entry wins -- confirm
    # against the API response and filter by unit if needed.
    for n in f.get("foodNutrients", []):
        nm = n.get("nutrientName")
        if nm in MAIN_MACROS:
            out[MAIN_MACROS[nm]] = n.get("value")
    # ensure stable keys
    for k in MAIN_MACROS.values():
        out.setdefault(k, None)

    meta = {"description": f.get("description", "N/A"),
            "brand": f.get("brandOwner", "N/A"),
            "fdcId": f.get("fdcId")}
    return {"macros": out, "meta": meta}, None


In [37]:
def plan_and_enrich(pantry_list):
    """
    Run the full pipeline: recipe -> ingredients -> pantry diff -> shopping data.

    Args:
        pantry_list: ingredient strings the user already has.

    Returns:
        dict with keys "recipe_text", "parsed_ingredients", "have", "missing"
        and "shopping_info". The last one is a list with one record per
        missing ingredient: "item", "store", "url", "url_reason", "macros",
        "usda_meta", "macros_reason".
    """
    def _shopping_record(name):
        # one store lookup followed by one USDA lookup per missing item
        store_hit = best_store_url(name)
        nutrition, nutrition_err = usda_macros(name)
        if nutrition:
            macros, meta, macros_reason = nutrition.get("macros"), nutrition.get("meta"), None
        else:
            macros, meta, macros_reason = None, None, nutrition_err
        return {
            "item": name,
            "store": store_hit["store"],
            "url": store_hit["url"],
            "url_reason": store_hit["reason"],
            "macros": macros,
            "usda_meta": meta,
            "macros_reason": macros_reason,
        }

    recipe_text = generate_meal_plan(pantry_list)
    recipe_ings = parse_recipe_ingredients(recipe_text)
    have, missing = split_have_missing(pantry_list, recipe_ings, threshold=0.5)
    shopping_info = [_shopping_record(name) for name in missing]

    return {
        "recipe_text": recipe_text,
        "parsed_ingredients": recipe_ings,
        "have": have,
        "missing": missing,
        "shopping_info": shopping_info,
    }


In [38]:
# Demo: run the whole pipeline for a sample pantry and print a readable report.
pantry = ["rice", "beans", "tofu", "potato"]  # items the student already has on hand
result = plan_and_enrich(pantry)

# Show recipe as-is
print(result["recipe_text"])

# Summary of what was parsed and how it compares with the pantry
print("\nParsed ingredients:", result["parsed_ingredients"])
print("\nYou already have:", result["have"])
print("\nYou need to buy:")
# One entry per missing ingredient; when no URL was found, the failure
# reason is printed in its place.
for row in result["shopping_info"]:
    print(f" - {row['item']}")
    print(f"   store: {row['store'] or 'n/a'}")
    print(f"   url:   {row['url'] or row['url_reason']}")
    print(f"   macros: {row['macros']}")


Recipe: Tofu and Bean Stir Fry
Ingredients:
- 1 cup rice
- 1 block firm tofu, cubed
- 1 diced potato
- 1 tablespoon vegetable oil
- 1 tablespoon soy sauce
- 1 tablespoon sesame oil
- 1 teaspoon ginger, minced
- 1 teaspoon garlic, minced
- 1 tablespoon soy sauce
- Salt and pepper to taste

Parsed ingredients: ['rice', 'block tofu cubed', 'potato', 'vegetable oil', 'soy sauce', 'sesame oil', 'ginger', 'garlic', 'salt and pepper']

You already have: ['rice', 'block tofu cubed', 'potato']

You need to buy:
 - vegetable oil
   store: walmart.com
   url:   https://www.walmart.com/ip/Great-Value-Vegetable-Oil-48-fl-oz/10451002
   macros: {'Protein (g)': 0.0, 'Fat (g)': 100, 'Carbs (g)': 0.0, 'Calories': 857, 'Fiber (g)': None, 'Sugars (g)': None}
 - soy sauce
   store: walmart.com
   url:   https://www.walmart.com/ip/Kikkoman-Soy-Sauce-15-0-FL-OZ/10452918
   macros: {'Protein (g)': 8.14, 'Fat (g)': 0.57, 'Carbs (g)': 4.93, 'Calories': 53, 'Fiber (g)': 0.8, 'Sugars (g)': None}
 - sesame oil
  