In [8]:
import pandas as pd
from pathlib import Path

DATA = Path("../data")

# Both raw CSV files are expected to sit directly under DATA.
recipes_csv = DATA / "RAW_recipes.csv"
interactions_csv = DATA / "RAW_interactions.csv"

recipes = pd.read_csv(recipes_csv)
inter = pd.read_csv(interactions_csv)

# Quick sanity check on the (rows, columns) of each table.
print("Recipes shape:", recipes.shape)
print("Interactions shape:", inter.shape)

recipes.head(3)

Recipes shape: (231637, 12)
Interactions shape: (1132367, 5)


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13


In [9]:
# Preview the first three rows of the interactions table.
inter.head(n=3)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...


# Ingredient Normalization

In [11]:
import ast
import re

import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer needs the WordNet corpus. quiet=True suppresses the
# downloader's progress log, which otherwise writes the local nltk_data
# directory (including the user profile path) into the saved cell output.
nltk.download("wordnet", quiet=True)

lemmatizer = WordNetLemmatizer()

# Measurement words that are filtered out of ingredient phrases.
# Singular and plural spellings are both listed because the filter is an
# exact string match on each word.
units = {
    "cup", "cups",
    "tablespoon", "tablespoons", "tbsp",
    "teaspoon", "teaspoons", "tsp",
    "gram", "grams", "kg", "ml", "oz",
}

def normalize_ingredients(ing_list):
    """Flatten a list of ingredient phrases into one list of normalized words.

    For every phrase in ``ing_list``:

    1. lower-case it;
    2. delete every character that is not an ASCII letter or whitespace
       (digits and punctuation are removed without leaving a space behind);
    3. split on whitespace and drop words that appear in ``units``;
    4. lemmatize the remaining words with ``lemmatizer``.

    Returns a single flat list of words in phrase order; phrase boundaries
    are not preserved.
    """
    tokens = []
    for phrase in ing_list:
        letters_only = re.sub(r"[^a-zA-Z\s]", "", phrase.lower())
        # Extending with an empty list is a no-op, so phrases that reduce to
        # nothing simply contribute no words.
        tokens += [
            lemmatizer.lemmatize(word)
            for word in letters_only.split()
            if word not in units
        ]
    return tokens

# The "ingredients" column holds each list as a string literal; parse it into
# a real Python list first, then normalize the parsed list.
parsed_ingredients = recipes["ingredients"].apply(ast.literal_eval)
recipes["ingredients_clean"] = parsed_ingredients.apply(normalize_ingredients)

recipes[["ingredients", "ingredients_clean"]].head(5)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samyaw\AppData\Roaming\nltk_data...


Unnamed: 0,ingredients,ingredients_clean
0,"['winter squash', 'mexican seasoning', 'mixed ...","[winter, squash, mexican, seasoning, mixed, sp..."
1,"['prepared pizza crust', 'sausage patty', 'egg...","[prepared, pizza, crust, sausage, patty, egg, ..."
2,"['ground beef', 'yellow onions', 'diced tomato...","[ground, beef, yellow, onion, diced, tomato, t..."
3,"['spreadable cheese with garlic and herbs', 'n...","[spreadable, cheese, with, garlic, and, herb, ..."
4,"['tomato juice', 'apple cider vinegar', 'sugar...","[tomato, juice, apple, cider, vinegar, sugar, ..."


# TF-IDF + Cosine Similarity

In [16]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TfidfVectorizer expects one string per document, so join each recipe's
# cleaned word list back into a space-separated string.
recipes["ingredients_str"] = recipes["ingredients_clean"].apply(" ".join)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(recipes["ingredients_str"])

# Pantry query: project it into the same TF-IDF space as the recipes.
pantry = ["onion", "chicken", "butter"]
pantry_str = " ".join(pantry)
pantry_vec = vectorizer.transform([pantry_str])

# Cosine similarity between the single pantry vector and every recipe;
# row 0 of the result holds one score per recipe.
similarities = cosine_similarity(pantry_vec, tfidf_matrix)

# Indices of the five highest-scoring recipes, best first.
top_idx = np.argsort(similarities[0])[::-1][:5]
recipes.iloc[top_idx][["name", "ingredients_clean"]]



Unnamed: 0,name,ingredients_clean
137946,mom florence s baked chicken,"[whole, chicken, onion, butter, salt, pepper]"
74332,easiest chicken recipe of all,"[chicken, salt]"
35996,caramelized onions in crock pot,"[sweet, onion, chicken, broth, butter]"
75771,easy chicken satay peanut curry,"[garlic, clove, onion, butter, chicken, fillet..."
168793,quick chicken rice veggie soup,"[butter, onion, garlic, clove, chicken, broth,..."


# Minimum Ingredient Overlap

* Ensures recommended recipes use things the user actually has
* Prevents results that match only on generic ingredients

In [19]:
def overlap_count(pantry, recipe_ing):
    """Return the number of distinct items present in both collections.

    Duplicates on either side are counted once.
    """
    shared = set(pantry).intersection(recipe_ing)
    return len(shared)

def recommend_recipes(pantry, top_k=5, min_overlap=2):
    """Recommend recipes for a pantry using TF-IDF similarity plus a word-overlap filter.

    The pantry is first passed through ``normalize_ingredients`` so that it is
    tokenized the same way as ``recipes["ingredients_clean"]``. Without this,
    items such as "onions" or "chicken breast" could never count toward the
    overlap, because the recipe side holds single lemmatized words.

    Parameters
    ----------
    pantry : list of str
        Ingredient phrases the user has on hand.
    top_k : int, default 5
        Maximum number of recipes to return. Values <= 0 return an empty list.
    min_overlap : int, default 2
        Minimum number of distinct normalized words a recipe must share with
        the pantry to be kept.

    Returns
    -------
    list of tuple
        ``(name, ingredients_clean, similarity)`` for each kept recipe, in
        descending similarity order. May contain fewer than ``top_k`` entries.
    """
    if top_k <= 0:
        return []

    pantry_words = normalize_ingredients(pantry)

    # A recipe cannot share more distinct words than the pantry contains, so
    # when the pantry is too small the scan below could never accept anything.
    if len(set(pantry_words)) < min_overlap:
        return []

    pantry_vec = vectorizer.transform([" ".join(pantry_words)])
    sims = cosine_similarity(pantry_vec, tfidf_matrix)[0]

    # Select the two needed columns once; indexing recipes.iloc[idx] inside
    # the loop would build a full row Series for every candidate.
    names = recipes["name"]
    ingredient_words = recipes["ingredients_clean"]

    results = []
    # Walk candidates from most to least similar and stop once enough pass.
    for idx in sims.argsort()[::-1]:
        recipe_ing = ingredient_words.iloc[idx]
        if overlap_count(pantry_words, recipe_ing) >= min_overlap:
            results.append((names.iloc[idx], recipe_ing, sims[idx]))
            if len(results) >= top_k:
                break

    return results

# Example query using the default top_k and min_overlap.
recommend_recipes(pantry=["chicken", "rice", "onion"])

[('amanda s chicken and rice',
  ['white', 'rice', 'chicken', 'butter'],
  np.float64(0.717151333467904)),
 ('too tired    broke  yellow rice and chicken',
  ['chicken', 'yellow', 'rice'],
  np.float64(0.6747335560913019)),
 ('solo sweet onion rice',
  ['olive', 'oil', 'garlic', 'onion', 'rice', 'chicken', 'stock'],
  np.float64(0.6518829353092435)),
 ('chicken or beef flavored brown rice using pampered chef s rice c',
  ['brown', 'rice', 'water', 'butter', 'chicken'],
  np.float64(0.6514635016612557)),
 ('egyptian rice for fish',
  ['rice', 'oil', 'onion', 'water', 'salt'],
  np.float64(0.62601071347968))]