# 🍲 Recipe Recommender System

This project builds a content-based recommender that suggests recipes based on the ingredients a user already has.  
- **Dataset**: [Food.com Recipes & Interactions](https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions/data)  
- **Approach**: 
  - Clean and normalize recipe ingredients.
  - Represent recipes using TF–IDF.
  - Recommend recipes using cosine similarity.
  - Filter results with metadata (tags, cooking time).

In [106]:
import pandas as pd
import numpy as np
import re, ast
from pathlib import Path
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display

# Data Loading

In [None]:
# Folder holding the dataset CSV files; resolved relative to the working directory.
DATA = Path("./data")

# Load the recipe table and the user-interaction table into DataFrames.
recipes = pd.read_csv(DATA / "RAW_recipes.csv")
inter = pd.read_csv(DATA / "RAW_interactions.csv")

# Report (rows, columns) for each frame as a quick sanity check.
print("Recipes shape:", recipes.shape)
print("Interactions shape:", inter.shape)

# Trailing expression: the notebook renders the first three recipe rows.
recipes.head(3)

Recipes shape: (231637, 12)
Interactions shape: (1132367, 5)


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13


# Ingredient Normalization
* Lowercase all words
* Remove numbers, punctuation, and units (e.g. cups, tbsp)
* Lemmatize words (e.g. "potatoes" -> "potato")

In [108]:
# One shared WordNet lemmatizer, reused for every token.
lemmatizer = WordNetLemmatizer()

# Measurement words dropped during normalization. Tokens are compared against
# this set before they are lemmatized, so singular and plural forms are both listed.
units = {
    "cup", "cups",
    "tablespoon", "tablespoons", "tbsp",
    "teaspoon", "teaspoons", "tsp",
    "gram", "grams", "kg",
    "ml", "oz",
}

def normalize_tokens(str_list):
    """Flatten a list of ingredient strings into one list of normalized words.

    Each string is lowercased, every character that is not an ASCII letter or
    whitespace is deleted (digits, fractions, punctuation), the remainder is
    split on whitespace, measurement words found in the module-level ``units``
    set are dropped, and the surviving words are lemmatized with the
    module-level ``lemmatizer``.

    Args:
        str_list: Iterable of ingredient strings.

    Returns:
        A flat list of lemmatized words, in input order.
    """
    tokens = []
    for raw in str_list:
        # Characters are removed, not replaced by a space, so "half-and-half"
        # collapses into a single word.
        letters_only = re.sub(r"[^a-zA-Z\s]", "", raw.lower())
        tokens.extend(
            lemmatizer.lemmatize(word)
            for word in letters_only.split()
            if word not in units
        )
    return tokens

# Parse each stringified ingredient list and normalize it into a flat token list.
recipes["ingredients_clean"] = recipes["ingredients"].apply(
    lambda raw: normalize_tokens(ast.literal_eval(raw))
)


# TF-IDF + Cosine Similarity
* TF-IDF applied to weigh ingredients by importance
* Cosine Similarity used to match ingredients in pantry and recipe

In [109]:
# TfidfVectorizer expects one string per document, so join each token list with spaces.
recipes["ingredients_str"] = recipes["ingredients_clean"].map(" ".join)

# Learn the vocabulary and IDF weights from all recipes, then encode them as
# a TF-IDF matrix with one row per recipe.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(recipes["ingredients_str"])


# Recipe Recommender

* Matches pantry items with recipes using cosine similarity
* Applies filters (minimum ingredient overlap, cooking time, tags)

In [110]:
# Parse the stringified tag list of every recipe into a real Python list.
recipes["tags_list"] = recipes["tags"].map(ast.literal_eval)

# returns how many ingredients overlap
def overlap_count(pantry_clean, recipe_clean):
    """Return the number of distinct tokens present in both token lists."""
    shared = set(pantry_clean).intersection(recipe_clean)
    return len(shared)

def recommend_recipes(
    pantry,
    top_k=5,
    min_overlap=2,
    max_minutes=None,
    include_tags=None,
    exclude_tags=None,
):
    """Return up to ``top_k`` recipes ranked by similarity to the pantry.

    The pantry is normalized with ``normalize_tokens``, encoded with the
    module-level ``vectorizer`` and compared against every row of the
    module-level ``tfidf_matrix`` using cosine similarity. Recipes from the
    module-level ``recipes`` frame are then visited from most to least similar
    and kept only if they pass every filter.

    Args:
        pantry: Iterable of ingredient strings the user has.
        top_k: Maximum number of recipes to return. Values <= 0 yield ``[]``.
        min_overlap: Minimum number of distinct normalized tokens a recipe
            must share with the pantry.
        max_minutes: Upper bound on the recipe's ``minutes`` value. ``None``
            disables the filter; any number, including 0, is applied as a limit.
        include_tags: If given (and non-empty), a recipe must carry at least
            one of these tags.
        exclude_tags: If given (and non-empty), a recipe carrying any of
            these tags is dropped.

    Returns:
        A list of dicts with keys ``name``, ``minutes``, ``tags``,
        ``ingredients``, ``similarity`` and ``matches`` (the shared tokens,
        sorted alphabetically), ordered by descending similarity.
    """
    # A non-positive top_k can never be reached by appending, so answer it up front.
    if top_k <= 0:
        return []

    pantry_clean = normalize_tokens(pantry)
    # Built once: the pantry does not change while recipes are scanned.
    pantry_tokens = set(pantry_clean)

    # The overlap is a subset of the pantry tokens, so when the pantry itself is
    # smaller than min_overlap no recipe can pass; skip the full scan.
    if len(pantry_tokens) < min_overlap:
        return []

    # Encode the pantry as a single document in the recipes' TF-IDF space.
    pantry_vec = vectorizer.transform([" ".join(pantry_clean)])
    sims = cosine_similarity(pantry_vec, tfidf_matrix)[0]

    results = []

    # Visit recipes from most to least similar.
    for idx in sims.argsort()[::-1]:
        recipe = recipes.iloc[idx]
        recipe_clean = recipe["ingredients_clean"]
        recipe_tags = recipe["tags_list"]

        overlap = pantry_tokens.intersection(recipe_clean)

        # Filters
        if len(overlap) < min_overlap:
            continue
        # Compare against None so that an explicit 0 is still a real limit.
        if max_minutes is not None and recipe["minutes"] > max_minutes:
            continue
        if include_tags and not any(tag in recipe_tags for tag in include_tags):
            continue
        if exclude_tags and any(tag in recipe_tags for tag in exclude_tags):
            continue

        results.append({
            "name": recipe["name"],
            "minutes": recipe["minutes"],
            "tags": recipe_tags,
            "ingredients": recipe_clean,
            "similarity": sims[idx],
            # Sorted so the output is the same from run to run.
            "matches": sorted(overlap),
        })

        if len(results) >= top_k:
            break

    return results


# Display Recommendations

In [111]:
def display_recommendations(recommendations):
    df = pd.DataFrame(recommendations)
    # shorten tag lists
    df["tags"] = df["tags"].apply(lambda t: ", ".join(t[:5]))
    df["matches"] = df["matches"].apply(lambda m: ", ".join(m))
    df["similarity"] = df["similarity"].round(3)
    
    # bolds matching ingredients
    def highlight_matches(row):
        ingredients = []
        for ing in row["ingredients"]:
            if ing in row["matches"].split(", "):
                ingredients.append(f"<b>{ing}</b>")
            else:
                ingredients.append(ing)
        return ", ".join(ingredients)
    
    df["ingredients_highlighted"] = df.apply(highlight_matches, axis=1)
    
    return df[["name", "minutes", "matches", "tags", "similarity", "ingredients_highlighted"]].style.format(
        {"ingredients_highlighted": lambda x: x})


# DEMO

In [112]:
# Ingredients on hand for the demo query.
pantry = ["onion", "rice", "chicken"]

# Quick meals only (30 minutes or less), and nothing tagged as dessert.
recommendations = recommend_recipes(
    pantry=pantry,
    max_minutes=30,
    exclude_tags=["dessert"],
)

display_recommendations(recommendations)

Unnamed: 0,name,minutes,matches,tags,similarity,ingredients_highlighted
0,too tired broke yellow rice and chicken,20,"rice, chicken","30-minutes-or-less, time-to-make, course, main-ingredient, preparation",0.675,"chicken, yellow, rice"
1,solo sweet onion rice,30,"rice, onion, chicken","30-minutes-or-less, time-to-make, course, main-ingredient, preparation",0.652,"olive, oil, garlic, onion, rice, chicken, stock"
2,egyptian rice for fish,30,"rice, onion","30-minutes-or-less, time-to-make, course, main-ingredient, cuisine",0.626,"rice, oil, onion, water, salt"
3,country rice,30,"rice, onion, chicken","30-minutes-or-less, time-to-make, main-ingredient, preparation, healthy",0.59,"chicken, stock, green, onion, black, pepper, white, rice"
4,basic rice pilaf,25,"rice, onion, chicken","30-minutes-or-less, time-to-make, main-ingredient, preparation, healthy",0.586,"butter, onion, rice, salt, black, pepper, chicken, stock, parsley"


# Summary

My recipe recommender project suggests dishes based on the ingredients in the user's pantry.
The workflow includes:
- Cleaning and normalizing ingredient text
- Representing recipes with TF–IDF vectors.  
- Ranking results using cosine similarity.  
- Filtering by metadata

### Next Steps
- Experiment with collaborative filtering using the dataset on user interactions.
- Deploy as a Streamlit app to make it interactive