In [4]:
import pandas as pd
import numpy as np

# Term Frequency – Inverse Document Frequency Vectorizer. It converts text into numerical vectors smartly, by associating rarity of 
# words(ingredients) with importance or weightage. Then these vectors can be compared.
from sklearn.feature_extraction.text import TfidfVectorizer
# Measures how similar two texts are using angle between vectors. 1.0 → identical, 0.0 → no similarity, -1.0 → opposite. It is important here
# because it focuses on overlapping of user's ingredients and ones required in the recipe. It is not impacted by order or length of recipe.
from sklearn.metrics.pairwise import cosine_similarity

import nltk
# Stopwords are very common words that add no meaning. Its better to remove them because they add noise, increase vector size, reduce 
# accuracy. In recipes stopwords are like with, and, fresh, chopped, finely. 
from nltk.corpus import stopwords
# It splits text into individual words (tokens). After removing stopwords, we get important words.
from nltk.tokenize import word_tokenize

In [5]:
df = pd.read_csv("../data/Recipes.csv")
df.head()

Unnamed: 0,recipe_title,category,subcategory,description,ingredients,directions,num_ingredients,num_steps
0,Air Fryer Potato Slices with Dipping Sauce,Air Fryer Recipes,Air Fryer Recipes,"These air fryer potato slices, served with a b...","[""3/4 cup ketchup"", ""1/2 cup beer"", ""1 tablesp...","[""Combine ketchup, beer, Worcestershire sauce,...",9,5
1,Gochujang Pork Belly Bites,Air Fryer Recipes,Air Fryer Recipes,These gochujang pork belly bites are sweet and...,"[""1 pound pork belly"", ""1/4 cup gochujang"", ""2...","[""Preheat an air fryer to 400 degrees F (200 d...",5,4
2,3-Ingredient Air Fryer Everything Bagel Chicke...,Air Fryer Recipes,Air Fryer Recipes,These 3-ingredient air fryer everything bagel ...,"[""1 \u00bc pounds chicken tenders"", ""1 tablesp...","[""Gather all ingredients. Preheat an air fryer...",3,4
3,Air Fryer Everything Bagel Chicken Cutlets,Air Fryer Recipes,Air Fryer Recipes,These air fryer everything bagel chicken cutle...,"[""4 chicken cutlets (about 1 pound total)"", ""s...","[""Preheat an air fryer to 400 degrees F (200 d...",9,9
4,Air Fryer Honey Sriracha Salmon Bites,Air Fryer Recipes,Air Fryer Recipes,These air fryer honey Sriracha salmon bites ar...,"[""1 tablespoon soy sauce"", ""1 tablespoon honey...","[""Preheat an air fryer to 400 degrees F (200 d...",5,5


In [42]:
df.columns

Index(['recipe_title', 'category', 'subcategory', 'description', 'ingredients',
       'directions', 'num_ingredients', 'num_steps', 'ingredients_list',
       'clean_ingredients'],
      dtype='object')

In [7]:
len(df)

62126

In [8]:
# Abstract Syntax Tree module. It safely converts a string that looks like Python data into real Python objects. Better than eval().
import ast

In [9]:
# '["1 pound pork belly", "1/4 cup gochujang", "2 tbsp soy sauce"]' -> ["1 pound pork belly", "1/4 cup gochujang", "2 tbsp soy sauce"]
def parse_ingredients(ingredient_str):
    return ast.literal_eval(ingredient_str)

df["ingredients_list"] = df["ingredients"].apply(parse_ingredients)

In [10]:
type(df["ingredients"].iloc[0])

str

In [11]:
type(df["ingredients_list"].iloc[0])

list

In [12]:
import re # regular expressions for pattern-based cleaning

In [13]:
# load stopwords
stop_words = set(stopwords.words("english"))

In [14]:
# Cleaning the ingredients list
def clean_ingredients(ingredients_list):
    cleaned = []

    for ingredient in ingredients_list: # "1/4 cup Gochujang"
        ingredient = ingredient.lower() # "1/4 cup gochujang"
        ingredient = re.sub(r"[^a-z\s]", "", ingredient) # " cup gochujang"
        tokens = ingredient.split() # ["cup", "gochujang"]
        tokens = [word for word in tokens if word not in stop_words] # ["gochujang"]
        cleaned.append(" ".join(tokens)) # array of all ingredients

    return " ".join(cleaned) # string of all ingredients "gochujang"

df["clean_ingredients"] = df["ingredients_list"].apply(clean_ingredients)

In [15]:
# returns dataframe of 2 columns
df[["ingredients", "clean_ingredients"]].head(3)

Unnamed: 0,ingredients,clean_ingredients
0,"[""3/4 cup ketchup"", ""1/2 cup beer"", ""1 tablesp...",cup ketchup cup beer tablespoon worcestershire...
1,"[""1 pound pork belly"", ""1/4 cup gochujang"", ""2...",pound pork belly cup gochujang tablespoons soy...
2,"[""1 \u00bc pounds chicken tenders"", ""1 tablesp...",pounds chicken tenders tablespoon olive oil cu...


In [45]:
df["clean_ingredients"].iloc[0]

'cup ketchup cup beer tablespoon worcestershire sauce teaspoon onion powder teaspoon cayenne baking potatoes olive oil cooking spray teaspoon garlic powder salt freshly ground black pepper'

In [16]:
tfidf = TfidfVectorizer(
    stop_words = "english",
    max_features = 5000 # It limits vocabulary size making model faster and cleaner
)

# A matrix with Rows = recipes and Columns = ingredients
# Values = importance of ingredient in that recipe
tfidf_matrix = tfidf.fit_transform(df["clean_ingredients"])
tfidf_matrix.shape # (62126, 4865)

(62126, 4865)

In [None]:
# This creates a 62k × 62k matrix. Precomputing, the comparison between recipes, so it can be used for a 'give me a similar recipe' feature. 
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [23]:
def ingredient_overlap(user_ingredients, recipe_ingredients):
    user_set = set(user_ingredients.lower().split())
    recipe_set = set(recipe_ingredients.split())
    return list(user_set & recipe_set)

In [25]:
NON_VEG_INGREDIENTS = {
    "chicken", "beef", "pork", "fish", "shrimp", "lamb", "turkey", "bacon"
}

NON_VEGAN_INGREDIENTS = NON_VEG_INGREDIENTS | {
    "milk", "cheese", "butter", "cream", "egg", "yogurt", "honey"
}

In [None]:
def is_veg(clean_ingredients):
    return not any(word in clean_ingredients for word in NON_VEG_INGREDIENTS)

def is_vegan(clean_ingredients):
    return not any(word in clean_ingredients for word in NON_VEGAN_INGREDIENTS)

In [40]:
def recommend_recipes(user_ingredients, top_n=5, fetch_k=20, max_steps=None, max_ingredients=None, veg=False, vegan=False, 
                      alpha=0.7, min_similarity=0.05):
    
    # Convert user input to vector
    user_vec = tfidf.transform([user_ingredients.lower()])
    
    # Compute cosine similarity between user's ingredients and all recipes
    similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()
    # similarities = [0.12, 0.87, 0.33, 0.91, 0.05, 0.76] where Index = recipe ID and Value = similarity score

    # Now, getting the top recipe indices
    sorted_indices = similarities.argsort() # It returns indices, sorted by values (ascending by default)
    top_indices = sorted_indices[-fetch_k:] # Then I take only the last fetch_k indices which correspond to highest similarities
    top_indices = top_indices[::-1] # Reverse the order to get the highest similarity first

    # These recipe relevance scores will be shown in UI
    top_scores = similarities[top_indices]
    
    # Get the recommended recipes
    recommended = df.iloc[top_indices].copy()

    recommended["match_score"] = top_scores[:len(recommended)]
    recommended["match_score"] = recommended["match_score"].round(3)

    # Drop duplicate recipe titles to avoid repetition
    recommended = recommended.drop_duplicates(subset="recipe_title")

    # Ingredient overlap
    recommended["matched_ingredients"] = recommended["clean_ingredients"].apply(
        lambda x: ingredient_overlap(user_ingredients, x)
    )
    recommended["num_matched_ingredients"] = recommended["matched_ingredients"].apply(len)

    # Ingredient overlap score
    user_ingredients_count = len(user_ingredients.split())
    recommended["ingredient_overlap_score"] = recommended["num_matched_ingredients"] / user_ingredients_count

    # Final weighted score. This custom weighted score allows to give recommendation based on both, what the recipe is and 
    # what the ingredients are. User will be able to select what do they prefer.
    recommended["final_score"] = (
        alpha * recommended["match_score"] +
        (1 - alpha) * recommended["ingredient_overlap_score"]
    )
    
    # Filters
    if max_steps:
        recommended = recommended[recommended["num_steps"] <= max_steps]
    if max_ingredients:
        recommended = recommended[recommended["num_ingredients"] <= max_ingredients]
    if veg:
        recommended = recommended[recommended["clean_ingredients"].apply(is_veg)]
    if vegan:
        recommended = recommended[recommended["clean_ingredients"].apply(is_vegan)]

    # Minimum semantic similarity threshold. Just a safety guard so that user intent is still respected.
    recommended = recommended[recommended["match_score"] >= min_similarity]
    
    # Higher semantic relevance first
    recommended = recommended.sort_values(
        by=["final_score", "num_ingredients"],  # first by score, then by fewer ingredients
        ascending=[False, True]  # final_score descending, num_ingredients ascending
    )
    
    # Keep only top_n recipes after removing duplicates
    recommended = recommended.head(top_n)

    # Select informative columns
    recommended = recommended[["recipe_title", "category", "description", "ingredients", "num_ingredients", "num_steps", 
                               "matched_ingredients", "num_matched_ingredients", "match_score", "ingredient_overlap_score",
                               "final_score"
                              ]]
    
    # Return recipe details
    return recommended

In [41]:
recommend_recipes("chicken garlic onion", top_n=5)

Unnamed: 0,recipe_title,category,description,ingredients,num_ingredients,num_steps,matched_ingredients,num_matched_ingredients,match_score,ingredient_overlap_score,final_score
26837,Easy Baked Chicken Thighs,Dinner,These easy baked chicken thighs require minimu...,"[""4 chicken thighs"", ""4 teaspoons garlic powde...",3,3,"[chicken, garlic, onion]",3,0.457,1.0,0.6199
46304,Chicken Broccoli Rice Skillet,Main Dishes,"This chicken, broccoli, and rice skillet is a ...","[""1 tablespoon butter"", ""1 small onion, diced""...",12,5,"[chicken, garlic, onion]",3,0.405,1.0,0.5835
62028,Jalapeno Popper Chicken Baked Ziti,Ziti,"Believe it or not, this recipe came to me in a...","[""2 pounds skinless, boneless chicken breast h...",7,9,"[chicken, garlic, onion]",3,0.404,1.0,0.5828
14424,Chicken Rotini Soup,Chicken Noodle Soups,This homemade chicken rotini soup is very easy...,"[""2 cubes chicken bouillon"", ""1 (12 ounce) pac...",11,3,"[chicken, garlic, onion]",3,0.404,1.0,0.5828
14329,Chicken and Tamale Dumpling Soup,Chicken And Dumplings,I created this recipe when I was craving chick...,"[""1 \u00bd pounds skinless, boneless chicken b...",7,5,"[chicken, garlic, onion]",3,0.38,1.0,0.566
