In [1]:
# Imports
import pandas as pd
import numpy as np

import ast
import re

In [2]:
df = pd.read_csv('../data/recipes_cleaned.csv')
df.head()

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,recipe_url,calories,fat,carbs,protein,cleaned_ingredients,time_category,calorie_category,fat_category,carbs_category,protein_category
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40.0,20.0,300,16.0,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,5,"['butter', 'white sugar', 'chocolate graham cr...",Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!
1,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20.0,5.0,25,6.0,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,10,"['chocolate chips', 'cremefilled chocolate coo...",30 minutes or less!,Don't Look!,High Fat!,High Carbs!,Low Protein!
2,Blackout Cake,Blackout cake is a moist and tender cake with ...,40.0,20.0,135,12.0,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,9,"['cooking spray', 'allpurpose flour', 'white s...",Livin in the kitchin!,High Cal!,High Fat!,Average Carbs!,Low Protein!
3,Sleeping Gingerbread Treats,Shhhh they're sleeping!,15.0,15.0,30,9.0,https://www.allrecipes.com/sleeping-gingerbrea...,231,12,29,3,"['puff pastry', 'chocolate squares', 'gingerbr...",30 minutes or less!,Low Cal!,Low Fat!,Average Carbs!,Low Protein!
4,Little Debbie Brownie Tree Dip,Turn your favorite sweet treat into the best h...,15.0,,135,6.0,https://www.allrecipes.com/little-debbie-brown...,534,31,61,6,"['little debbie® christmas tree brownies', 'cr...",Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!


In [3]:
df['cleaned_ingredients'][0]

"['butter', 'white sugar', 'chocolate graham crackers', 'chocolate', 'eggs', 'white sugar', 'brown sugar', 'water', 'salt', 'vanilla extract', 'unsalted butter', 'heavy cream', 'cream cheese', 'white sugar', 'vanilla extract', 'salt', 'heavy cream', 'chocolate sprinkles']"

### Lemmatanize

In [4]:
import ast
from nltk.stem import WordNetLemmatizer
import nltk

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to normalize and lemmatize each ingredient
def normalize_ingredient(ingredient):
    
    # Lemmatize each word in the ingredient
    words = ingredient.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(lemmatized_words)

# Convert string to a real list of ingredients
df['cleaned_ingredients'] = df['cleaned_ingredients'].apply(ast.literal_eval)

# Now apply normalization and lemmatization to each ingredient in the list
df['normalized_ingredients'] = df['cleaned_ingredients'].apply(
    lambda ingr_list: [normalize_ingredient(i) for i in ingr_list]
)

# Example output
print(df['normalized_ingredients'].iloc[0])

['butter', 'white sugar', 'chocolate graham cracker', 'chocolate', 'egg', 'white sugar', 'brown sugar', 'water', 'salt', 'vanilla extract', 'unsalted butter', 'heavy cream', 'cream cheese', 'white sugar', 'vanilla extract', 'salt', 'heavy cream', 'chocolate sprinkle']


In [5]:
def underscore_ingredients(ingredients):
    # Replace spaces in multi-word ingredients with underscores
    underscored = [ingredient.replace(' ', '_') for ingredient in ingredients]
    
    # Join into a space-separated string for TF-IDF
    return ' '.join(underscored)

# Apply the function to the 'normalized_ingredients' column
df['ingredients_str'] = df['normalized_ingredients'].apply(underscore_ingredients)

In [6]:
df['ingredients_str'][0]

'butter white_sugar chocolate_graham_cracker chocolate egg white_sugar brown_sugar water salt vanilla_extract unsalted_butter heavy_cream cream_cheese white_sugar vanilla_extract salt heavy_cream chocolate_sprinkle'

### TF-IDF
- tokenize ingredients

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Custom tokenizer that ensures multi-word ingredients remain with underscores
def custom_tokenizer(text):
    # Replace multi-word ingredient phrases with underscores and split by spaces
    # We assume the list is already in the proper format, where words like 'white_sugar' exist
    return text.split(' ')

# Instantiate
tfidf = TfidfVectorizer(tokenizer = custom_tokenizer)

In [9]:
# Fit
tfidf_matrix = tfidf.fit_transform(df['ingredients_str'])



In [10]:
# Get the feature names (words in the vocabulary)
feature_names = tfidf.get_feature_names_out()

In [11]:
print(feature_names[:20])

['achiote_powder' 'active_yeast' 'adobo_sauce'
 'adobo_sauce_chipotle_pepper' 'adobo_seasoning' 'agave_nectar'
 'agave_syrup' 'aleppo_chile' 'aleppo_chili' 'aleppo_pepper'
 'alfredo_sauce' 'all_purpose_flour' 'allpurpose_baking'
 'allpurpose_flour' 'allpurpose_flour_bread_flour'
 'allpurpose_flour_work_surface' 'allspice' 'allspice_berry' 'almond'
 'almond_butter']


In [12]:
# Example pantry entry
user_input = ['butter', 'chocolate', 'white_sugar', 'eggs', 'vanilla_extract']
pantry_vector = tfidf.transform([' '.join(user_input)])

In [13]:
# Look at cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
cosine_sim = cosine_similarity(pantry_vector, tfidf_matrix)

In [15]:
cosine_sim

array([[0.58049942, 0.10284715, 0.20328099, ..., 0.        , 0.06620717,
        0.06576676]])

In [16]:
cosine_sim_flat = cosine_sim.flatten()

In [17]:
# Get the indices of the top 5 most similar recipes
top_n_indices = cosine_sim_flat.argsort()[::-1][:5]

In [18]:
top_recipes = df.iloc[top_n_indices]

In [19]:
top_recipes[['title', 'intro', 'recipe_url']]

Unnamed: 0,title,intro,recipe_url
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,https://www.allrecipes.com/french-silk-pie-bar...
20,Chocolate Mousse for Beginners,This chocolate mousse for beginners is the all...,https://www.allrecipes.com/chocolate-mousse-fo...
1594,Very Chocolate Ice Cream,"This chocolate ice cream is a rich, custard-st...",https://www.allrecipes.com/recipe/56803/very-c...
19,Banana Bread Brownies,These banana bread brownies are chocolate brow...,https://www.allrecipes.com/banana-bread-browni...
10,Chocolate-Graham Cracker S’mores Sliders,These baked s’mores sliders bring the campfire...,https://www.allrecipes.com/chocolate-graham-cr...


### Word2Vec

In [20]:
df.head()

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,recipe_url,calories,fat,carbs,protein,cleaned_ingredients,time_category,calorie_category,fat_category,carbs_category,protein_category,normalized_ingredients,ingredients_str
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40.0,20.0,300,16.0,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,5,"[butter, white sugar, chocolate graham cracker...",Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!,"[butter, white sugar, chocolate graham cracker...",butter white_sugar chocolate_graham_cracker ch...
1,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20.0,5.0,25,6.0,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,10,"[chocolate chips, cremefilled chocolate cookie...",30 minutes or less!,Don't Look!,High Fat!,High Carbs!,Low Protein!,"[chocolate chip, cremefilled chocolate cooky, ...",chocolate_chip cremefilled_chocolate_cooky but...
2,Blackout Cake,Blackout cake is a moist and tender cake with ...,40.0,20.0,135,12.0,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,9,"[cooking spray, allpurpose flour, white sugar,...",Livin in the kitchin!,High Cal!,High Fat!,Average Carbs!,Low Protein!,"[cooking spray, allpurpose flour, white sugar,...",cooking_spray allpurpose_flour white_sugar coc...
3,Sleeping Gingerbread Treats,Shhhh they're sleeping!,15.0,15.0,30,9.0,https://www.allrecipes.com/sleeping-gingerbrea...,231,12,29,3,"[puff pastry, chocolate squares, gingerbread m...",30 minutes or less!,Low Cal!,Low Fat!,Average Carbs!,Low Protein!,"[puff pastry, chocolate square, gingerbread me...",puff_pastry chocolate_square gingerbread_men_c...
4,Little Debbie Brownie Tree Dip,Turn your favorite sweet treat into the best h...,15.0,,135,6.0,https://www.allrecipes.com/little-debbie-brown...,534,31,61,6,"[little debbie® christmas tree brownies, cream...",Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!,"[little debbie® christmas tree brownie, cream ...",little_debbie®_christmas_tree_brownie cream_ch...


In [21]:
# Split the ingredients string into tokens (words)
df['tokenized_ingredients'] = df['ingredients_str'].apply(lambda x: x.split())

# Check the tokenized ingredients
print(df['tokenized_ingredients'].head())

0    [butter, white_sugar, chocolate_graham_cracker...
1    [chocolate_chip, cremefilled_chocolate_cooky, ...
2    [cooking_spray, allpurpose_flour, white_sugar,...
3    [puff_pastry, chocolate_square, gingerbread_me...
4    [little_debbie®_christmas_tree_brownie, cream_...
Name: tokenized_ingredients, dtype: object


In [22]:
from gensim.models import Word2Vec # learns word vectors (embeddings) based on data (clean_ingredients)

In [23]:
# Train the Word2Vec model on tokenized ingredients
model = Word2Vec(sentences=df['tokenized_ingredients'], vector_size=100, window=5, min_count=1, workers=4)

In [24]:
# View vocabulary to see which words are included
vocab = list(model.wv.index_to_key)
print(vocab[:10])  # Display first 10 words in the vocabulary

['salt', 'garlic', 'black_pepper', 'olive_oil', 'onion', 'egg', 'water', 'butter', 'soy_sauce', 'white_sugar']


In [25]:
word_vector = model.wv['butter']
print(word_vector)

[-0.03398966  0.30191642  0.06014304 -0.04579302  0.05019699 -0.54832786
  0.26567578  0.77177095 -0.3753123  -0.2838381  -0.09923783 -0.5990402
 -0.04378508  0.3267153  -0.10856833 -0.2537743  -0.05143534 -0.52451104
 -0.1880564  -0.89629376  0.36707962  0.3418805   0.13273206 -0.23564906
 -0.10650773 -0.03832389 -0.4629229  -0.14224164 -0.18586738  0.09296003
  0.34825858  0.0585234   0.06064095 -0.2708937  -0.060718    0.4277937
  0.3370531  -0.14939435 -0.15323192 -0.48921287  0.0842445  -0.41968417
 -0.29627648 -0.05108272  0.33322492 -0.04453627 -0.22124703 -0.07243204
  0.36449808  0.3445569   0.24534033 -0.44995326 -0.03011054 -0.02680731
 -0.17920163  0.18161957  0.31246957 -0.08413497 -0.40570876  0.2062652
  0.05987574  0.04842714  0.3873884  -0.13774325 -0.38023978  0.43658847
  0.09344681  0.4216598  -0.5032611   0.37467965 -0.12187426  0.3810169
  0.1908906  -0.18955863  0.35233995 -0.12466365  0.02416947 -0.05080333
 -0.13947168  0.03756258 -0.24311982  0.07257099 -0.538

In [26]:
similar_words = model.wv.most_similar('butter', topn=5)
print(similar_words)

[('salt', 0.9996880292892456), ('cilantro', 0.9996777772903442), ('unsalted_butter', 0.9996607899665833), ('shallot', 0.9996492862701416), ('water', 0.999640166759491)]


In [27]:
# Function to compute average vector for each recipe
def get_average_embedding(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Apply to each row
df['embedding'] = df['tokenized_ingredients'].apply(lambda tokens: get_average_embedding(tokens, model))

In [28]:
from numpy.linalg import norm

# Stack recipe vectors into a matrix
recipe_vectors = np.vstack(df['embedding'].values)

In [29]:
# Compute pairwise cosine similarity
cosine_sim = cosine_similarity(recipe_vectors)

In [30]:
# Example user ingredients
user_ingredients = ['butter', 'chocolate', 'white_sugar', 'eggs', 'vanilla_extract']
user_vector = get_average_embedding(user_ingredients, model)

In [31]:
user_vector = user_vector.reshape(1, -1)
similarities = cosine_similarity(user_vector, recipe_vectors)[0]

In [32]:
# Get indices of top 5 matches
top_indices = similarities.argsort()[::-1][:10]

# Show top recipes
top_recipes = df.iloc[top_indices][['title', 'ingredients_str']]
print(top_recipes)

                                            title  \
0                            French Silk Pie Bars   
198                   Nectarine Blueberry Cobbler   
53                                No-Bake Cookies   
55               Chocolate Chocolate Chip Cookies   
1485                    Christmas Cheesecake Bars   
760   Individual Bourbon-Pecan French Toast Bakes   
54                       Hot Water Chocolate Cake   
1596               Blueberry Cheesecake Ice Cream   
19                          Banana Bread Brownies   
214               Strawberries and Cream Cupcakes   

                                        ingredients_str  
0     butter white_sugar chocolate_graham_cracker ch...  
198   nectarine blueberry white_sugar cornstarch all...  
53    white_sugar butter milk cocoa_powder peanut_bu...  
55    white_sugar butter egg vanilla_extract allpurp...  
1485  unsalted_butter brown_sugar white_sugar salt n...  
760   milk egg maple_syrup bourbon_whiskey vanilla_e...  
54    whit

### Ingredient Substitution Matching

In [33]:
df['ingredients_str']

0       butter white_sugar chocolate_graham_cracker ch...
1       chocolate_chip cremefilled_chocolate_cooky but...
2       cooking_spray allpurpose_flour white_sugar coc...
3       puff_pastry chocolate_square gingerbread_men_c...
4       little_debbie®_christmas_tree_brownie cream_ch...
                              ...                        
1865    milk allpurpose_flour egg vegetable_oil almond...
1866    beef_chuck salt black_pepper hickory_smoked_ba...
1867    avocado rom_tomato shallot jalapeno_pepper lem...
1868    french_bread_dough olive_oil potato smoked_bac...
1869    allpurpose_flour salt black_pepper stew_meat b...
Name: ingredients_str, Length: 1870, dtype: object

In [34]:
def find_substitutable_matches_for_df(df, user_pantry):
    """
    This function checks for ingredient substitution matches based on word overlap.
    It processes the entire DataFrame of recipes and compares the ingredients with the user's pantry.

    Parameters:
    - df: The DataFrame containing recipes with ingredients as space-separated and underscore-joined.
    - user_pantry: List of ingredients the user has in their pantry.

    Returns:
    - df_with_substitutions: DataFrame with two new columns: 'matches' (substituted ingredients) and 'missing' (ingredients without any match).
    """
    
    def find_substitutable_matches(recipe_ingredients, user_pantry):
        matches = []
        missing = []
        
        recipe_list = recipe_ingredients.split()  # Split ingredients into words
        
        for ingredient in recipe_list:
            if ingredient in user_pantry:
                matches.append((ingredient, ingredient))  # Perfect match
            else:
                # Check word-level overlap
                ingredient_words = set(ingredient.split('_'))
                found = False
                for pantry_item in user_pantry:
                    pantry_words = set(pantry_item.split('_'))
                    if ingredient_words & pantry_words:  # Overlap between words
                        matches.append((ingredient, pantry_item))  # Substitution match
                        found = True
                        break
                if not found:
                    missing.append(ingredient)  # No match or substitution
        
        return matches, missing
    
    # Apply the substitution matching to each recipe in the dataframe
    df['matches'], df['missing'] = zip(*df['ingredients_str'].apply(lambda x: find_substitutable_matches(x, user_pantry)))
    
    return df

In [35]:
# Example user pantry
#user_pantry = ['butter', 'sugar', 'almond_milk', 'egg', 'cheese']

# Apply the substitution logic to the dataframe
#df_with_substitutions = find_substitutable_matches_for_df(df, user_pantry)

# See the new dataframe with 'matches' and 'missing' columns
#print(df_with_substitutions[['title','ingredients_str', 'matches', 'missing']].head())

In [36]:
df.head()

Unnamed: 0,title,intro,prep_time,cook_time,total_time,servings,recipe_url,calories,fat,carbs,...,cleaned_ingredients,time_category,calorie_category,fat_category,carbs_category,protein_category,normalized_ingredients,ingredients_str,tokenized_ingredients,embedding
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,40.0,20.0,300,16.0,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,...,"[butter, white sugar, chocolate graham cracker...",Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!,"[butter, white sugar, chocolate graham cracker...",butter white_sugar chocolate_graham_cracker ch...,"[butter, white_sugar, chocolate_graham_cracker...","[-0.034832727, 0.26622808, 0.044742864, -0.040..."
1,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,20.0,5.0,25,6.0,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,...,"[chocolate chips, cremefilled chocolate cookie...",30 minutes or less!,Don't Look!,High Fat!,High Carbs!,Low Protein!,"[chocolate chip, cremefilled chocolate cooky, ...",chocolate_chip cremefilled_chocolate_cooky but...,"[chocolate_chip, cremefilled_chocolate_cooky, ...","[-0.013425088, 0.110976, 0.020704398, -0.02135..."
2,Blackout Cake,Blackout cake is a moist and tender cake with ...,40.0,20.0,135,12.0,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,...,"[cooking spray, allpurpose flour, white sugar,...",Livin in the kitchin!,High Cal!,High Fat!,Average Carbs!,Low Protein!,"[cooking spray, allpurpose flour, white sugar,...",cooking_spray allpurpose_flour white_sugar coc...,"[cooking_spray, allpurpose_flour, white_sugar,...","[-0.031293496, 0.23015375, 0.039598744, -0.033..."
3,Sleeping Gingerbread Treats,Shhhh they're sleeping!,15.0,15.0,30,9.0,https://www.allrecipes.com/sleeping-gingerbrea...,231,12,29,...,"[puff pastry, chocolate squares, gingerbread m...",30 minutes or less!,Low Cal!,Low Fat!,Average Carbs!,Low Protein!,"[puff pastry, chocolate square, gingerbread me...",puff_pastry chocolate_square gingerbread_men_c...,"[puff_pastry, chocolate_square, gingerbread_me...","[-0.024685394, 0.15305515, 0.023608416, -0.030..."
4,Little Debbie Brownie Tree Dip,Turn your favorite sweet treat into the best h...,15.0,,135,6.0,https://www.allrecipes.com/little-debbie-brown...,534,31,61,...,"[little debbie® christmas tree brownies, cream...",Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!,"[little debbie® christmas tree brownie, cream ...",little_debbie®_christmas_tree_brownie cream_ch...,"[little_debbie®_christmas_tree_brownie, cream_...","[-0.0068890424, 0.06670862, 0.008688562, -0.01..."


In [37]:
df.drop(columns = ['prep_time', 'cook_time', 'cleaned_ingredients', 'normalized_ingredients', 'ingredients_str'], inplace = True)

In [38]:
model_df = df

In [39]:
model_df.head()

Unnamed: 0,title,intro,total_time,servings,recipe_url,calories,fat,carbs,protein,time_category,calorie_category,fat_category,carbs_category,protein_category,tokenized_ingredients,embedding
0,French Silk Pie Bars,These French silk pie bars are sooo good. They...,300,16.0,https://www.allrecipes.com/french-silk-pie-bar...,405,31,28,5,Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!,"[butter, white_sugar, chocolate_graham_cracker...","[-0.034832727, 0.26622808, 0.044742864, -0.040..."
1,No Bake Espresso Martini Cheesecakes,These no bake espresso martini cheesecakes hav...,25,6.0,https://www.allrecipes.com/no-bake-espresso-ma...,1058,65,113,10,30 minutes or less!,Don't Look!,High Fat!,High Carbs!,Low Protein!,"[chocolate_chip, cremefilled_chocolate_cooky, ...","[-0.013425088, 0.110976, 0.020704398, -0.02135..."
2,Blackout Cake,Blackout cake is a moist and tender cake with ...,135,12.0,https://www.allrecipes.com/blackout-cake-recip...,824,55,80,9,Livin in the kitchin!,High Cal!,High Fat!,Average Carbs!,Low Protein!,"[cooking_spray, allpurpose_flour, white_sugar,...","[-0.031293496, 0.23015375, 0.039598744, -0.033..."
3,Sleeping Gingerbread Treats,Shhhh they're sleeping!,30,9.0,https://www.allrecipes.com/sleeping-gingerbrea...,231,12,29,3,30 minutes or less!,Low Cal!,Low Fat!,Average Carbs!,Low Protein!,"[puff_pastry, chocolate_square, gingerbread_me...","[-0.024685394, 0.15305515, 0.023608416, -0.030..."
4,Little Debbie Brownie Tree Dip,Turn your favorite sweet treat into the best h...,135,6.0,https://www.allrecipes.com/little-debbie-brown...,534,31,61,6,Livin in the kitchin!,Average Cal!,Average Fat!,Average Carbs!,Low Protein!,"[little_debbie®_christmas_tree_brownie, cream_...","[-0.0068890424, 0.06670862, 0.008688562, -0.01..."


## Streamlit App

In [40]:
import streamlit as st

In [41]:
# Create a python file for my app
smart_food_planner.py

NameError: name 'smart_food_planner' is not defined