Notes: Stage 1 is Candidate Generation. This step using TF-IDF and Cosine Similarity to Filter top 100 foods.

## **Import Library**

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from autocorrect import Speller
import numpy as np
import pickle
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('./dataset/preprocessed_recipes.csv')

In [3]:
df.head(2)

Unnamed: 0,RecipeId,NameClean,DescriptionClean,KeywordsClean,RecipeCategoryClean,ImagesClean,RecipeIngredientPartsClean,RecipeInstructionsClean,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,Combined
0,38,lowfat berry blue frozen dessert,make share lowfat berry blue frozen dessert re...,"dessert, low protein, low cholesterol, healthy...",frozen dessert,['https://img.sndimg.com/food/image/upload/w_5...,"blueberry, granulated sugar, vanilla yogurt, l...","toss 2 ups berry sugar, let stand 45 minutes, ...",170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,lowfat berry blue frozen dessertmake share low...
1,41,carina tofuvegetable kebab,dish best prepared day advance allow ingredien...,"beans, vegetable, low cholesterol, weeknight, ...","soy, tofu",['https://img.sndimg.com/food/image/upload/w_5...,"extra firm tofu, eggplant, zuhini, mushrooms, ...","drain tofu, arefully squeezing ex water, pat d...",536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,carina tofuvegetable kebabdish best prepared d...


# **Stage 1: Candidate Generation**
## **Content-Based Filtering**

*Desc:*

Similar items, such as Reels about dogs, are close together in the embedding space. The candidate generator works as follows:
- given a user, the system looks for items that are close to them in the embedding space.
- The notion of “closeness” is defined by a similarity measure


*Vectorization:*

For dietary food recommender system, TF-IDF is the most suitable method because it effectively highlights important ingredients and instructions, capturing the essence of the recipes better than binary features or BoW.

*Similarity:*

For dietary food recommender system, Cosine Similarity is the most appropriate choice due to its effectiveness in handling high-dimensional sparse data, such as TF-IDF vectors. It focuses on the direction of the vectors rather than their magnitude, making it robust for text-based features.

## User Input

In [5]:
user_favorite_foods = ['fish', 'beef']

# Generate combinations for similarity calculations
user_favorites = user_favorite_foods

In [6]:
user_favorites

['fish', 'beef']

## Vectorization: TF-IDF & Cosine Similarity

TF-IDF: Converts text into numerical vectors by capturing term importance within a document and across the corpus.

In [7]:
df_combined=df["Combined"]
df_combined

0         lowfat berry blue frozen dessertmake share low...
1         carina tofuvegetable kebabdish best prepared d...
2         cabbage soupmake share cabbage soup recipe foo...
3         buttermilk pie gingersnap crumb crustmake shar...
4         jad cucumber picklemake share jad cucumber pic...
                                ...                        
297329    mama bean saladmake share mama bean salad reci...
297330    amazing ground beef stroganoffmake share amazi...
297331    spanish coffee tia marianice digestif meal per...
297332    slowcooker classic coffee cakehouse fill aroma...
297333    meg fresh ginger gingerbreadmake share meg fre...
Name: Combined, Length: 297334, dtype: object

In [8]:
# Create a TfidfVectorizer object to transform the movie genres into a Tf-idf representation
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df_combined)

In [9]:
# Create the directory if it doesn't exist
!mkdir -p ./models

# Save the TfidfVectorizer
with open('./models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Save the Tfidf matrix
with open('./models/tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

In [10]:
# Transform user's favorite foods to TF-IDF vector
user_favorite_vector = tfidf.transform(user_favorites)

Cosine Similarity: Measures the similarity between two vectors, providing a metric to compare documents.

In [11]:
# Calculate the cosine similarity matrix between the food
cosine_similarities = cosine_similarity(user_favorite_vector, tfidf_matrix).flatten()

In [12]:
# Initialize a dictionary to store similarities
similarity_dict = {}

# Calculate similarities for each string
for favorite in user_favorites:
    # Transform the string to a TF-IDF vector
    favorite_vector = tfidf.transform([favorite])

    # Calculate cosine similarities
    similarities = cosine_similarity(favorite_vector, tfidf_matrix).flatten()

    # Store the similarities in the dictionary
    similarity_dict[favorite] = similarities

In [20]:
similarity_df = pd.DataFrame(similarity_dict, index=df["NameClean"])
similarity_df.head(100)

Unnamed: 0_level_0,fish,beef
NameClean,Unnamed: 1_level_1,Unnamed: 2_level_1
lowfat berry blue frozen dessert,0.0,0.0
carina tofuvegetable kebab,0.0,0.0
cabbage soup,0.0,0.0
buttermilk pie gingersnap crumb crust,0.0,0.0
jad cucumber pickle,0.0,0.0
...,...,...
chinese meatball,0.0,0.0
chocolate almond scone,0.0,0.0
cinnamon apple salad,0.0,0.0
chocolate bread butter pudding,0.0,0.0


## Filter High Similarity

In [21]:
top_n_high = 50
high_similarity_candidates = []

for favorite, similarities in similarity_dict.items():
    # Get top n indices
    top_n_indices = similarities.argsort()[-top_n_high:][::-1]

    # Select the top n candidate recipes
    candidate_recipes = df.iloc[top_n_indices].copy()
    candidate_recipes['cosine_similarity'] = similarities[top_n_indices]

    # Append the candidate DataFrame to the list
    high_similarity_candidates.append(candidate_recipes)

In [22]:
high_similarity_df = pd.concat(high_similarity_candidates).drop_duplicates().reset_index(drop=True)
high_similarity_df

Unnamed: 0,RecipeId,NameClean,DescriptionClean,KeywordsClean,RecipeCategoryClean,ImagesClean,RecipeIngredientPartsClean,RecipeInstructionsClean,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,Combined,cosine_similarity
0,465434,jack crevalle fish fish taco,dark red fish people throw back know always us...,"low carbs, low protein, low cholesterol, 60 min","lunch, snack",['https://img.sndimg.com/food/image/upload/w_5...,"buttermilk, tortilla, onion, tomatoes, lettue,...","fillet jak small enough otherwise, use haksaw ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,jack crevalle fish fish tacodark red fish peop...,0.599640
1,88936,fish vinaigrette,delicious cold fish dish tangy enough people n...,easy,4 hour,['https://img.sndimg.com/food/image/upload/w_5...,"water, tarragon vinegar, lemon juie, pepperorn...","combine water, 14 vinegar, 14 lemon juie, pepp...",370.4,28.1,3.9,65.0,322.0,2.2,0.2,0.6,27.2,fish vinaigrettedelicious cold fish dish tangy...,0.543207
2,469610,american kitchen classic gefilte fish,gefilte mean quotstuffedquot yiddish refers tr...,"european, low carbs, low protein, healthy, kos...",whitefish,['character(0'],"onion, arrots, elery, parsley, blak pepperorns...",plae fish trimming large soup pot along vegeta...,35.9,1.0,0.3,34.9,320.9,5.2,0.8,1.8,1.8,american kitchen classic gefilte fishgefilte m...,0.527778
3,209588,spicy thai fish cake green bean,husband make yummy fish cake nice mild fish fl...,"thai, asian, 30 mins, deep fried",whitefish,['character(0'],"thai red urry paste, egg, fish saue, sugar, or...","finely hop fish may use blender food proessor,...",161.8,2.8,0.7,129.1,798.1,9.5,2.0,2.3,23.9,spicy thai fish cake green beanhusband make yu...,0.523157
4,408552,fish goulash,make share fish goulash recipe foodcom,easy,60 min,['character(0'],"onion, green pepper, garli love, butter, olive...","saute onion, pepper garli butter oil skillet, ...",269.6,13.6,5.8,85.5,529.9,8.6,2.2,4.2,28.4,fish goulashmake share fish goulash recipe foo...,0.520845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,69653,lean beef stroganoff,make share lean beef stroganoff recipe foodcom,"meat, 60 min",one dish meal,['https://img.sndimg.com/food/image/upload/w_5...,"boneless beef top sirloin steak, mushroom, oni...","plae beef freezer make easier slie, prepare eg...",380.7,8.0,1.4,60.9,1193.1,62.5,4.0,5.8,17.0,lean beef stroganoffmake share lean beef strog...,0.496098
96,109766,crock pot beef roast,make share crock pot beef roast recipe foodcom,"meat, easy",roast beef,['character(0'],"boneless beef rump roast, beef broth","plae roast slow ooker, mix beef broth, dressin...",477.0,29.7,11.4,143.3,767.1,1.7,0.0,0.0,47.3,crock pot beef roastmake share crock pot beef ...,0.495774
97,24072,beef noodle paprikash,make share beef noodle paprikash recipe foodcom,"vegetable, meat, hungarian, european, 30 mins,...",one dish meal,['https://img.sndimg.com/food/image/upload/w_5...,"ground beef, garli, onion, green pepper, papri...","duth oven, brown beef garli, onions, green pep...",468.7,27.9,12.2,116.1,900.0,26.1,3.9,4.9,28.8,beef noodle paprikashmake share beef noodle pa...,0.493967
98,382078,creamy mushroom beef soup,make share creamy mushroom beef soup recipe fo...,"vegetable, meat, winter, 60 mins, beginner coo...",potato,['https://img.sndimg.com/food/image/upload/w_5...,"lean ground beef, water, milk, beef bouillon u...",brown beef large pot longer pink drain ex grea...,380.4,17.4,7.5,76.1,1118.1,29.2,2.9,1.6,26.3,creamy mushroom beef soupmake share creamy mus...,0.492689


## Filter Low Random Similarity

In [31]:
top_n_low = 5
low_similarity_candidates = []

for favorite, similarities in similarity_dict.items():
    # Get indices of foods with non-zero and low similarity
    non_zero_indices = np.where(similarities > 0.05)[0]
    low_similarity_indices = non_zero_indices[similarities[non_zero_indices].argsort()[:top_n_low]]

    # Select random foods from these low similarity candidates
    random_low_sim_candidates = df.iloc[low_similarity_indices].sample(top_n_low, random_state=42)
    random_low_sim_candidates['cosine_similarity'] = similarities[low_similarity_indices]

    # Append the low similarity DataFrame to the list
    low_similarity_candidates.append(random_low_sim_candidates)

In [32]:
low_similarity_df = pd.concat(low_similarity_candidates).drop_duplicates().reset_index(drop=True)
low_similarity_df

Unnamed: 0,RecipeId,NameClean,DescriptionClean,KeywordsClean,RecipeCategoryClean,ImagesClean,RecipeIngredientPartsClean,RecipeInstructionsClean,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,Combined,cosine_similarity
0,12126,"smarty pant chicken, sausage, okra gumbo rice","fast, quick, easy way make delicious cajun new...","stew, chiken, white rie, long grain rie, poult...",gumbo,['character(0'],"zatarians gumbo base mix, oak grove smoke hous...","gumbo, follow reipe gumbo base ontainer mix ba...",476.0,25.0,7.8,77.2,963.7,39.4,1.4,1.6,21.3,smarty pant chicken sausage okra gumbo ricefas...,0.050037
1,114823,south indian coconut chutney powder buttered b...,adopted recipe here original chef say quotthis...,"fruit, nuts, asian, indian, 15 min",coconut,['https://img.sndimg.com/food/image/upload/w_5...,"hanna dal, dry oonut powder, kosher salt, whol...",grind roasted hanna dal spie grinder fine powd...,507.3,14.2,6.6,24.4,1078.2,82.4,10.0,5.0,14.0,south indian coconut chutney powder buttered b...,0.050088
2,407193,hungs clay pot rice,"recipe hung huynh, student culinary institute ...","pork, rie, vegetable, meat, asian, thanksgivin...",short grain rice,['character(0'],"short-grain rie, shiitake mushrooms, sallions,...","bowl, rie water let soak grain turn white, 1 h...",258.2,3.9,0.6,0.0,200.3,51.7,3.2,3.3,4.9,hungs clay pot ricerecipe hung huynh student c...,0.05069
3,231693,longer kitchen sink shrimp amp veggie curry,"like many family tight budget wsml children, t...","fruit, vegetable, savory, 30 mins, beginner co...",one dish meal,['character(0'],"arrot, unsweetened oonut milk, fish saue, urry...",plae 1st 7 ingredient arrot thru pineapple lrg...,534.1,14.9,12.7,0.0,490.8,92.4,4.1,9.5,9.1,longer kitchen sink shrimp amp veggie currylik...,0.051029
4,492820,lowcarb arroz imperial imperial rice,tasty cuban chicken rice casserole called arro...,"vegetable, south amerian, free of, weeknight, ...",cauliflower,['character(0'],"parmesan heese, ground umin, ground oregano, o...","making chiken, wash hiken plae stok pot add en...",478.5,26.2,10.5,152.6,748.3,9.2,1.4,3.3,50.2,lowcarb arroz imperial imperial ricetasty cuba...,0.051675
5,425962,cold meat loaf kafta mahshi bil bayd,posting zwt httpwwwpremiumsavorcom via lerner ...,"southwest asia middle east, asian, 4 hour",lebanese,['character(0'],"egg, ground lamb, beef, innamon, za'atar spie ...","preheat oven 375f, plae egg medium sauepan old...",388.5,30.3,12.8,222.4,704.9,3.6,0.8,1.3,24.0,cold meat loaf kafta mahshi bil baydposting zw...,0.050106
6,319762,renalfriendly mom pot roast,kidney friendly comfort food compliment shire ...,"low carbs, easy",meat,['character(0'],"olive oil, fresh onion, garli loves, turnip, d...","brown meat sided oil, add onions, ook low heat...",266.6,19.4,7.1,61.7,67.8,4.8,0.6,1.1,17.1,renalfriendly mom pot roastkidney friendly com...,0.050148
7,410251,aunt claires pul gol gi bul gogi,aunt claires korean bbq sirloin beef one beer ...,"meat, korean, asian, 30 min",roast beef,['character(0'],"sirloin tip roast, top round roast, soy saue, ...","freeze meat one hour slie beef 18 inh thik, to...",164.5,4.4,0.6,0.0,2018.0,25.5,1.3,20.2,4.8,aunt claires pul gol gi bul gogiaunt claires k...,0.050269
8,437799,irish colcannon,colcannon traditional irish dish consists mash...,"potato, vegetable, european, low protein, hall...",mashed potato,['https://img.sndimg.com/food/image/upload/w_5...,"abbage, yellow onion, water, russet baking pot...","srub ut potato quarters, plae large pot water ...",306.3,13.7,8.5,44.8,290.4,42.4,5.8,3.3,5.5,irish colcannoncolcannon traditional irish dis...,0.050281
9,113858,spinach lasagna easy noboil method,"traditional ovenbaked layered lasagna, without...",4 hour,european,['https://img.sndimg.com/food/image/upload/w_5...,"ground beef, italian sausage, onion, mozzarell...","preheat oven 350 degree f, large skillet stok ...",457.2,23.1,11.1,127.2,567.1,32.8,2.9,7.1,28.2,spinach lasagna easy noboil methodtraditional ...,0.050329


In [33]:
high_similarity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    100 non-null    int64  
 1   NameClean                   100 non-null    object 
 2   DescriptionClean            100 non-null    object 
 3   KeywordsClean               100 non-null    object 
 4   RecipeCategoryClean         100 non-null    object 
 5   ImagesClean                 100 non-null    object 
 6   RecipeIngredientPartsClean  100 non-null    object 
 7   RecipeInstructionsClean     100 non-null    object 
 8   Calories                    100 non-null    float64
 9   FatContent                  100 non-null    float64
 10  SaturatedFatContent         100 non-null    float64
 11  CholesterolContent          100 non-null    float64
 12  SodiumContent               100 non-null    float64
 13  CarbohydrateContent         100 non-

In [34]:
low_similarity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    10 non-null     int64  
 1   NameClean                   10 non-null     object 
 2   DescriptionClean            10 non-null     object 
 3   KeywordsClean               10 non-null     object 
 4   RecipeCategoryClean         10 non-null     object 
 5   ImagesClean                 10 non-null     object 
 6   RecipeIngredientPartsClean  10 non-null     object 
 7   RecipeInstructionsClean     10 non-null     object 
 8   Calories                    10 non-null     float64
 9   FatContent                  10 non-null     float64
 10  SaturatedFatContent         10 non-null     float64
 11  CholesterolContent          10 non-null     float64
 12  SodiumContent               10 non-null     float64
 13  CarbohydrateContent         10 non-nul

## Final Candidate

In [None]:
final_candidates = pd.concat([high_similarity_df, low_similarity_df]).drop_duplicates().reset_index(drop=True)

In [None]:
final_candidates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    165 non-null    int64  
 1   NameClean                   165 non-null    object 
 2   RecipeIngredientPartsClean  165 non-null    object 
 3   RecipeInstructionsClean     165 non-null    object 
 4   Calories                    165 non-null    float64
 5   FatContent                  165 non-null    float64
 6   SaturatedFatContent         165 non-null    float64
 7   CholesterolContent          165 non-null    float64
 8   SodiumContent               165 non-null    float64
 9   CarbohydrateContent         165 non-null    float64
 10  FiberContent                165 non-null    float64
 11  SugarContent                165 non-null    float64
 12  ProteinContent              165 non-null    float64
 13  Combined                    165 non

# **Export Final Candidate**

In [None]:
final_candidates.to_csv('./dataset/final_candidates.csv', index=False)

In [None]:
high_similarity_df.to_csv('./dataset/high_similarity_df.csv', index=False)
low_similarity_df.to_csv('./dataset/low_similarity_df.csv', index=False)

In [None]:
high_similarity_df.head(2)

Unnamed: 0,RecipeId,NameClean,RecipeIngredientPartsClean,RecipeInstructionsClean,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,Combined,cosine_similarity
0,72665,ginger chilli baked fish,"lime, fresh ginger, fish saue, fresh ginger, g...","preheat oven hot 220 degree c, make 4 deep sli...",436.4,8.8,1.6,268.0,985.1,12.2,1.2,7.9,74.3,lime fresh ginger fish saue fresh ginger garli...,0.784882
1,512673,fresh fillet sole snap,"fillets of sole, garli, salt, pepper, butter, ...","spread garli side fish, sprinkle salt pepper, ...",179.7,9.7,2.6,82.2,815.2,0.8,0.1,0.2,21.4,fillets of sole garli salt pepper butter olive...,0.730094


In [None]:
low_similarity_df.head(2)

Unnamed: 0,RecipeId,NameClean,RecipeIngredientPartsClean,RecipeInstructionsClean,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,Combined,cosine_similarity
0,509635,julia amp jaquess deonstruted turkey corn brea...,"turkey arass, onions, onions, arrots, arrots, ...",rih turkey stok heat oil large heavybottomed p...,713.2,37.3,10.5,241.9,1232.9,9.6,1.9,4.0,75.7,turkey arass onions onions arrots arrots elery...,0.0133
1,312548,roasted vegetable ravioli crispy panetta,"egg, all-purpose flour, olive oil, zuhini, red...",basi pasta reipe start flour measure plae nie ...,637.8,19.7,4.6,269.3,703.2,86.3,5.3,5.7,23.4,egg all-purpose flour olive oil zuhini red bel...,0.019405
