<h1 style = "text-align: center">Collaborative Filtering Data Model</h1>

<h3 style = "text-align: center">Food.com Recipe Recommender - SOEN 471 (Big Data Analytics)</h3>

## Objective:
The objective of this notebook is to create a recommender system data model that recommends recipes based on user preferences using collaborative filtering.

In [155]:
import ast
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Walk the cleaned-data directory and print the path of every file found in it.
for dirname, _, filenames in os.walk('./clean_data'):
    file_paths = (os.path.join(dirname, entry) for entry in filenames)
    for file_path in file_paths:
        print(file_path)

./clean_data/interactions_TRAIN.csv
./clean_data/interactions_TEST.csv
./clean_data/recipes.csv


## Reading files:

In [156]:
# Load the training interactions, the testing interactions and the recipe
# catalogue, in that order, from the cleaned CSV files.
training, testing, recipes = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./clean_data/interactions_TRAIN.csv",
        "./clean_data/interactions_TEST.csv",
        "./clean_data/recipes.csv",
    )
)

# Report (rows, columns) for each loaded frame.
for label, frame in (
    ("Shape of training model: ", training),
    ("Shape of testing model: ", testing),
    ("Shape of recipes model: ", recipes),
):
    print(label, frame.shape)

Shape of training model:  (1019129, 6)
Shape of testing model:  (113237, 6)
Shape of recipes model:  (231636, 19)


## Since the data is big, we will take a small random sample:

In [157]:
# Draw 40,000 random training interactions. The fixed random_state makes the
# draw repeatable, so a fresh "Restart & Run All" reproduces the same sample
# (and therefore the same join, pivot table and recommendations) every time.
interactions_sample = training.sample(n=40000, random_state=42)
interactions_sample.head(1)

Unnamed: 0.1,Unnamed: 0,user_id,recipe_id,date,rating,review
173812,56355,24928,83268,2008-06-28,5,"Well, this is a different kind of review. I ju..."


In [158]:
# Draw 40,000 random recipes. The fixed random_state makes the draw
# repeatable, so re-running the notebook from a fresh kernel yields the same
# sample and the same downstream results.
recipes_sample = recipes.sample(n=40000, random_state=42)
recipes_sample.head(1)

Unnamed: 0.1,Unnamed: 0,name,recipe_id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,n_ingredients,Calories,Total_fat_PDV,Sugar_PDV,Sodium_PDV,Protein_PDV,Saturated_fat_PDV,Carbohydrates_PDV
198148,23673,spinach asparagus and strawberry salad,86837,55,49304,2004-03-17,"['60-minutes-or-less', 'time-to-make', 'course...",10,['to toast walnuts: place in baking pan and ba...,this recipe includes two of springs freshest i...,"['olive oil', 'asparagus', 'fresh spinach', 's...",8,308.6,37.0,15.0,5.0,30.0,41.0,3.0


## Join both samples based on `recipe_id`:

In [159]:
# Join the sampled interactions with the sampled recipes on their shared
# recipe_id key (pandas' default inner join: only recipes present in both
# samples survive).
data = interactions_sample.merge(
    recipes_sample,
    left_on='recipe_id',
    right_on='recipe_id',
)
print("The shape of the joind training data sample: ", data.shape)
data.head(2)

The shape of the joind training data sample:  (7031, 24)


Unnamed: 0,Unnamed: 0_x,user_id,recipe_id,date,rating,review,Unnamed: 0_y,name,minutes,contributor_id,...,description,ingredients,n_ingredients,Calories,Total_fat_PDV,Sugar_PDV,Sodium_PDV,Protein_PDV,Saturated_fat_PDV,Carbohydrates_PDV
0,56355,24928,83268,2008-06-28,5,"Well, this is a different kind of review. I ju...",1290,3 packet roast,485,123272,...,a wonderful roast with a very tasty gravy,"['italian salad dressing mix', 'ranch dressing...",6,296.7,15.0,0.0,18.0,95.0,19.0,1.0
1,56358,95217,83268,2009-10-19,5,I made this for Sunday dinner and it was delic...,1290,3 packet roast,485,123272,...,a wonderful roast with a very tasty gravy,"['italian salad dressing mix', 'ranch dressing...",6,296.7,15.0,0.0,18.0,95.0,19.0,1.0


## Descriptive statistics summarizing the central tendency, dispersion and shape of the dataset's distribution:

In [160]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the joined sample.
data.describe()

Unnamed: 0,Unnamed: 0_x,user_id,recipe_id,rating,Unnamed: 0_y,minutes,contributor_id,n_steps,n_ingredients,Calories,Total_fat_PDV,Sugar_PDV,Sodium_PDV,Protein_PDV,Saturated_fat_PDV,Carbohydrates_PDV
count,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0,7031.0
mean,113089.009671,143248500.0,162163.520836,4.396814,27898.910397,89.050348,1534725.0,9.643294,8.943678,455.060646,34.009956,75.990755,33.153463,35.617409,42.562651,14.885649
std,64869.981207,508957700.0,130543.035936,1.292438,16748.390905,604.9772,49167190.0,5.750866,3.657563,872.720735,73.743307,235.223431,143.968048,54.83853,94.034934,37.042358
min,7.0,1533.0,91.0,0.0,1.0,0.0,1530.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,57977.5,134388.0,56103.0,4.0,13772.0,25.0,37449.0,6.0,6.0,179.1,9.0,9.0,6.0,7.0,7.0,4.0
50%,112694.0,335737.0,123854.0,5.0,27168.0,40.0,91453.0,9.0,9.0,306.7,20.0,24.0,16.0,18.0,24.0,9.0
75%,169734.5,844554.0,246478.5,5.0,42404.5,65.0,244114.0,12.0,11.0,504.1,39.0,68.0,34.0,52.0,49.0,15.0
max,227390.0,2002368000.0,537429.0,5.0,58868.0,43200.0,2002285000.0,88.0,37.0,30933.4,2406.0,8320.0,5509.0,2030.0,2878.0,1188.0


## Create a pivot table from the data and replace any missing value by 0:

In [161]:
# Build the recipe x user rating matrix: one row per recipe_id, one column per
# user_id, each cell holding that user's rating of that recipe. Pairs with no
# rating in the sample are filled with 0.
pivot_table = data.pivot_table(
    values='rating',
    index='recipe_id',
    columns='user_id',
).fillna(0)

## Define a function that generates similar recipes based on recipe_id provided

In [162]:
def similar_recipes(recipe_id, top_n=5):
    """Return the recipes whose user-rating vectors are most similar to a recipe.

    Similarity is the cosine similarity between rows of ``pivot_table``
    (one row per recipe_id, one column per user_id).

    Parameters
    ----------
    recipe_id : int
        A recipe id that is present in ``pivot_table.index``.
    top_n : int, default 5
        Maximum number of recipes to return.

    Returns
    -------
    list[list]
        Up to ``top_n`` ``[name, ingredients]`` pairs, ordered from most to
        least similar. The queried recipe is not filtered out, so it will
        normally appear in the result.

    Raises
    ------
    KeyError
        If ``recipe_id`` is not a row label of ``pivot_table``.
    """
    recipe = pivot_table.loc[recipe_id].values.reshape(1, -1)
    cosine_similarities = cosine_similarity(pivot_table, recipe).flatten()

    # argsort returns row *positions* inside pivot_table, not recipe ids, so
    # they must be translated through pivot_table.index before any lookup.
    top_positions = cosine_similarities.argsort()[::-1][:top_n]
    related_recipe_ids = pivot_table.index[top_positions]

    # Look the recipes up by their recipe_id column; the recipes frame's own
    # index is an unrelated row counter.
    matches = recipes.loc[
        recipes['recipe_id'].isin(related_recipe_ids),
        ['recipe_id', 'name', 'ingredients'],
    ]
    related_recipes = (
        matches.drop_duplicates(subset='recipe_id')
        .set_index('recipe_id')
        .reindex(related_recipe_ids)  # restore most-similar-first order
        .dropna(how='all')            # drop ids that have no recipe row
    )
    return related_recipes.values.tolist()

## Define a function that returns recommended recipes based on the minutes and calories provided

In [163]:
def recommend_recipe(minutes, calories, minutes_tolerance=10,
                     calories_tolerance=100, top_n=10):
    """Recommend recipes close to a target preparation time and calorie count.

    Recipes in ``data`` whose ``minutes`` and ``Calories`` both fall within
    the given tolerances of the targets are used as seeds; the recipes
    returned by ``similar_recipes`` for each seed are collected.

    Parameters
    ----------
    minutes : int or float
        Target preparation time.
    calories : int or float
        Target calorie count.
    minutes_tolerance : int or float, default 10
        Allowed absolute deviation from ``minutes`` (inclusive).
    calories_tolerance : int or float, default 100
        Allowed absolute deviation from ``calories`` (inclusive).
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list[tuple]
        Up to ``top_n`` unique ``(name, ingredients)`` tuples, in the order
        in which they were first produced.
    """
    # find the recipes that have similar minutes and calories as the input
    # (Series.between is inclusive on both ends, matching >= / <=)
    minutes_ok = data['minutes'].between(
        minutes - minutes_tolerance, minutes + minutes_tolerance)
    calories_ok = data['Calories'].between(
        calories - calories_tolerance, calories + calories_tolerance)
    recipe_ids = data.loc[minutes_ok & calories_ok, 'recipe_id'].unique().tolist()

    # recommend similar recipes for each recipe in the filtered data
    recommended_recipes = []
    for recipe_id in recipe_ids:
        recommended_recipes.extend(similar_recipes(recipe_id))

    # remove duplicates; dict.fromkeys keeps first-seen order, whereas a set
    # would shuffle the list and make the slice below arbitrary
    unique_recipes = list(dict.fromkeys(tuple(recipe) for recipe in recommended_recipes))

    return unique_recipes[:top_n]

## Usage Example

In [None]:
# Ask for recipes around 30 minutes and 500 calories, then print each
# recommendation with its ingredient list.
recommended_recipes = recommend_recipe(30, 500)
print("Recommended Recipes that needs 30 minutes to prepare and contains 500 calories:")

for i, recipe in enumerate(recommended_recipes, start=1):
    recipe_name, raw_ingredients = recipe[0], recipe[1]
    print(f"{i}. Recipe Name: {recipe_name}")
    print("Ingredients:")
    # The ingredients column holds the *text* of a Python list
    # (e.g. "['olive oil', 'asparagus']"), so parse it instead of splitting
    # on ', ', which would leave brackets and quotes in the printed items.
    try:
        ingredients = ast.literal_eval(raw_ingredients)
        if not isinstance(ingredients, (list, tuple)):
            raise ValueError("ingredients is not a list literal")
    except (ValueError, SyntaxError, TypeError):
        # Not a list literal: fall back to a plain comma split.
        ingredients = str(raw_ingredients).split(', ')
    for ingredient in ingredients:
        print("- ", ingredient)
    print()