<h1 style = "text-align: center">Collaborative Filtering Data Model</h1>

<h3 style = "text-align: center">Food.com Recipe Recommender - SOEN 471 (Big Data Analytics)</h3>

## Objective:
The objective of this notebook is to create a recommender system data model that recommends recipes based on user preferences using collaborative filtering.

In [1]:
import ast
import json
import os

import dask.array as da
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# List every file under ./clean_data so the reader can confirm the inputs exist.
# (The joined path used to be computed and thrown away, so the loop had no effect.)
for dirname, _, filenames in os.walk('./clean_data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Reading files:

In [2]:
# Each CSV carries an "Unnamed: 0" column (presumably a saved index); discard it on load.
training = dd.read_csv("./clean_data/interactions_TRAIN.csv").drop(columns=["Unnamed: 0"])
# testing = dd.read_csv("./clean_data/interactions_TEST.csv").drop(columns=["Unnamed: 0"])
recipes = dd.read_csv("./clean_data/recipes.csv").drop(columns=["Unnamed: 0"])

In [3]:
# Preview the first rows of the recipes table to check the columns that were loaded
recipes.head()

Unnamed: 0,name,recipe_id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,n_ingredients,Calories,Total_fat_PDV,Sugar_PDV,Sodium_PDV,Protein_PDV,Saturated_fat_PDV,Carbohydrates_PDV
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,352.9,1.0,337.0,23.0,3.0,0.0,28.0


## Joining files based on Recipe ID:

In [4]:
# Attach each recipe's metadata to the interactions that reference it
mergedData = training.merge(recipes, on='recipe_id')

# dask's pivot_table needs the `columns` field to be categorical with known categories
mergedData = mergedData.categorize(columns=['user_id'])

# Pivot a lazy 10000-row sample: one row per recipe_id, one column per user_id, cells from `rating`
item_item_matrix = mergedData.head(10000, compute=False).pivot_table(
    index='recipe_id', columns='user_id', values='rating'
)

## Item-Item Recommendation Class:

In [5]:
class ItemItemRecommendations:
    """Recommend recipes whose nutritional profile is closest to a given recipe.

    The first ``n_rows`` rows of ``dataset`` are held in memory as a pandas
    DataFrame (``self.recipes``); similarity is the cosine similarity between
    the rows' values in ``NUTRITIONAL_COLUMNS``.
    """

    # Columns compared when measuring how similar two recipes are.
    NUTRITIONAL_COLUMNS = [
        'Calories', 'Total_fat_PDV', 'Sugar_PDV', 'Sodium_PDV',
        'Protein_PDV', 'Saturated_fat_PDV', 'Carbohydrates_PDV',
    ]

    def __init__(self, dataset, n_rows=30000):
        """Load the working set of recipes.

        Args:
            dataset: dask (or pandas) DataFrame with at least ``recipe_id``,
                ``name``, ``tags`` and the ``NUTRITIONAL_COLUMNS``.
            n_rows: how many leading rows to keep (default 30000, the value
                that used to be hard-coded).
        """
        # `tags` holds the text of a Python list literal. ast.literal_eval parses it
        # without executing arbitrary code, which the previous eval() would have done
        # for any expression that found its way into the CSV.
        self.recipes = dataset.head(n_rows).assign(
            tags=lambda frame: frame['tags'].map(ast.literal_eval)
        )
        # Not read by any method of this class; kept so existing attribute access still works.
        self.cosine_matrix = None
        self.flag = False
        # Maps recipe_id -> index label of that recipe in self.recipes.
        self.indices = pd.Series(self.recipes.index, index=self.recipes['recipe_id']).drop_duplicates()

    def get_recipe_info(self, recipe_id):
        """Return the name and nutritional values of one recipe as a dict.

        Missing nutritional values are reported as 0.

        Raises:
            KeyError: if ``recipe_id`` is not in ``self.recipes``.
        """
        # self.recipes is already a pandas frame, so there is nothing to .compute();
        # the earlier .compute() call raised AttributeError on every lookup.
        matches = self.recipes.loc[self.recipes['recipe_id'] == recipe_id]
        if matches.empty:
            raise KeyError(f"recipe_id {recipe_id} not found")
        recipe = matches.iloc[0]

        recipe_dict = {'name': recipe['name']}
        for column in self.NUTRITIONAL_COLUMNS:
            value = recipe[column]
            recipe_dict[column] = 0 if pd.isna(value) else value
        return recipe_dict

    def carbs_recommender(self, recipe_id):
        """Print the given recipe and return its 5 nutritionally closest recipes.

        Args:
            recipe_id: id of the recipe to find neighbours for.

        Returns:
            A list of up to 5 dicts ``{'name': ..., 'nutritional_info': {...}}``
            ordered from most to least similar, never containing the query
            recipe itself. An empty list (after printing a message) when
            ``recipe_id`` is unknown.
        """
        # Work on a filled copy of the nutrition columns so self.recipes is not
        # modified as a side effect of asking for a recommendation.
        nutrition = self.recipes[self.NUTRITIONAL_COLUMNS].fillna(0)
        names = self.recipes['name']

        # Locate the recipe by *position*: the feature matrix below is positional,
        # so index labels are only valid there when the index happens to be 0..n-1.
        matches = np.flatnonzero((self.recipes['recipe_id'] == recipe_id).to_numpy())
        if matches.size == 0:
            print("recipe_id not found")
            return []
        query_pos = int(matches[0])

        # Print the information of the passed recipe_id
        query_info = {'name': names.iloc[query_pos]}
        query_info.update(nutrition.iloc[[query_pos]].to_dict('records')[0])
        print(json.dumps(query_info, indent=4))

        # Cosine similarity of every recipe against the query recipe (one column of scores)
        features = nutrition.to_numpy(dtype=float)
        similarity_scores = cosine_similarity(features, features[[query_pos]]).ravel()

        # Rank by similarity and drop the query explicitly. Skipping "the first result"
        # is not enough: any recipe with proportional nutrition also scores 1.0 and can
        # outrank the query, which then leaked into its own recommendations.
        ranked = np.argsort(similarity_scores)[::-1]
        top_positions = ranked[ranked != query_pos][:5]

        # Create a dictionary for each recipe containing its name and nutritional information
        nutrition_records = nutrition.iloc[top_positions].to_dict('records')
        recommended_recipes = [
            {'name': names.iloc[position], 'nutritional_info': info}
            for position, info in zip(top_positions, nutrition_records)
        ]

        # Return the recommended recipes
        print("_____ The Recommmended Top 5 Most Similar Recipes _____")
        return recommended_recipes


## Testing the model given a random recipe_id:

In [6]:
# Build the recommender from the recipes table and query it for one recipe id
model = ItemItemRecommendations(dataset=recipes)
model.carbs_recommender(recipe_id=137739)

{
    "name": "arriba   baked winter squash mexican style",
    "Calories": 51.5,
    "Total_fat_PDV": 0.0,
    "Sugar_PDV": 13.0,
    "Sodium_PDV": 0.0,
    "Protein_PDV": 2.0,
    "Saturated_fat_PDV": 0.0,
    "Carbohydrates_PDV": 4.0
}
_____ The Recommmended Top 5 Most Similar Recipes _____


[{'name': 'baked toaster like pastries',
  'nutritional_info': {'Calories': 174.3,
   'Total_fat_PDV': 0.0,
   'Sugar_PDV': 44.0,
   'Sodium_PDV': 0.0,
   'Protein_PDV': 8.0,
   'Saturated_fat_PDV': 0.0,
   'Carbohydrates_PDV': 13.0}},
 {'name': 'aztec oranges',
  'nutritional_info': {'Calories': 140.6,
   'Total_fat_PDV': 0.0,
   'Sugar_PDV': 37.0,
   'Sodium_PDV': 0.0,
   'Protein_PDV': 4.0,
   'Saturated_fat_PDV': 0.0,
   'Carbohydrates_PDV': 11.0}},
 {'name': 'an mochi  1',
  'nutritional_info': {'Calories': 1543.6,
   'Total_fat_PDV': 6.0,
   'Sugar_PDV': 401.0,
   'Sodium_PDV': 0.0,
   'Protein_PDV': 37.0,
   'Saturated_fat_PDV': 6.0,
   'Carbohydrates_PDV': 117.0}},
 {'name': 'black forest rice pudding',
  'nutritional_info': {'Calories': 477.3,
   'Total_fat_PDV': 6.0,
   'Sugar_PDV': 117.0,
   'Sodium_PDV': 3.0,
   'Protein_PDV': 21.0,
   'Saturated_fat_PDV': 6.0,
   'Carbohydrates_PDV': 34.0}},
 {'name': 'baked acorn squash   brown sugar',
  'nutritional_info': {'Calories': 1