<h1 style='text-align:center'>Content Based Recommender System for Recipes<h1/>

In [25]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import os
import seaborn as sns
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
import warnings

from scipy.sparse import csr_matrix
warnings.filterwarnings("ignore")

In [26]:
ratings = dd.read_csv('./clean_data/interactions_TRAIN.csv').drop('Unnamed: 0', axis=1)
ratings_test = dd.read_csv('./clean_data/interactions_TRAIN.csv').drop('Unnamed: 0', axis=1)
recipes = dd.read_csv('./clean_data/recipes.csv').drop('Unnamed: 0', axis=1)

---

# Item Profiles based on TF-IDF of tags column

In this approach of a content based recommendation engine - item profiles (and user profiles) are constructed using the TF-IDF of the `tags` column. The reason for this is because the `tags` column provides descriptive words for the associated recipes. An argument can be made to use the `description` column instead. However, `tags` was used instead for two reasons.

1. The `tags` column has no null values.
2. The `description` column is user generated - where as the `tags` column is not. The tags column is free from typos and other english words that may have hindered the performance of the TF-IDF vectorizer.

Below is a description of TF-IDF

TF (Term Frequency):
$$
\text{TF}(t,d) = \frac{n_{t,d}}{\sum_{k}n_{k,d}}
$$

where $n_{t,d}$ is the number of times term $t$ appears in document $d$, and $\sum_{k}n_{k,d}$ is the total number of terms in document $d$. This formula calculates the relative frequency of term $t$ in document $d$.

IDF (Inverse Document Frequency):
$$
\text{IDF}(t,D) = \log{\frac{|D|}{|\{d \in D : t \in d\}|}}
$$

where $|D|$ is the total number of documents in the corpus $D$, and $|\{d \in D : t \in d\}|$ is the number of documents in the corpus that contain the term $t$. This formula measures the rarity of the term $t$ in the corpus $D$.

TF-IDF (Term Frequency - Inverse Document Frequency):
$$
\text{TF-IDF}(t,d,D) = \text{TF}(t,d) \times \text{IDF}(t,D)
$$

where $\text{TF}(t,d)$ is the term frequency of term $t$ in document $d$, and $\text{IDF}(t,D)$ is the inverse document frequency of term $t$ in the corpus $D$. This formula combines the relative frequency of term $t$ in document $d$ with the rarity of term $t$ in the corpus $D$, giving higher weights to terms that are both frequent in the document and rare in the corpus.

Cosine Similarity:
$$
\text{similarity}(\textbf{v},\textbf{w}) = \frac{\textbf{v} \cdot \textbf{w}}{\|\textbf{v}\| \|\textbf{w}\|}
$$

where $\textbf{v}$ and $\textbf{w}$ are two vectors, and $\textbf{v} \cdot \textbf{w}$ is the dot product of the vectors, and $\|\textbf{v}\|$ and $\|\textbf{w}\|$ are the magnitudes of the vectors. This formula calculates the cosine of the angle between the two vectors, which measures how similar they are in direction. The result is a value between -1 and 1, where values closer to 1 indicate higher similarity.

In [27]:
class TFIDF_Recommendations():
    
    def __init__(self, recipes, ratings):
        self.recipes = recipes
        self.recipes['tags'] = self.recipes['tags'].map(eval, meta=('x', int)).map(lambda x: ' '.join(x), meta=('x', str))
        self.flag = False
        self.recipes.drop_duplicates(subset='recipe_id')
        
        # Compute weighted ratings for train
        self.ratings = ratings.merge(self.recipes, on = 'recipe_id')
        self.ratings.drop(['date', 'minutes', 'contributor_id', 'submitted', 'n_steps', 'description', 'ingredients', 'n_ingredients', 'Calories', 'Total_fat_PDV', 'Sugar_PDV', 'Sodium_PDV', 'Protein_PDV', 'Saturated_fat_PDV', 'Carbohydrates_PDV', 'steps'], axis=1)
        
        mean_user_ratings = self.ratings.groupby('user_id')['rating'].mean().rename('mean_user_rating')
        self.ratings = self.ratings.join(mean_user_ratings, on='user_id')
        self.ratings['weighted_rating'] = self.ratings['rating'] - self.ratings['mean_user_rating']
        
        # Filter recipes to only those referenced in an interaction in our training set
        rated_recipes = self.ratings['recipe_id'].unique().compute()
        self.recipes = self.recipes[self.recipes['recipe_id'].isin(rated_recipes)]
        
        # Compute weighted ratings for test
        self.ratings_test = ratings_test.merge(self.recipes, on = 'recipe_id')
        self.ratings_test.drop(['date', 'minutes', 'contributor_id', 'submitted', 'n_steps', 'description', 'ingredients', 'n_ingredients', 'Calories', 'Total_fat_PDV', 'Sugar_PDV', 'Sodium_PDV', 'Protein_PDV', 'Saturated_fat_PDV', 'Carbohydrates_PDV', 'steps'], axis=1)
        
        mean_user_ratings = self.ratings_test.groupby('user_id')['rating'].mean().rename('mean_user_rating')
        self.ratings_test = self.ratings_test.join(mean_user_ratings, on='user_id')
        self.ratings_test['weighted_rating'] = self.ratings_test['rating'] - self.ratings_test['mean_user_rating']
        
        # Compute TFDIF matrices
        self.tfidf = TfidfVectorizer(stop_words='english', token_pattern=r"(?u)\S\S+")
        
        self.tfidf_ratings_matrix = self.tfidf.fit_transform(self.ratings['tags'])
        self.tfidf_recipes_matrix = self.tfidf.fit_transform(self.recipes['tags'])
    
    
    def display_tag_distribution(self):
        
        tags_count = self.recipes['tags'].str.split().map(Counter).sum().compute().most_common()
        fig = px.bar(tags_count,
                     x = 0,
                     y = 1,
                     log_y = True,
                     title = "Count Distribution of Tags Column")
        fig.update_layout(
            xaxis_title="Tag Names",
            yaxis_title="Counts",
            yaxis = dict(
                tickmode = 'linear',
                tick0 = 0,
                dtick = 1
    ))
    
        return fig
    
    
    def generate_user_profile(self, user_id):
        rows = self.ratings.query(f'user_id == {user_id}')
        
        indices = rows.index.compute()
        
    
        ratings = self.ratings.compute().reset_index()
        user_profile = 0
        for index in indices:
            user_profile += self.tfidf_ratings_matrix[index] * ratings.loc[index]['weighted_rating']

        return user_profile
        
    
    def generate_recommendations(self, user_id):
        similar_recipes = []
        curr_similarity = 0
        user_profile = self.generate_user_profile(user_id)
        collected_profile = user_profile
    
        recipes = self.recipes.compute().reset_index()
        
        for i in range(0, self.tfidf_recipes_matrix.shape[0]):
            curr_similarity = cosine_similarity(self.tfidf_recipes_matrix[i], collected_profile)
            similar_recipes.append((curr_similarity[0][0], recipes.loc[i]['recipe_id'], i))
        
        similar_recipes = sorted(similar_recipes, key = lambda x: x[0], reverse=True)
        
        return similar_recipes
    
    def display_recommendations(self, recommendations, n):
        toReturn = []
        
        similar_recipes = recommendations[:n]
        
        recipes = self.recipes.compute().reset_index()
        for sim, _, idx in similar_recipes:
            recipe = recipes.loc[idx]['name']
            toReturn.append(recipe)
        
        return toReturn
    
    
    def display_similar_recipes_heatmap(self, user_id, similar_recipes):
        sim_scores = list(map(lambda x: x[0], similar_recipes))
        recipe_ids = list(map(lambda x: str(x), map(lambda x: x[1], similar_recipes)))

        heatmap_trace = go.Heatmap(x=recipe_ids, y=[f'User_id: {user_id}'], z=[sim_scores], colorscale='Viridis')

        # Create the layout for the plot
        layout = go.Layout(title=f'Similar recipes for {user_id}',
                           xaxis=dict(title='X-Axis', tickangle=-90),
                           yaxis=dict(title='Y-Axis', ticktext=[f'User_id: {user_id}'], tickvals=[0]),
                           height=600,
                           width=1000)

        # Create the figure and plot the heatmap
        fig = go.Figure(data=[heatmap_trace], layout=layout)

        return fig
    
    
    def evaluate_model(self, recommendations, user_id):
        # Turn recommendations into a dask dataframe
        recommendations_np = np.array(recommendations)
        recommendations = dd.from_array(recommendations_np, columns=['prediction', 'recipe_id', 'index'])
        
        # Grab the ratings dataframe to be used for testing, and filter it to only those of this user id
        ratings_test_by_user = self.ratings_test.query(f'user_id == {user_id}')
        
        # Merge the recommendations and ratings dataframe to more easily compute RMSE
        ratings_list = ratings_test_by_user.merge(recommendations, on='recipe_id')
        
        # Add the mean user rating back to get the true predicted rating
        ratings_list['prediction'] = ratings_list['prediction'] + ratings_list['mean_user_rating']

        # RMSE = SQRT[SUM[(prediction - rating)^2]]
        ratings_list = ratings_list.assign(temp=lambda x: (x.prediction - x.rating)*(x.prediction - x.rating))
        rmse_sum = ratings_list['temp'].sum().compute()
        rmse = math.sqrt(rmse_sum)

        return rmse

        

In [28]:
obj = TFIDF_Recommendations(recipes, ratings)

In [29]:
obj.display_tag_distribution()

# Recommendations
    
Using the cosine similarity, figure out what items are most similar to the highest rated items by the specified user, and print the top 10 items.


In [30]:
recommendations = obj.generate_recommendations(104295)

In [31]:
obj.display_recommendations(recommendations, 10)

['tuna   rice casserole',
 'linguine with smoked haddock  tomatoes and spinach',
 'salmon with fennel  swiss chard and penne',
 'seared salmon with linguine and ramp pesto',
 'lemony tuna pasta',
 'tuna balls with cream sauce',
 'tuna pabucas  patties  with two radish caper sauce',
 'tuna patties with ranch dressing',
 'tuna steaks with orange and rosemary',
 'tuna with wasabi lime butter sauce']

# Heatmap

This heatmap visualizes the simlarities of the top 100 recipes of the given user

In [32]:
obj.display_similar_recipes_heatmap(104295, recommendations[:100])

# Model evaluation

Compares the predictions against known ratings using the Root mean square error (RMSE) metric

In [33]:
obj.evaluate_model(recommendations, 104295)

20.879368036700317