In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
# from dask_ml.feature_extraction import HashingVectorizer
import os
import seaborn as sns
from collections import Counter
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from scipy.sparse import csr_matrix
warnings.filterwarnings("ignore")

# import findspark
# findspark.init()

# import pyspark
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col

# spark = SparkSession.builder.config("spark.driver.memory", "8g").appName('SparkByExamples.com').getOrCreate()
# sc=spark.sparkContext

---

## Item Profiles based on TF-IDF of tags column

In [2]:
ratings = dd.read_csv('./clean_data/interactions_TRAIN.csv').drop('Unnamed: 0', axis=1)
ratings_test = dd.read_csv('./clean_data/interactions_TRAIN.csv').drop('Unnamed: 0', axis=1)
recipes = dd.read_csv('./clean_data/recipes.csv').drop('Unnamed: 0', axis=1)

# ratings_TRAIN_dask = dd.from_pandas(ratings_TRAIN, 1).reset_index().drop('index', axis=1)
# recipes_dask = dd.read_csv('./clean_data/recipes.csv').drop('Unnamed: 0', axis=1)

In [3]:
class TFIDF_Recommendations():
    
    def __init__(self, recipes, ratings):
        self.recipes = recipes
        self.recipes['tags'] = self.recipes['tags'].map(eval, meta=('x', int)).map(lambda x: ' '.join(x), meta=('x', str))
        self.cosine_matrix = None
        self.flag = False
        self.recipes.drop_duplicates(subset='recipe_id')
        # self.indices = dd.Series(self.recipes.index, index = self.recipes['recipe_id']).drop_duplicates())
        
        # compute weighted rating for train
        self.ratings = ratings.merge(self.recipes, on = 'recipe_id')
        self.ratings.drop(['date', 'minutes', 'contributor_id', 'submitted', 'n_steps', 'description', 'ingredients', 'n_ingredients', 'Calories', 'Total_fat_PDV', 'Sugar_PDV', 'Sodium_PDV', 'Protein_PDV', 'Saturated_fat_PDV', 'Carbohydrates_PDV', 'steps'], axis=1)
        
        mean_user_ratings = self.ratings.groupby('user_id')['rating'].mean().rename('mean_user_rating')
        self.ratings = self.ratings.join(mean_user_ratings, on='user_id')
        self.ratings['weighted_rating'] = self.ratings['rating'] - self.ratings['mean_user_rating']
        
        # filter recipes
        rated_recipes = self.ratings['recipe_id'].unique().compute()
        self.recipes = self.recipes[self.recipes['recipe_id'].isin(rated_recipes)]
        
        # compute weighted rating for test
        self.ratings_test = ratings_test.merge(self.recipes, on = 'recipe_id')
        self.ratings_test.drop(['date', 'minutes', 'contributor_id', 'submitted', 'n_steps', 'description', 'ingredients', 'n_ingredients', 'Calories', 'Total_fat_PDV', 'Sugar_PDV', 'Sodium_PDV', 'Protein_PDV', 'Saturated_fat_PDV', 'Carbohydrates_PDV', 'steps'], axis=1)
        
        mean_user_ratings = self.ratings_test.groupby('user_id')['rating'].mean().rename('mean_user_rating')
        self.ratings_test = self.ratings_test.join(mean_user_ratings, on='user_id')
        self.ratings_test['weighted_rating'] = self.ratings_test['rating'] - self.ratings_test['mean_user_rating']
        
        # compute TFDIF
        self.tfidf = TfidfVectorizer(stop_words='english', token_pattern=r"(?u)\S\S+")
        
        self.tfidf_ratings_matrix = self.tfidf.fit_transform(self.ratings['tags'])
        self.tfidf_recipes_matrix = self.tfidf.fit_transform(self.recipes['tags'])
    
    def display_tag_distribution(self):
        
        tags_count = self.recipes['tags'].map(Counter).sum().compute().most_common()
        fig = px.bar(tags_count,
                     x = 0,
                     y = 1,
                     log_y = True,
                     title = "Count Distribution of Tags Column")
        fig.update_layout(
            xaxis_title="Tag Names",
            yaxis_title="Counts",
            yaxis = dict(
                tickmode = 'linear',
                tick0 = 0,
                dtick = 1
    ))
    
        return fig
    
    def display_cosine_matrix(self, n=5):
        
        try:
            fig = px.imshow(self.cosine_matrix[:n, :n],
                            labels = dict(x = "Recipe Index", y = "Recipe Index", color = "Cosine Similarity"),
                            title = "Cosine similarity of Recipes",
                            text_auto = True)
        except:
            raise Exception("Generate a cosine matrix first by calling generate_cosine_sim_matrix()")
            
        return fig
    
    def generate_user_profile(self, user_id):
        rows = self.ratings.query(f'user_id == {user_id}')
        
        indices = rows.index.compute()
        
    
        ratings = self.ratings.compute().reset_index()
        user_profile = 0
        for index in indices:
            user_profile += self.tfidf_ratings_matrix[index] * ratings.loc[index]['weighted_rating']

        return user_profile
        
    
    def generate_recommendations(self, user_id):
        similar_recipes = []
        curr_similarity = 0
        user_profile = self.generate_user_profile(user_id)
        collected_profile = user_profile
    
        recipes = self.recipes.compute().reset_index()
        
        for i in range(0, self.tfidf_recipes_matrix.shape[0]):
            curr_similarity = cosine_similarity(self.tfidf_recipes_matrix[i], collected_profile)
            similar_recipes.append((curr_similarity[0][0], recipes.loc[i]['recipe_id'], i))
        
        similar_recipes = sorted(similar_recipes, key = lambda x: x[0])
        
        return similar_recipes
    
    def display_recommendations(self, recommendations, n):
        toReturn = []
        
        similar_recipes = recommendations[:n]
        
        recipes = self.recipes.compute().reset_index()
        for sim, _, idx in similar_recipes:
            recipe = recipes.loc[idx]['name']
            toReturn.append(recipe)
        
        return toReturn
        
        

In [4]:
obj = TFIDF_Recommendations(recipes, ratings)

In [5]:
recommendations = obj.generate_recommendations(104295)

In [6]:
obj.display_recommendations(recommendations, 10)

['bulgur custard bake',
 'chicken tortilla rollups',
 'banana   walnut cake',
 'chicken with vinegar',
 'roast chicken with grand marnier glaze',
 'simon   garfunkel roast chicken',
 'bacon roasted chicken with stuffing',
 'beer brined chicken',
 'braised to be praised paprika chicken',
 'fried chicken with chicken gravy']

In [7]:
def evaluate_model(obj, recommendations, user_id):
    import math
    
    recommendations_np = np.array(recommendations)
    recommendations = dd.from_array(recommendations_np, columns=['prediction', 'recipe_id', 'index'])
    ratings_test_by_user = obj.ratings_test.query(f'user_id == {user_id}')
    print(ratings_test_by_user.compute())
    print(recommendations.compute())
    ratings_list = ratings_test_by_user.merge(recommendations, on='recipe_id')
    
    ratings_list = ratings_list.assign(temp=lambda x: (x.prediction - x.weighted_rating)*(x.prediction - x.weighted_rating))
    
    print(ratings_list.compute())
    
    rmse_sum = ratings_list['temp'].sum().compute()
    
    print (rmse_sum)
    
    rmse = math.sqrt(rmse_sum)
    print (rmse)
    return rmse

In [8]:
evaluate_model(obj, recommendations, 104295)

        user_id  recipe_id        date  rating   
4712     104295      32204  2008-01-27       5  \
6057     104295     186124  2007-01-26       5   
12627    104295     101104  2006-11-24       5   
12870    104295      57919  2004-04-03       3   
15684    104295      32059  2005-04-12       5   
...         ...        ...         ...     ...   
203920   104295     233549  2009-12-22       5   
204353   104295     128805  2006-06-07       4   
204542   104295     377487  2009-09-22       3   
204769   104295     153051  2008-07-24       4   
205359   104295      53395  2004-01-21       4   

                                                   review   
4712    Yep.  Very easy and super good.  DH made them ...  \
6057    Excellent recipe.  My squash were, apparently,...   
12627   This is the pie recipe I always use.  Classic ...   
12870   We thought this was so-so.  I used half the am...   
15684   Very moist incredibly low fat muffin.  I used ...   
...                              

20.836389089042143