### All imports

In [1]:
# Data processing
import pandas as pd
import numpy as np
import csv

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns


#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

# Similarity
from sklearn.metrics.pairwise import cosine_similarity


from tqdm import trange

In [2]:
user_item_rating = pd.read_csv("../data/all_recipes/user-item-rating.csv",  on_bad_lines='skip', sep = '\t', names = ['user_id', 'item_id', 'rating'])
user_item_rating


Unnamed: 0,user_id,item_id,rating
0,455,50,3.0
1,455,457,4.0
2,455,28,5.0
3,455,458,3.0
4,455,459,5.0
...,...,...,...
50676,84839,131,3.0
50677,84839,109,5.0
50678,84839,145,5.0
50679,84839,133,5.0


In [3]:
user_item_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50681 entries, 0 to 50680
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user_id  50681 non-null  int64  
 1   item_id  50681 non-null  int64  
 2   rating   50681 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.2 MB


In [4]:
# Checking for any missing data

print("Percentage null or na values in Dataset\n-------------------------------------")
((user_item_rating.isnull() | user_item_rating.isna()).sum() * 100 / user_item_rating.index.size).round(2)

Percentage null or na values in Dataset
-------------------------------------


user_id    0.0
item_id    0.0
rating     0.0
dtype: float64

In [5]:
######    IKKE VIKTIG MEN SER FINT UT???      ######


# Number of users, recipies and ratings
print('The number of users in the dataset:', user_item_rating['user_id'].nunique())

print('The number of recipies that are rated:', user_item_rating['item_id'].nunique())

print('The number of different ratings in the dataset:', user_item_rating['rating'].nunique())

print('The unique ratings are:', sorted(user_item_rating['rating'].unique()))

The number of users in the dataset: 1273
The number of recipies that are rated: 1031
The number of different ratings in the dataset: 5
The unique ratings are: [1.0, 2.0, 3.0, 4.0, 5.0]


In [6]:
# reading in data from "item-profiles2.csv" to get the recipe names

item_profiles2 = pd.read_csv("data/all_recipes/item-profiles2.csv",  on_bad_lines='skip', sep = ';')

item_profiles2

Unnamed: 0,Recipe ID,Name,Fiber (g),Sodium (g),Carbohydrates (g),Fat (g),Protein (g),Sugar (g),Saturated Fat (g),Size (g),Servings,Calories (kCal),Average Rating,Average Sentiment,Number of Ratings,Number of Bookmarks,Year of Publishing
0,2622,Slow Cooker Tender and Yummy Round Steak,4.5,0.83,33.1,13.6,33.8,5.0,4.6,2599.35,6.0,393.0,4.32,1.79,81,2271,2000
1,722,Chicken Pot Pie II,6.2,1.06,47.8,29.5,51.4,6.4,11.4,2137.86,4.0,666.0,4.66,2.02,116,1200,2000
2,1137,Chicken in a Pot,1.0,0.40,6.9,6.6,28.7,1.8,1.4,819.37,4.0,206.0,4.29,1.98,83,1779,2001
3,2502,Erin's Indonesian Chicken,6.4,0.32,58.1,18.6,35.4,7.8,3.8,1972.13,4.0,530.0,4.39,1.99,80,872,2005
4,2714,Bubble Pizza,3.0,1.96,45.4,36.4,28.5,8.7,13.4,2375.00,8.0,624.0,4.35,1.96,117,2204,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,1353,Beef and Biscuit,0.9,0.63,15.5,18.9,18.4,3.7,8.6,1536.45,10.0,306.0,3.99,1.52,79,811,2001
1027,987,Creamy Pesto Shrimp,2.7,0.44,43.0,42.5,23.1,0.2,24.3,1655.88,8.0,646.0,4.58,1.98,225,3134,2000
1028,2903,Boiled Chicken,1.2,0.06,4.5,11.1,16.3,1.9,3.0,1780.00,8.0,186.0,4.70,1.20,74,1083,2001
1029,2136,Mushroom Sauce Baked Pork Chops,2.5,1.11,28.1,14.3,19.0,6.5,6.1,2241.75,6.0,316.0,4.45,1.86,121,1188,2001


The dataframe shown above has several columns that are of no use to us, we only want the columns "Recipe ID" and "Name". 

In [7]:
recipe_names = item_profiles2[['Recipe ID', 'Name']].copy()
recipe_names.head()

Unnamed: 0,Recipe ID,Name
0,2622,Slow Cooker Tender and Yummy Round Steak
1,722,Chicken Pot Pie II
2,1137,Chicken in a Pot
3,2502,Erin's Indonesian Chicken
4,2714,Bubble Pizza


Now that we have extracted only the preferred columns, we can merge this dataframe with "user_item_rating", our other dataframe that contains the user id, recipe id and ratings. To merge these, we use "Recipe ID". Before we merge them, we change the name of the column "Recipe ID" to "item_id" to match the column in "user_item_rating". 

In [8]:
recipe_names.rename(columns={'Recipe ID':'item_id'}, inplace=True)


In [9]:
df = user_item_rating.merge(recipe_names, how= 'inner', on='item_id')
df[df['item_id']==8125]


Unnamed: 0,user_id,item_id,rating,Name
45562,8849,8125,5.0,Stuffed Chicken Valentino
45563,10088,8125,4.0,Stuffed Chicken Valentino
45564,11672,8125,4.0,Stuffed Chicken Valentino
45565,12064,8125,4.0,Stuffed Chicken Valentino
45566,20084,8125,5.0,Stuffed Chicken Valentino
45567,21517,8125,4.0,Stuffed Chicken Valentino
45568,22126,8125,4.0,Stuffed Chicken Valentino
45569,23397,8125,3.0,Stuffed Chicken Valentino
45570,24473,8125,4.0,Stuffed Chicken Valentino
45571,24585,8125,4.0,Stuffed Chicken Valentino


In [10]:
# We want to check if any recipes have very few ratings, or if any 
# users have given very few ratings. We also find the mean. 

# Group data by user, count number of ratings and shows the mean for each user: 
aggregate_by_user = df.groupby('user_id').agg(mean_rating = ('rating', 'mean'), 
number_of_ratings = ('rating', 'count')).reset_index()



# Sorts the dataframe by number of ratings, helps us see what the lowest number of recipes any user has rated is
# This also shows us that there are 1273 unique users that have given ratings (number of rows)

aggregate_by_user.sort_values('number_of_ratings')


Unnamed: 0,user_id,mean_rating,number_of_ratings
260,22109,4.550000,20
518,41530,4.600000,20
1085,75525,4.450000,20
1082,75380,5.000000,20
760,57892,3.950000,20
...,...,...,...
597,48719,4.750000,236
1142,78714,4.135802,243
320,25694,4.657258,248
1094,76151,3.771331,293


In [11]:

# Group data by recipe, count number of ratings and shows mean rating for each recipe:
aggregate_by_recipe = df.groupby('item_id').agg(mean_rating = ('rating', 'mean'), 
number_of_ratings = ('rating', 'count')).reset_index()


# Sorts the dataframe by number of ratings, shows us the number of times the least rated recipe has been rated.
#       By not using .head(), we can also see the recipes with the highest number of ratings. 
# This also shows us that there are 1031 unique recipes (number of rows)

aggregate_by_recipe.sort_values('number_of_ratings')

Unnamed: 0,item_id,mean_rating,number_of_ratings
1030,15746,4.450000,20
654,1976,4.600000,20
657,2006,4.100000,20
680,2118,4.500000,20
685,2152,4.450000,20
...,...,...,...
64,134,4.600000,290
68,140,4.429553,291
21,50,4.594156,308
66,137,4.651090,321


Seeing that no recipe is rated less than 20 times, and none of our users have given less than 20 ratings, we decide that there is no reason to remove any data. 

In [12]:
# Shows the recipes that has the best mean - given best ratings - at the top,
#   the recipes with the lowest mean - worse ratings - at the bottom:
aggregate_by_recipe.sort_values('mean_rating', ascending=False)

Unnamed: 0,item_id,mean_rating,number_of_ratings
397,956,4.960000,25
245,572,4.954545,22
70,143,4.887097,62
842,3437,4.882353,34
704,2312,4.869565,23
...,...,...,...
168,372,3.583333,36
483,1247,3.553846,65
727,2534,3.428571,28
448,1108,3.393939,33


In [13]:
# NOT IMPORTANT, JUST SHOWS THE DIFFERENT MEANS

df_mean_users = aggregate_by_user['mean_rating'].mean()
df_ratings_mean = df['rating'].mean()
df_mean_items = aggregate_by_recipe['mean_rating'].mean()
print('Mean of all the users individual means: ',df_mean_users, '\n' 'Mean of ratings from original df:', df_ratings_mean,'\n' 'Mean of each recipes individual mean:', df_mean_items)

Mean of all the users individual means:  4.404332547821255 
Mean of ratings from original df: 4.388054695053373 
Mean of each recipes individual mean: 4.352838067432242


### Følger en tutorial!
De gjør ting annerledes enn vi hadde originalt, vil bare teste litt forskjellig

In [14]:
# BUILDING RATINGS MATRIX

#Can use either item_id or Name as column, same thing?
ratings_matrix = df.pivot_table(values='rating', index='user_id', columns='item_id')

ratings_matrix.head()

item_id,4,5,17,19,22,23,28,30,31,34,...,9098,9319,9661,9757,11194,11572,11895,13339,14078,15746
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,,,,,,,5.0,,,5.0,...,,,,,,,,,,
559,,,,,,,,,,5.0,...,,,,,,,,,,
833,,,,,,,,,,,...,,,,,,,,,,
1155,,,,,,,,,,,...,,,,,,,,,,
1299,,,,,,,,,,,...,,,,,,,,,,


Some people give generally higher - or lower - ratings than others, and have different views on what a "bad rating" is (For some poeple a rating of 3 is average or "just fine", while others would consider 3 a horrendous rating). Because of this we have to normalize our data by extracting each users average rating. Movies with a rating less than the users average will get a negative rating, and movies with a rating higher than the users average will get a positive value. 

In [15]:
# Normalizing the ratings matrix

norm_ratings_matrix = ratings_matrix.subtract(ratings_matrix.mean(axis=1), axis='rows')

norm_ratings_matrix.head()

item_id,4,5,17,19,22,23,28,30,31,34,...,9098,9319,9661,9757,11194,11572,11895,13339,14078,15746
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,,,,,,,0.75,,,0.75,...,,,,,,,,,,
559,,,,,,,,,,0.45,...,,,,,,,,,,
833,,,,,,,,,,,...,,,,,,,,,,
1155,,,,,,,,,,,...,,,,,,,,,,
1299,,,,,,,,,,,...,,,,,,,,,,


## Identify similar users

### Pearson correlation coefficient

In [16]:

pearson_sim = norm_ratings_matrix.T.corr(method='pearson')
pearson_sim.head()

user_id,455,559,833,1155,1299,1381,1537,1646,1873,2020,...,84374,84543,84570,84572,84583,84767,84778,84780,84790,84839
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,1.0,1.0,-0.375,-1.0,-0.4,,,,,1.0,...,,,,-0.327327,,,,-0.5,,0.333333
559,1.0,1.0,,,,,,,,,...,,,,-0.333333,,,,,,-1.0
833,-0.375,,1.0,-0.461538,0.408248,,,-0.612372,0.57735,0.229416,...,,-0.2,,-0.258199,,,,0.3611576,,0.036274
1155,-1.0,,-0.461538,1.0,,0.5,,,,1.0,...,,-1.0,,,,0.5,,-1.0,-1.0,0.226455
1299,-0.4,,0.408248,,1.0,,,,,1.0,...,,,,1.0,,,,9.614813000000001e-17,,


In [17]:
def find_item(item):    
    #df[df['item_id'] == item]
    return aggregate_by_recipe[aggregate_by_recipe['item_id']==item]    #returns specific items mean rating + number of ratings

find_item(94)


Unnamed: 0,item_id,mean_rating,number_of_ratings
43,94,4.373626,91


In [18]:

num_sim_users = 20
sim_threshold = 0.4     # Noe her er fucka når jeg endrer dette tallet men aner ikke hvorfor
top_recipes = 10

def recommend_using_pearson(userID, itemID = None):     # Can add sim_matrix as parameter and be used for both pearson and cosine?? 
    # Making copy of the dataframe, so we dont change the original one
    sim_matrix = pearson_sim.copy()
    sim_users = sim_matrix[sim_matrix[userID]>sim_threshold][userID].sort_values(ascending=False)[:num_sim_users]
 
    if itemID != None:
        sim_user_recipes = norm_ratings_matrix[norm_ratings_matrix.index.isin(sim_users.index)].dropna(axis=1, how='all')  
    else:
        sim_users.drop(index = userID, inplace = True)
        sim_user_recipes = norm_ratings_matrix[norm_ratings_matrix.index.isin(sim_users.index)].dropna(axis=1, how='all')

        userID_rated = norm_ratings_matrix[norm_ratings_matrix.index == userID].dropna(axis=1, how= 'all')
        sim_user_recipes.drop(userID_rated.columns,axis=1, inplace= True, errors = 'ignore')
    
    avg_rating = ratings_matrix[ratings_matrix.index == userID].T.mean()[userID]
    
    item_ratings = {}   #dict to store item scores
    for i in sim_user_recipes.columns:
        recipe_rating = sim_user_recipes[i]
        total_score = 0
        num_scores = 0
        for j in sim_users.index:
            if pd.isna(recipe_rating[j]) == False:
                sum_score_rating = sim_users[j] * recipe_rating[j] 
                total_score += sum_score_rating
                num_scores+=1
        item_ratings[i] = (total_score/num_scores) + avg_rating

    #turning dict into dataframe
    item_ratings = pd.DataFrame(item_ratings.items(),columns=['recipe', 'recipe_rating'])
    
    ranking_recipe_scores = item_ratings.sort_values(by='recipe_rating', ascending=False)

    recipes_to_recommend = ranking_recipe_scores.head(top_recipes)
    if itemID != None:
        try:
            return item_ratings[item_ratings['recipe']==itemID]['recipe_rating'].item()
        except:
            return 3.0
    return recipes_to_recommend

recommend_using_pearson(455)


Unnamed: 0,recipe,recipe_rating
162,1185,5.2
305,9098,5.2
304,8090,5.2
280,4294,5.2
188,1491,5.043103
286,4722,5.043103
233,2312,5.043103
186,1480,5.0
116,815,5.0
163,1225,5.0


In [19]:
def recommend_and_predict(sim_rec, userID):
    ranking_recipe_scores = sim_rec(userID)

    # The average rating for the given user
    #avg_rating = aggregate_by_user[aggregate_by_user.index == userID]['mean_rating']
    avg_rating = ratings_matrix[ratings_matrix.index == userID].T.mean()[userID]

    ranking_recipe_scores['pred_rating'] = ranking_recipe_scores['recipe_score'] + avg_rating
    recipes_to_predict = ranking_recipe_scores.head(top_recipes)

    return recipes_to_predict

In [20]:
pearson_copy = pearson_sim.copy()


### Cosine similarity

In [21]:
df_cosim = df.copy()

df_cosim

Unnamed: 0,user_id,item_id,rating,Name
0,455,50,3.0,Baked Ziti I
1,2878,50,2.0,Baked Ziti I
2,3172,50,5.0,Baked Ziti I
3,3698,50,5.0,Baked Ziti I
4,3794,50,5.0,Baked Ziti I
...,...,...,...,...
50676,75878,3287,4.0,Oven Baked BBQ Ribs
50677,79032,3287,5.0,Oven Baked BBQ Ribs
50678,79777,3287,5.0,Oven Baked BBQ Ribs
50679,81125,3287,5.0,Oven Baked BBQ Ribs


In [22]:
norm_ratings_matrix

item_id,4,5,17,19,22,23,28,30,31,34,...,9098,9319,9661,9757,11194,11572,11895,13339,14078,15746
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,,,,,,,0.750000,,,0.750000,...,,,,,,,,,,
559,,,,,,,,,,0.450000,...,,,,,,,,,,
833,,,,,,,,,,,...,,,,,,,,,,
1155,,,,,,,,,,,...,,,,,,,,,,
1299,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84767,,,,,,,,,,,...,,,,-0.35,,,,,,
84778,,,,,,,-0.307692,,,0.692308,...,,,,,,,,,,
84780,,,,,,,,,,,...,,,,,,,,,,
84790,,0.666667,,,,,,,,,...,,,-0.333333,,,,,,,


In [23]:
not_null_matrix = norm_ratings_matrix.copy().fillna(0)

cos = cosine_similarity(not_null_matrix)
cosim_matrix = pd.DataFrame(cos,index=not_null_matrix.index)
cosim_matrix.columns = not_null_matrix.index
cosim_matrix

user_id,455,559,833,1155,1299,1381,1537,1646,1873,2020,...,84374,84543,84570,84572,84583,84767,84778,84780,84790,84839
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,1.000000,0.036559,-0.021651,-0.021935,-0.035414,-0.008890,0.030601,0.000000,0.000000,0.145863,...,-0.036435,0.004534,0.032791,-0.022519,0.027190,-0.009465,0.020536,-0.025851,0.021385,-0.024355
559,0.036559,1.000000,-0.018876,0.000000,-0.180098,0.008357,0.000000,0.012797,0.000000,0.000000,...,0.000000,0.010959,0.013211,-0.038633,0.000000,-0.047522,0.008274,0.000000,0.034464,-0.085561
833,-0.021651,-0.018876,1.000000,-0.118334,-0.024201,0.005301,-0.001976,-0.046177,0.095624,0.070376,...,0.020950,-0.010234,-0.004190,-0.022462,-0.023162,-0.112879,-0.004198,0.058900,0.018217,0.003930
1155,-0.021935,0.000000,-0.118334,1.000000,0.004350,0.031434,0.035763,0.000000,-0.018245,0.059711,...,0.004747,-0.013087,0.000863,0.000688,-0.023618,0.023343,-0.015135,-0.021774,-0.051787,0.023323
1299,-0.035414,-0.180098,-0.024201,0.004350,1.000000,0.018470,-0.038146,0.036296,0.000000,0.054549,...,0.011355,0.000000,0.000000,0.012554,0.000000,0.002682,0.000000,-0.010063,-0.028274,-0.000511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84767,-0.009465,-0.047522,-0.112879,0.023343,0.002682,0.023560,0.032927,0.000000,0.000000,-0.011011,...,-0.002865,0.029576,0.000000,0.029703,0.014254,1.000000,0.041985,-0.006941,-0.016193,-0.005674
84778,0.020536,0.008274,-0.004198,-0.015135,0.000000,-0.013631,0.029688,0.000000,0.000000,0.000000,...,-0.018406,0.000000,0.036364,0.009667,0.040204,0.041985,1.000000,-0.003729,-0.004684,-0.017402
84780,-0.025851,0.000000,0.058900,-0.021774,-0.010063,0.000000,0.000000,0.066088,-0.005297,-0.023485,...,0.009924,-0.016122,-0.015299,0.016488,0.000000,-0.006941,-0.003729,1.000000,0.000719,-0.006064
84790,0.021385,0.034464,0.018217,-0.051787,-0.028274,-0.021968,0.084865,0.018113,0.005989,0.000000,...,0.015583,-0.019821,0.009350,0.014913,-0.007753,-0.016193,-0.004684,0.000719,1.000000,-0.066578


In [24]:
def cosine_recommendation(userID, itemID = None):

    matrix = cosim_matrix.copy()
    
    sim_users = matrix[userID].sort_values(ascending=False)[:num_sim_users]
    if itemID != None:
        sim_user_recipes = norm_ratings_matrix[norm_ratings_matrix.index.isin(sim_users.index)].dropna(axis=1, how='all')
    else:
        sim_users.drop(index = userID, inplace = True)
        sim_user_recipes = norm_ratings_matrix[norm_ratings_matrix.index.isin(sim_users.index)].dropna(axis=1, how='all')

        userID_rated = norm_ratings_matrix[norm_ratings_matrix.index == userID].dropna(axis=1, how= 'all')
        sim_user_recipes.drop(userID_rated.columns,axis=1, inplace= True, errors = 'ignore')
    
    #print(f'The similar users for user {userID} are the following. Their similarity score to the right:' '\n', sim_users)
        
      
    item_ratings = {}   #dict to store item scores

    users_average = aggregate_by_user[aggregate_by_user['user_id']==userID]['mean_rating'].item()

    # Computing weighted mean
    for i in sim_user_recipes.columns:
        sim_scores = sim_users.copy()
        recipe_ratings = sim_user_recipes[i]
        idx = recipe_ratings[recipe_ratings.isnull()].index
        recipe_ratings = recipe_ratings.dropna()
        sim_scores = sim_scores.drop(idx)
        simScore = sim_scores.sum()
        if simScore == 0:
            simScore = 1
        wmean_rating = users_average + (np.dot(sim_scores, recipe_ratings) / simScore)
        item_ratings[i] = wmean_rating

    #turning dict into dataframe
    item_ratings = pd.DataFrame(item_ratings.items(),columns=['recipe', 'predicted_rating'])

    ranking_recipe_scores = item_ratings.sort_values(by='predicted_rating', ascending=False)
  
    recipes_to_recommend = ranking_recipe_scores.head(top_recipes)

    if itemID != None:
        try:
            return item_ratings[item_ratings['recipe']==itemID]['predicted_rating'].item()
        except:
            return 3.0
    
    return recipes_to_recommend
    

cosine_recommendation(455)
     

Unnamed: 0,recipe,predicted_rating
277,2925,5.185484
256,2118,5.185484
141,856,5.185484
325,5394,5.185484
279,3030,5.185484
311,4257,5.185484
225,1579,5.185484
165,987,5.138889
220,1543,5.138889
217,1510,5.138889


Because cosine similarity does not take missing values (NaN), we must convert all NaN´s to zeros.

In [25]:
matrix_dummy = norm_ratings_matrix.copy().fillna(0)
cosine_simiarity_ = cosine_similarity(matrix_dummy, matrix_dummy)

#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_simiarity_, index=norm_ratings_matrix.index, columns=norm_ratings_matrix.index)

print(cosine_sim)

user_id     455       559       833       1155      1299      1381      1537   \
user_id                                                                         
455      1.000000  0.036559 -0.021651 -0.021935 -0.035414 -0.008890  0.030601   
559      0.036559  1.000000 -0.018876  0.000000 -0.180098  0.008357  0.000000   
833     -0.021651 -0.018876  1.000000 -0.118334 -0.024201  0.005301 -0.001976   
1155    -0.021935  0.000000 -0.118334  1.000000  0.004350  0.031434  0.035763   
1299    -0.035414 -0.180098 -0.024201  0.004350  1.000000  0.018470 -0.038146   
...           ...       ...       ...       ...       ...       ...       ...   
84767   -0.009465 -0.047522 -0.112879  0.023343  0.002682  0.023560  0.032927   
84778    0.020536  0.008274 -0.004198 -0.015135  0.000000 -0.013631  0.029688   
84780   -0.025851  0.000000  0.058900 -0.021774 -0.010063  0.000000  0.000000   
84790    0.021385  0.034464  0.018217 -0.051787 -0.028274 -0.021968  0.084865   
84839   -0.024355 -0.085561 

### -----Vanessa sine endringer bare over denne linjen-----

In [26]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

In [27]:

X = df.copy()
y = df['user_id']

print(y)

0          455
1         2878
2         3172
3         3698
4         3794
         ...  
50676    75878
50677    79032
50678    79777
50679    81125
50680    83519
Name: user_id, Length: 50681, dtype: int64


In [28]:
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y,random_state=42)

In [29]:

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [31]:
def score(cf_model):
    # List of user-item tuples from test set
    id_pairs = list(zip(X_test['user_id'], X_test['item_id']))

    # Predict rating for every user-item tuple
    y_pred = []
    for i in trange(len(id_pairs)):
        (user,item) = id_pairs[i]
        y_pred.append(cf_model(user,item))

    #Get actual ratings
    y_true = np.array(X_test['rating'])
    
    print('true ratings: ')
    print(y_true[:10])
    print('predicted_ratings:')
    print(y_pred[:10])

    # Final RMSE score
    return rmse(y_true, y_pred)



In [33]:
# BUILDING RATINGS MATRIX

ratings_matrix = X_train.pivot_table(values='rating', index='user_id', columns='item_id')

ratings_matrix.head()

item_id,4,5,17,19,22,23,28,30,31,34,...,9098,9319,9661,9757,11194,11572,11895,13339,14078,15746
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,,,,,,,5.0,,,5.0,...,,,,,,,,,,
559,,,,,,,,,,,...,,,,,,,,,,
833,,,,,,,,,,,...,,,,,,,,,,
1155,,,,,,,,,,,...,,,,,,,,,,
1299,,,,,,,,,,,...,,,,,,,,,,


In [34]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, item_id):
    
    #Check if movie_id exists in r_matrix
    if item_id in ratings_matrix:
        #Compute the mean of all the ratings given to the movie
        mean_rating = ratings_matrix[item_id].mean()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0
    return mean_rating

In [35]:
score(cf_user_mean)

100%|██████████| 12671/12671 [00:00<00:00, 41475.20it/s]

true ratings: 
[4. 5. 4. 4. 4. 5. 5. 5. 4. 3.]
predicted_ratings:
[4.695238095238095, 4.886363636363637, 4.6, 4.384615384615385, 4.027777777777778, 4.142857142857143, 4.313131313131313, 4.527027027027027, 4.435897435897436, 4.533333333333333]





0.8079141460532007

Output from cell above
```
100%|██████████| 12671/12671 [00:00<00:00, 41475.20it/s]true ratings: 
[4. 5. 4. 4. 4. 5. 5. 5. 4. 3.]
predicted_ratings:
[4.695238095238095, 4.886363636363637, 4.6, 4.384615384615385, 4.027777777777778, 4.142857142857143, 4.313131313131313, 4.527027027027027, 4.435897435897436, 4.533333333333333]

0.8079141460532007
```

In [36]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = ratings_matrix.copy().fillna(0)

In [37]:

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [38]:
#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=ratings_matrix.index, columns=ratings_matrix.index)

cosine_sim.head(10)

user_id,455,559,833,1155,1299,1381,1537,1646,1873,2020,...,84374,84543,84570,84572,84583,84767,84778,84780,84790,84839
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,1.0,0.0,0.072052,0.057419,0.155429,0.031255,0.042177,0.0,0.0,0.088777,...,0.0,0.034862,0.053469,0.055017,0.050725,0.0,0.127414,0.066191,0.026025,0.077538
559,0.0,1.0,0.036873,0.0,0.0,0.0,0.0,0.072172,0.0,0.0,...,0.0,0.049954,0.0,0.096353,0.0,0.047423,0.056177,0.0,0.093228,0.0
833,0.072052,0.036873,1.0,0.125698,0.097366,0.027909,0.0,0.093547,0.059597,0.203605,...,0.100885,0.049807,0.0,0.078602,0.036236,0.0,0.028006,0.140273,0.130135,0.132934
1155,0.057419,0.0,0.125698,1.0,0.122952,0.030133,0.108433,0.0,0.0,0.087091,...,0.031121,0.055568,0.0,0.0,0.0,0.056723,0.050395,0.042542,0.058543,0.154037
1299,0.155429,0.0,0.097366,0.122952,1.0,0.031121,0.0,0.0,0.0,0.062032,...,0.026785,0.0,0.0,0.0,0.0,0.0,0.0,0.073229,0.0,0.037433
1381,0.031255,0.0,0.027909,0.030133,0.031121,1.0,0.0,0.0,0.055173,0.088683,...,0.043763,0.030248,0.046393,0.033149,0.055015,0.059823,0.085039,0.0,0.021169,0.022935
1537,0.042177,0.0,0.0,0.108433,0.0,0.0,1.0,0.0,0.0,0.0,...,0.011811,0.0,0.062604,0.044733,0.0,0.14531,0.117624,0.0,0.039993,0.041266
1646,0.0,0.072172,0.093547,0.0,0.0,0.0,0.0,1.0,0.0,0.052264,...,0.056418,0.0,0.07476,0.0,0.070924,0.0,0.0,0.0,0.045484,0.0
1873,0.0,0.0,0.059597,0.0,0.0,0.055173,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.043163,0.0,0.0,0.0,0.062315,0.082691,0.0
2020,0.088777,0.0,0.203605,0.087091,0.062032,0.088683,0.0,0.052264,0.0,1.0,...,0.0,0.045219,0.0,0.0,0.082244,0.057236,0.0,0.0,0.0,0.160003


In [41]:
score(cosine_recommendation)

100%|██████████| 12671/12671 [27:41<00:00,  7.63it/s]

true ratings: 
[4. 5. 4. 4. 4. 5. 5. 5. 4. 3.]
predicted_ratings:
[4.273401923434033, 4.826882414678494, 4.0, 4.0, 4.589032208574551, 5.0, 5.07581691190038, 4.481430723182727, 4.0, 3.376748401326009]





0.45396745851284365

Output from cell above
```
5 rows × 1031 columns

100%|██████████| 12671/12671 [27:41<00:00,  7.63it/s]true ratings: 
[4. 5. 4. 4. 4. 5. 5. 5. 4. 3.]
predicted_ratings:
[4.273401923434033, 4.826882414678494, 4.0, 4.0, 4.589032208574551, 5.0, 5.07581691190038, 4.481430723182727, 4.0, 3.376748401326009]

0.45396745851284365
```

In [None]:
score(recommend_using_pearson)