### All imports

In [1]:
# Data processing
import pandas as pd
import numpy as np
import csv

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns


#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
user_item_rating = pd.read_csv("user-item-rating.csv",  on_bad_lines='skip', sep = '\t', names = ['user_id', 'item_id', 'rating'])
user_item_rating

Unnamed: 0,user_id,item_id,rating
0,455,50,3.0
1,455,457,4.0
2,455,28,5.0
3,455,458,3.0
4,455,459,5.0
...,...,...,...
50676,84839,131,3.0
50677,84839,109,5.0
50678,84839,145,5.0
50679,84839,133,5.0


In [3]:
user_item_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50681 entries, 0 to 50680
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user_id  50681 non-null  int64  
 1   item_id  50681 non-null  int64  
 2   rating   50681 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.2 MB


In [4]:
# Checking for any missing data

print("Percentage null or na values in Dataset\n-------------------------------------")
((user_item_rating.isnull() | user_item_rating.isna()).sum() * 100 / user_item_rating.index.size).round(2)

Percentage null or na values in Dataset
-------------------------------------


user_id    0.0
item_id    0.0
rating     0.0
dtype: float64

In [5]:
######    IKKE VIKTIG MEN SER FINT UT???      ######


# Number of users, recipies and ratings
print('The number of users in the dataset:', user_item_rating['user_id'].nunique())

print('The number of recipies that are rated:', user_item_rating['item_id'].nunique())

print('The number of different ratings in the dataset:', user_item_rating['rating'].nunique())

print('The unique ratings are:', sorted(user_item_rating['rating'].unique()))

The number of users in the dataset: 1273
The number of recipies that are rated: 1031
The number of different ratings in the dataset: 5
The unique ratings are: [1.0, 2.0, 3.0, 4.0, 5.0]


In [6]:
# reading in data from "item-profiles2.csv" to get the recipe names

item_profiles2 = pd.read_csv("item-profiles2.csv",  on_bad_lines='skip', sep = ';')

item_profiles2

Unnamed: 0,Recipe ID,Name,Fiber (g),Sodium (g),Carbohydrates (g),Fat (g),Protein (g),Sugar (g),Saturated Fat (g),Size (g),Servings,Calories (kCal),Average Rating,Average Sentiment,Number of Ratings,Number of Bookmarks,Year of Publishing
0,2622,Slow Cooker Tender and Yummy Round Steak,4.5,0.83,33.1,13.6,33.8,5.0,4.6,2599.35,6.0,393.0,4.32,1.79,81,2271,2000
1,722,Chicken Pot Pie II,6.2,1.06,47.8,29.5,51.4,6.4,11.4,2137.86,4.0,666.0,4.66,2.02,116,1200,2000
2,1137,Chicken in a Pot,1.0,0.40,6.9,6.6,28.7,1.8,1.4,819.37,4.0,206.0,4.29,1.98,83,1779,2001
3,2502,Erin's Indonesian Chicken,6.4,0.32,58.1,18.6,35.4,7.8,3.8,1972.13,4.0,530.0,4.39,1.99,80,872,2005
4,2714,Bubble Pizza,3.0,1.96,45.4,36.4,28.5,8.7,13.4,2375.00,8.0,624.0,4.35,1.96,117,2204,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,1353,Beef and Biscuit,0.9,0.63,15.5,18.9,18.4,3.7,8.6,1536.45,10.0,306.0,3.99,1.52,79,811,2001
1027,987,Creamy Pesto Shrimp,2.7,0.44,43.0,42.5,23.1,0.2,24.3,1655.88,8.0,646.0,4.58,1.98,225,3134,2000
1028,2903,Boiled Chicken,1.2,0.06,4.5,11.1,16.3,1.9,3.0,1780.00,8.0,186.0,4.70,1.20,74,1083,2001
1029,2136,Mushroom Sauce Baked Pork Chops,2.5,1.11,28.1,14.3,19.0,6.5,6.1,2241.75,6.0,316.0,4.45,1.86,121,1188,2001


The dataframe shown above has several columns that are of no use to us, we only want the columns "Recipe ID" and "Name". 

In [7]:
recipe_names = item_profiles2[['Recipe ID', 'Name']].copy()
recipe_names.head()

Unnamed: 0,Recipe ID,Name
0,2622,Slow Cooker Tender and Yummy Round Steak
1,722,Chicken Pot Pie II
2,1137,Chicken in a Pot
3,2502,Erin's Indonesian Chicken
4,2714,Bubble Pizza


Now that we have extracted only the preferred columns, we can merge this dataframe with "user_item_rating", our other dataframe that contains the user id, recipe id and ratings. To merge these, we use "Recipe ID". Before we merge them, we change the name of the column "Recipe ID" to "item_id" to match the column in "user_item_rating". 

In [8]:
recipe_names.rename(columns={'Recipe ID':'item_id'}, inplace=True)
recipe_names.head()

Unnamed: 0,item_id,Name
0,2622,Slow Cooker Tender and Yummy Round Steak
1,722,Chicken Pot Pie II
2,1137,Chicken in a Pot
3,2502,Erin's Indonesian Chicken
4,2714,Bubble Pizza


In [48]:
df = user_item_rating.merge(recipe_names, how= 'inner', on='item_id')


0          50
1          50
2          50
3          50
4          50
         ... 
50676    3287
50677    3287
50678    3287
50679    3287
50680    3287
Name: item_id, Length: 50681, dtype: int64

In [10]:
# We want to check if any recipes have very few ratings, or if any 
# users have given very few ratings. We also find the mean. 

# Group data by user, count number of ratings and shows the mean for each user: 
aggregate_by_user = df.groupby('user_id').agg(mean_rating = ('rating', 'mean'), 
number_of_ratings = ('rating', 'count')).reset_index()



# Sorts the dataframe by number of ratings, helps us see what the lowest number of recipes any user has rated is
# This also shows us that there are 1273 unique users that have given ratings (number of rows)

aggregate_by_user.sort_values('number_of_ratings')


Unnamed: 0,user_id,mean_rating,number_of_ratings
260,22109,4.550000,20
518,41530,4.600000,20
1085,75525,4.450000,20
1082,75380,5.000000,20
760,57892,3.950000,20
...,...,...,...
597,48719,4.750000,236
1142,78714,4.135802,243
320,25694,4.657258,248
1094,76151,3.771331,293


In [11]:

# Group data by recipe, count number of ratings and shows mean rating for each recipe:
aggregate_by_recipe = df.groupby('item_id').agg(mean_rating = ('rating', 'mean'), 
number_of_ratings = ('rating', 'count')).reset_index()


# Sorts the dataframe by number of ratings, shows us the number of times the least rated recipe has been rated.
#       By not using .head(), we can also see the recipes with the highest number of ratings. 
# This also shows us that there are 1031 unique recipes (number of rows)

aggregate_by_recipe.sort_values('number_of_ratings')

Unnamed: 0,item_id,mean_rating,number_of_ratings
1030,15746,4.450000,20
654,1976,4.600000,20
657,2006,4.100000,20
680,2118,4.500000,20
685,2152,4.450000,20
...,...,...,...
64,134,4.600000,290
68,140,4.429553,291
21,50,4.594156,308
66,137,4.651090,321


Seeing that no recipe is rated less than 20 times, and none of our users have given less than 20 ratings, we decide that there is no reason to remove any data. 

In [12]:
# Shows the recipes that has the best mean - given best ratings - at the top,
#   the recipes with the lowest mean - worse ratings - at the bottom:
aggregate_by_recipe.sort_values('mean_rating', ascending=False)

Unnamed: 0,item_id,mean_rating,number_of_ratings
397,956,4.960000,25
245,572,4.954545,22
70,143,4.887097,62
842,3437,4.882353,34
704,2312,4.869565,23
...,...,...,...
168,372,3.583333,36
483,1247,3.553846,65
727,2534,3.428571,28
448,1108,3.393939,33


In [13]:
# NOT IMPORTANT, JUST SHOWS THE DIFFERENT MEANS

df_mean_users = aggregate_by_user['mean_rating'].mean()
df_ratings_mean = df['rating'].mean()
df_mean_items = aggregate_by_recipe['mean_rating'].mean()
print('Mean of all the users own means: ',df_mean_users, '\n' 'Mean of ratings from "normal" df:', df_ratings_mean,'\n' 'Mean of each recipes own mean:', df_mean_items)

Mean of all the users own means:  4.404332547821255 
Mean of ratings from "normal" df: 4.388054695053373 
Mean of each recipes own mean: 4.352838067432242


### Følger en tutorial!
De gjør ting annerledes enn vi hadde originalt, vil bare teste litt forskjellig

In [50]:
# BUILDING RATINGS MATRIX

#Can use either item_id or Name as column, same thing?
ratings_matrix = df.pivot_table(values='rating', index='user_id', columns='Name')

ratings_matrix.head()

Name,A Good Easy Garlic Chicken,A Jerky Chicken,Acapulco Chicken,Actually Delicious Turkey Burgers,Addictive Sweet Potato Burritos,African Chicken Stew,Aimee's Quick Chicken,Alaska Salmon Bake with Pecan Crunch Coating,Alfredo Mostaccioli,Alice Chicken,...,Whole Wheat and Honey Pizza Dough,World's Best Lasagna,Yakisoba Chicken,Yummy Honey Chicken Kabobs,Yummy Pork Chops,Zesty Slow Cooker Chicken Barbecue,Zippy Summer Shrimp,Zucchini Alfredo,Zucchini Parmesan,Zucchini Patties
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,,,,,,,,,,,...,,4.0,,,,,,,,4.0
559,,,,,,,,,,,...,,,,,,,,,,
833,,,,,,,,,,,...,,,,,,5.0,,,,
1155,,,,,,,,,,,...,,4.0,,,,3.0,,,,
1299,,,,,,,,,,,...,,,,,,,,,,


Some people give generally higher - or lower - ratings than others, and have different views on what a "bad rating" is (For some poeple a rating of 3 is average or "just fine", while others would consider 3 a horrendous rating). Because of this we have to normalize our data by extracting each users average rating. Movies with a rating less than the users average will get a negative rating, and movies with a rating higher than the users average will get a positive value. 

In [15]:
# Normalizing the ratings matrix

norm_ratings_matrix = ratings_matrix.subtract(ratings_matrix.mean(axis=1), axis='rows')

norm_ratings_matrix.head()

Name,A Good Easy Garlic Chicken,A Jerky Chicken,Acapulco Chicken,Actually Delicious Turkey Burgers,Addictive Sweet Potato Burritos,African Chicken Stew,Aimee's Quick Chicken,Alaska Salmon Bake with Pecan Crunch Coating,Alfredo Mostaccioli,Alice Chicken,...,Whole Wheat and Honey Pizza Dough,World's Best Lasagna,Yakisoba Chicken,Yummy Honey Chicken Kabobs,Yummy Pork Chops,Zesty Slow Cooker Chicken Barbecue,Zippy Summer Shrimp,Zucchini Alfredo,Zucchini Parmesan,Zucchini Patties
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,,,,,,,,,,,...,,-0.25,,,,,,,,-0.25
559,,,,,,,,,,,...,,,,,,,,,,
833,,,,,,,,,,,...,,,,,,0.227848,,,,
1155,,,,,,,,,,,...,,0.029412,,,,-0.970588,,,,
1299,,,,,,,,,,,...,,,,,,,,,,


## Identify similar users

### Pearson correlation coefficient

In [16]:

pearson_sim = norm_ratings_matrix.T.corr()
pearson_sim.head()

user_id,455,559,833,1155,1299,1381,1537,1646,1873,2020,...,84374,84543,84570,84572,84583,84767,84778,84780,84790,84839
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,1.0,1.0,-0.375,-1.0,-0.4,,,,,1.0,...,,,,-0.327327,,,,-0.5,,0.333333
559,1.0,1.0,,,,,,,,,...,,,,-0.333333,,,,,,-1.0
833,-0.375,,1.0,-0.461538,0.408248,,,-0.612372,0.57735,0.229416,...,,-0.2,,-0.258199,,,,0.3611576,,0.036274
1155,-1.0,,-0.461538,1.0,,0.5,,,,1.0,...,,-1.0,,,,0.5,,-1.0,-1.0,0.226455
1299,-0.4,,0.408248,,1.0,,,,,1.0,...,,,,1.0,,,,-9.614813000000001e-17,,


How to find similar users? We can use user_id 455 to show.

In [17]:
# Exclude user 455 from the similar user list, and choose number of similar users

our_user = 455

pearson_sim.drop(index = our_user, inplace=True)

pearson_sim.head()  # shows that user 455 is gone!

user_id,455,559,833,1155,1299,1381,1537,1646,1873,2020,...,84374,84543,84570,84572,84583,84767,84778,84780,84790,84839
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
559,1.0,1.0,,,,,,,,,...,,,,-0.333333,,,,,,-1.0
833,-0.375,,1.0,-0.461538,0.408248,,,-0.612372,0.57735,0.229416,...,,-0.2,,-0.258199,,,,0.3611576,,0.036274
1155,-1.0,,-0.461538,1.0,,0.5,,,,1.0,...,,-1.0,,,,0.5,,-1.0,-1.0,0.226455
1299,-0.4,,0.408248,,1.0,,,,,1.0,...,,,,1.0,,,,-9.614813000000001e-17,,
1381,,,,0.5,,1.0,,,,,...,,,,,,,,,-0.5,


In the matrix above, the values range from -1 to 1. -1 represents an oppsite preference, 1 represents the same preference. We want to find the top 10 most similar users to user 455. 

We chose the threshold = 0.3, which means that a user must have at least a Pearson correlation coefficient of 0.3 for us to consider them a similar user. 

In [18]:
num_sim_users = 10

sim_threshold = 0.2

sim_users = pearson_sim[pearson_sim[our_user]>sim_threshold][our_user].sort_values(ascending= False)[:num_sim_users]

sorted_users = pearson_sim[pearson_sim[our_user]==1.0][our_user].sort_values(ascending= False)

#print the top 10 similar users:
print(f'The similar users for user {our_user} are', sim_users)

#how many users have a pearson correlation coefficient of 1.0?? 
print('The number of users that have a pearson correlation coefficient of 1.0 with user455 is:',sorted_users.count())

The similar users for user 455 are user_id
559      1.0
41398    1.0
54812    1.0
53966    1.0
2020     1.0
51896    1.0
48039    1.0
47640    1.0
45946    1.0
45643    1.0
Name: 455, dtype: float64
The number of users that have a pearson correlation coefficient of 1.0 with user455 is: 66


Because the point is to recommend new recipes to user455, we can now try to "clean up" a bit. We only want to keep the recipes that our list of similar users have rated, but if user455 has rated any of these they must be removed. 

In [19]:
# Removing all recipes that none of the similar users have rated. 
sim_user_recipes = norm_ratings_matrix[norm_ratings_matrix.index.isin(sim_users.index)].dropna(axis=1, how='all')

sim_user_recipes

Name,A Good Easy Garlic Chicken,Aimee's Quick Chicken,Amazing Chicken,Amazing Pork Tenderloin in the Slow Cooker,Amish Yumazuti,Anniversary Chicken I,Army SOS Creamed Ground Beef,Artichoke Chicken,Artichoke Spinach Lasagna,Asian Orange Chicken,...,Tomato Chicken Parmesan,Turkey Tetrazzini II,Turkey Veggie Meatloaf Cups,Unbelievable Chicken,Vegetarian Chickpea Sandwich Filling,Vegetarian Korma,White Cheese Chicken Lasagna,World's Best Lasagna,Yummy Honey Chicken Kabobs,Zesty Slow Cooker Chicken Barbecue
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
559,,,,,,,,,,,...,,,,,,,,,,
2020,,,,,,,,,,,...,0.7,-0.3,,,,,0.7,,0.7,-1.3
41398,,,,0.409091,,,,,,-1.590909,...,,,,,,,,,,
45643,,,,,,-0.333333,,,,,...,,,,-0.333333,,,,,,
45946,,,-0.25,,,,,,,,...,,,,,,,,,,
47640,0.730769,,,,,,,,,,...,,,,0.730769,,,,,,
48039,0.605263,,,,-0.394737,,-1.394737,,,,...,,,,,,,,,,
51896,-1.05,,,,,,,,,-0.05,...,,,,,,,,0.95,,
53966,,,,-0.206897,,,,,,,...,,,0.793103,,,-0.206897,,,,-0.206897
54812,,-0.44,,,,,,-0.44,0.56,,...,,,,,-0.44,0.56,,,,


In [51]:
# Recipes that our user has rated

our_user_rated = norm_ratings_matrix[norm_ratings_matrix.index == our_user].dropna(axis=1, how= 'all')

# Show all the recipes this user has rated
our_user_rated 

Name,Artichoke Chicken,BBQ Pork for Sandwiches,Baked Pork Chops I,Baked Scallops,Baked Teriyaki Chicken,Baked Ziti I,Best Tuna Casserole,Cabbage Roll Casserole,Candied Kielbasa,Caramel Apple Pork Chops,...,Seven Layer Tortilla Pie,Shrimp Scampi Bake,"Slow Cooker Green Beans, Ham and Potatoes",Slow-Cooker Barbecue Ribs,Slow-Cooker Pepper Steak,Stuffed Green Peppers I,Tangy Slow Cooker Pork Roast,Torsk,World's Best Lasagna,Zucchini Patties
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,-1.25,-0.25,0.75,0.75,0.75,-1.25,0.75,0.75,-0.25,0.75,...,-1.25,-1.25,-0.25,0.75,-0.25,0.75,0.75,-0.25,-0.25,-0.25


In [24]:
# Removing the recipes above

sim_user_recipes.drop(our_user_rated.columns,axis=1, inplace= True, errors = 'ignore')

#sim_user_recipes   # shows they are gone

In [26]:
# Which recipe should we recommend to our user?

# The recipe ratings are weighted by the similarity scores, the recommended items 
#   are determined by the weighted average of user similarity score and recipe rating



# The following code iterates through the recipes and users to get the item score, 
#   ranks the score from high to low and picks the top 10 recipes.

item_scores = {}   #dict to store item scores

for i in sim_user_recipes.columns:
    recipe_rating = sim_user_recipes[i]
    total_score = 0
    num_scores = 0
    for j in sim_users.index:
        if pd.isna(recipe_rating[j]) == False:
            sum_score_rating = sim_users[j] * recipe_rating[j]
            total_score += sum_score_rating
            num_scores+=1
    item_scores[i] = total_score/num_scores
#turning dict into dataframe
item_scores = pd.DataFrame(item_scores.items(),columns=['recipe', 'recipe_score'])

ranking_recipe_scores = item_scores.sort_values(by='recipe_score', ascending=False)

top = 10

# Show top recipes
ranking_recipe_scores.head(top)

Unnamed: 0,recipe,recipe_score
61,Chicken with Asparagus and Roasted Red Peppers,0.95
90,French Dip Sandwiches,0.95
112,Linguine Pasta with Shrimp and Tomatoes,0.95
43,Cajun Chicken Pasta,0.95
47,Cheese Ravioli with Fresh Tomato and Artichoke...,0.95
154,Slow Cooker Buffalo Chicken Sandwiches,0.793103
115,Meatball Nirvana,0.793103
181,Turkey Veggie Meatloaf Cups,0.793103
67,Classic Cuban Midnight (Medianoche) Sandwich,0.793103
22,Beef Bulgogi,0.793103


### Predicting the users ratings

In [29]:
# The average rating for our user
avg_rating = ratings_matrix[ratings_matrix.index == our_user].T.mean()[our_user]

# Print the average rating
print(f'The average movie rating for user {our_user} is {avg_rating:.2f}')

# Calcuate the predicted rating
ranking_recipe_scores['pred_rating'] = ranking_recipe_scores['recipe_score'] + avg_rating

ranking_recipe_scores.head(top)

The average movie rating for user 455 is 4.25


Unnamed: 0,recipe,recipe_score,predicted_rating,pred_rating
61,Chicken with Asparagus and Roasted Red Peppers,0.95,5.2,5.2
90,French Dip Sandwiches,0.95,5.2,5.2
112,Linguine Pasta with Shrimp and Tomatoes,0.95,5.2,5.2
43,Cajun Chicken Pasta,0.95,5.2,5.2
47,Cheese Ravioli with Fresh Tomato and Artichoke...,0.95,5.2,5.2
154,Slow Cooker Buffalo Chicken Sandwiches,0.793103,5.043103,5.043103
115,Meatball Nirvana,0.793103,5.043103,5.043103
181,Turkey Veggie Meatloaf Cups,0.793103,5.043103,5.043103
67,Classic Cuban Midnight (Medianoche) Sandwich,0.793103,5.043103,5.043103
22,Beef Bulgogi,0.793103,5.043103,5.043103


In [None]:
# All of the above code - combined into one function/model ?  -- IKKE FERDIG


### Cosine similarity

Because cosine similarity does not take missing values (NaN), we must convert all NaN´s to zeros.

In [None]:
cosine_sim_matrix = cosine_similarity(norm_ratings_matrix.fillna(0))

cosine_sim_matrix

### -----Vanessa sine endringer bare over denne linjen-----

In [31]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

In [32]:

X = df.copy()
y = df['user_id']

print(y)

0          455
1         2878
2         3172
3         3698
4         3794
         ...  
50676    75878
50677    79032
50678    79777
50679    81125
50680    83519
Name: user_id, Length: 50681, dtype: int64


In [33]:
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y,random_state=42)

In [34]:

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [35]:
# Define the baseline model to always return 3.
def baseline(user_id, item_id):
    return 3.0

In [36]:
def score(cf_model):
    # List of user-item tuples from test set
    id_pairs = zip(X_test['user_id'], X_test['item_id'])

    # Predict rating for every user-item tuple
    y_pred = np.array([cf_model(user, item) for (user, item) in id_pairs])
    print(y_pred)

    #Get actual ratings
    y_true = np.array(X_test['rating'])
    print('true ratings: ')
    print(y_true)
    # Final RMSE score
    return rmse(y_true, y_pred)



In [37]:
score(baseline)

[3. 3. 3. ... 3. 3. 3.]
true ratings: 
[4. 5. 4. ... 5. 5. 5.]


1.6050046030669667

In [38]:
# BUILDING RATINGS MATRIX

ratings_matrix = X_train.pivot_table(values='rating', index='user_id', columns='item_id')

ratings_matrix.head()

item_id,4,5,17,19,22,23,28,30,31,34,...,9098,9319,9661,9757,11194,11572,11895,13339,14078,15746
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,,,,,,,5.0,,,5.0,...,,,,,,,,,,
559,,,,,,,,,,,...,,,,,,,,,,
833,,,,,,,,,,,...,,,,,,,,,,
1155,,,,,,,,,,,...,,,,,,,,,,
1299,,,,,,,,,,,...,,,,,,,,,,


In [39]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, item_id):
    
    #Check if movie_id exists in r_matrix
    if item_id in ratings_matrix:
        #Compute the mean of all the ratings given to the movie
        mean_rating = ratings_matrix[item_id].mean()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0
    
    return mean_rating

In [40]:
score(cf_user_mean)

[4.6952381  4.88636364 4.6        ... 4.67692308 4.375      4.22      ]
true ratings: 
[4. 5. 4. ... 5. 5. 5.]


0.8079141460532007

In [41]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = ratings_matrix.copy().fillna(0)

In [42]:

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [43]:
#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=ratings_matrix.index, columns=ratings_matrix.index)

cosine_sim.head(10)

user_id,455,559,833,1155,1299,1381,1537,1646,1873,2020,...,84374,84543,84570,84572,84583,84767,84778,84780,84790,84839
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,1.0,0.0,0.072052,0.057419,0.155429,0.031255,0.042177,0.0,0.0,0.088777,...,0.0,0.034862,0.053469,0.055017,0.050725,0.0,0.127414,0.066191,0.026025,0.077538
559,0.0,1.0,0.036873,0.0,0.0,0.0,0.0,0.072172,0.0,0.0,...,0.0,0.049954,0.0,0.096353,0.0,0.047423,0.056177,0.0,0.093228,0.0
833,0.072052,0.036873,1.0,0.125698,0.097366,0.027909,0.0,0.093547,0.059597,0.203605,...,0.100885,0.049807,0.0,0.078602,0.036236,0.0,0.028006,0.140273,0.130135,0.132934
1155,0.057419,0.0,0.125698,1.0,0.122952,0.030133,0.108433,0.0,0.0,0.087091,...,0.031121,0.055568,0.0,0.0,0.0,0.056723,0.050395,0.042542,0.058543,0.154037
1299,0.155429,0.0,0.097366,0.122952,1.0,0.031121,0.0,0.0,0.0,0.062032,...,0.026785,0.0,0.0,0.0,0.0,0.0,0.0,0.073229,0.0,0.037433
1381,0.031255,0.0,0.027909,0.030133,0.031121,1.0,0.0,0.0,0.055173,0.088683,...,0.043763,0.030248,0.046393,0.033149,0.055015,0.059823,0.085039,0.0,0.021169,0.022935
1537,0.042177,0.0,0.0,0.108433,0.0,0.0,1.0,0.0,0.0,0.0,...,0.011811,0.0,0.062604,0.044733,0.0,0.14531,0.117624,0.0,0.039993,0.041266
1646,0.0,0.072172,0.093547,0.0,0.0,0.0,0.0,1.0,0.0,0.052264,...,0.056418,0.0,0.07476,0.0,0.070924,0.0,0.0,0.0,0.045484,0.0
1873,0.0,0.0,0.059597,0.0,0.0,0.055173,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.043163,0.0,0.0,0.0,0.062315,0.082691,0.0
2020,0.088777,0.0,0.203605,0.087091,0.062032,0.088683,0.0,0.052264,0.0,1.0,...,0.0,0.045219,0.0,0.0,0.082244,0.057236,0.0,0.0,0.0,0.160003


In [58]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, item_id):
    
    #Check if movie_id exists in r_matrix
    if item_id in ratings_matrix:
        
        #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[user_id]
        
        #Get the user ratings for the movie in question
        m_ratings = ratings_matrix[item_id]
        
        #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
        
        #Compute the final weighted mean
        
        simScore = sim_scores.sum()
        if simScore == 0:
                simScore = 1
        wmean_rating = np.dot(sim_scores, m_ratings) / simScore
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        wmean_rating = 3.0
    
    return wmean_rating

In [59]:
cf_user_wmean(455, 90)

3.0

In [60]:
score(cf_user_wmean)

[3. 3. 3. ... 3. 3. 3.]
true ratings: 
[4. 5. 4. ... 5. 5. 5.]


1.6050046030669667