### All imports

In [10]:
# Data processing
import pandas as pd
import numpy as np
import csv

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns


#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the (user, recipe, rating) triples from the tab-separated ratings file.
# Column names are supplied explicitly; malformed lines are skipped instead of raising.
user_item_rating = pd.read_csv(
    "user-item-rating.csv",
    sep='\t',
    names=['user_id', 'item_id', 'rating'],
    on_bad_lines='skip',
)
user_item_rating

Unnamed: 0,user_id,item_id,rating
0,455,50,3.0
1,455,457,4.0
2,455,28,5.0
3,455,458,3.0
4,455,459,5.0
...,...,...,...
50676,84839,131,3.0
50677,84839,109,5.0
50678,84839,145,5.0
50679,84839,133,5.0


In [3]:
# Summarise the ratings frame: column dtypes, non-null counts and memory usage
user_item_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50681 entries, 0 to 50680
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   user_id  50681 non-null  int64  
 1   item_id  50681 non-null  int64  
 2   rating   50681 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.2 MB


In [4]:
# Percentage of missing values per column.
# DataFrame.isnull is an alias of DataFrame.isna, so one call covers both.

print("Percentage null or na values in Dataset\n-------------------------------------")
(user_item_rating.isna().sum() * 100 / len(user_item_rating)).round(2)

Percentage null or na values in Dataset
-------------------------------------


user_id    0.0
item_id    0.0
rating     0.0
dtype: float64

In [5]:
# Not essential, but gives a quick overview of the dataset:
# number of distinct users, rated recipes and rating values.
for label, column in [
    ('The number of users in the dataset:', 'user_id'),
    ('The number of recipies that are rated:', 'item_id'),
    ('The number of different ratings in the dataset:', 'rating'),
]:
    print(label, user_item_rating[column].nunique())

# The distinct rating values themselves, in ascending order
print('The unique ratings are:', sorted(user_item_rating['rating'].unique()))

The number of users in the dataset: 1273
The number of recipies that are rated: 1031
The number of different ratings in the dataset: 5
The unique ratings are: [1.0, 2.0, 3.0, 4.0, 5.0]


In [38]:
# Read "item-profiles2.csv" (semicolon-separated) to get the recipe names.
# Malformed lines are skipped instead of raising.
item_profiles2 = pd.read_csv(
    "item-profiles2.csv",
    sep=';',
    on_bad_lines='skip',
)

item_profiles2

Unnamed: 0,Recipe ID,Name,Fiber (g),Sodium (g),Carbohydrates (g),Fat (g),Protein (g),Sugar (g),Saturated Fat (g),Size (g),Servings,Calories (kCal),Average Rating,Average Sentiment,Number of Ratings,Number of Bookmarks,Year of Publishing
0,2622,Slow Cooker Tender and Yummy Round Steak,4.5,0.83,33.1,13.6,33.8,5.0,4.6,2599.35,6.0,393.0,4.32,1.79,81,2271,2000
1,722,Chicken Pot Pie II,6.2,1.06,47.8,29.5,51.4,6.4,11.4,2137.86,4.0,666.0,4.66,2.02,116,1200,2000
2,1137,Chicken in a Pot,1.0,0.40,6.9,6.6,28.7,1.8,1.4,819.37,4.0,206.0,4.29,1.98,83,1779,2001
3,2502,Erin's Indonesian Chicken,6.4,0.32,58.1,18.6,35.4,7.8,3.8,1972.13,4.0,530.0,4.39,1.99,80,872,2005
4,2714,Bubble Pizza,3.0,1.96,45.4,36.4,28.5,8.7,13.4,2375.00,8.0,624.0,4.35,1.96,117,2204,2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,1353,Beef and Biscuit,0.9,0.63,15.5,18.9,18.4,3.7,8.6,1536.45,10.0,306.0,3.99,1.52,79,811,2001
1027,987,Creamy Pesto Shrimp,2.7,0.44,43.0,42.5,23.1,0.2,24.3,1655.88,8.0,646.0,4.58,1.98,225,3134,2000
1028,2903,Boiled Chicken,1.2,0.06,4.5,11.1,16.3,1.9,3.0,1780.00,8.0,186.0,4.70,1.20,74,1083,2001
1029,2136,Mushroom Sauce Baked Pork Chops,2.5,1.11,28.1,14.3,19.0,6.5,6.1,2241.75,6.0,316.0,4.45,1.86,121,1188,2001


The dataframe shown above has several columns that are of no use to us; we only want the columns "Recipe ID" and "Name". 

In [7]:
# Keep only the recipe identifier and its name; .copy() makes the result
# an independent frame rather than a selection of item_profiles2.
recipe_names = item_profiles2.loc[:, ['Recipe ID', 'Name']].copy()
recipe_names.head()

Unnamed: 0,Recipe ID,Name
0,2622,Slow Cooker Tender and Yummy Round Steak
1,722,Chicken Pot Pie II
2,1137,Chicken in a Pot
3,2502,Erin's Indonesian Chicken
4,2714,Bubble Pizza


Now that we have extracted only the columns we need, we can merge this dataframe with "user_item_rating", our other dataframe, which contains the user id, recipe id and rating. The merge key is the recipe id, so before merging we rename the column "Recipe ID" to "item_id" to match the column in "user_item_rating". 

In [8]:
# Rename "Recipe ID" to "item_id" so the key column matches user_item_rating.
# Rebinding the name (instead of inplace=True) keeps the cell safe to re-run.
recipe_names = recipe_names.rename(columns={'Recipe ID': 'item_id'})
recipe_names.head()

Unnamed: 0,item_id,Name
0,2622,Slow Cooker Tender and Yummy Round Steak
1,722,Chicken Pot Pie II
2,1137,Chicken in a Pot
3,2502,Erin's Indonesian Chicken
4,2714,Bubble Pizza


In [9]:
# Attach the recipe name to every rating; the inner join keeps only ratings
# whose item_id also appears in recipe_names.
df = pd.merge(user_item_rating, recipe_names, on='item_id', how='inner')
df.head()

Unnamed: 0,user_id,item_id,rating,Name
0,455,50,3.0,Baked Ziti I
1,2878,50,2.0,Baked Ziti I
2,3172,50,5.0,Baked Ziti I
3,3698,50,5.0,Baked Ziti I
4,3794,50,5.0,Baked Ziti I


In [39]:
# Check whether any user has given very few ratings, and compute each
# user's mean rating along the way.

# One row per user: mean of the user's ratings and how many ratings they gave
aggregate_by_user = (
    df.groupby('user_id')['rating']
    .agg(mean_rating='mean', number_of_ratings='count')
    .reset_index()
)

# Sorted ascending by number of ratings, so the first rows show the
# smallest number of recipes any user has rated
aggregate_by_user.sort_values(by='number_of_ratings', ascending=True)


Unnamed: 0,user_id,mean_rating,number_of_ratings
260,22109,4.550000,20
518,41530,4.600000,20
1085,75525,4.450000,20
1082,75380,5.000000,20
760,57892,3.950000,20
...,...,...,...
597,48719,4.750000,236
1142,78714,4.135802,243
320,25694,4.657258,248
1094,76151,3.771331,293


In [42]:

# One row per recipe: mean of the recipe's ratings and how many ratings it received
aggregate_by_recipe = (
    df.groupby('item_id')['rating']
    .agg(mean_rating='mean', number_of_ratings='count')
    .reset_index()
)

# Sorted ascending by number of ratings: the first rows show the least-rated
# recipes. Leaving out .head() also shows the most-rated recipes at the bottom.
aggregate_by_recipe.sort_values(by='number_of_ratings', ascending=True)

Unnamed: 0,item_id,mean_rating,number_of_ratings
1030,15746,4.450000,20
654,1976,4.600000,20
657,2006,4.100000,20
680,2118,4.500000,20
685,2152,4.450000,20
...,...,...,...
64,134,4.600000,290
68,140,4.429553,291
21,50,4.594156,308
66,137,4.651090,321


Seeing that no recipe has been rated fewer than 20 times, and none of our users has given fewer than 20 ratings, we decide that there is no reason to remove any data. 

In [46]:
# Recipes ordered by mean rating: the best-rated recipes come first,
# the worst-rated recipes last.
aggregate_by_recipe.sort_values(by='mean_rating', ascending=False)

Unnamed: 0,item_id,mean_rating,number_of_ratings
397,956,4.960000,25
245,572,4.954545,22
70,143,4.887097,62
842,3437,4.882353,34
704,2312,4.869565,23
...,...,...,...
168,372,3.583333,36
483,1247,3.553846,65
727,2534,3.428571,28
448,1108,3.393939,33


In [53]:
# Not essential: compares three different ways of averaging the ratings.

df_ratings_mean = df['rating'].mean()                      # mean over all individual ratings
df_mean_users = aggregate_by_user['mean_rating'].mean()    # mean of the per-user means
df_mean_items = aggregate_by_recipe['mean_rating'].mean()  # mean of the per-recipe means
print(
    'Mean of all the users own means: ', df_mean_users,
    '\n' 'Mean of ratings from "normal" df:', df_ratings_mean,
    '\n' 'Mean of each recipes own mean:', df_mean_items,
)

Mean of all the users own means:  4.404332547821255 
Mean of ratings from "normal" df: 4.388054695053373 
Mean of each recipes own mean: 4.352838067432242


### -----Vanessa sine endringer bare over denne linjen-----

In [None]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

In [None]:

# Labels used for stratifying the split: the user who gave each rating
y = df['user_id']
# Work on a copy so the merged frame `df` itself stays untouched
X = df.copy()

print(y)

In [None]:
# Hold out 25 % of the ratings as a test set. Stratifying on user_id splits
# every user's ratings in roughly the same proportion; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [None]:

# Root mean squared error (RMSE) between true and predicted ratings
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error of the predictions.

    Both arguments are forwarded unchanged to ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Baseline model: ignores its inputs and always predicts 3.0.
def baseline(user_id, item_id):
    """Return the constant prediction 3.0 for any (user, recipe) pair.

    Neither argument is used; they are accepted so the function has the
    two-argument call shape that ``score`` expects of a model.
    """
    constant_rating = 3.0
    return constant_rating

In [None]:
def score(cf_model):
    """Evaluate a rating model on the held-out test set and return its RMSE.

    ``cf_model`` is called as ``cf_model(user_id, item_id)`` once for every
    row of the module-level ``X_test`` frame. The predictions and the true
    ratings are printed before the RMSE is returned.
    """
    # One prediction per (user, recipe) pair in the test set
    predictions = np.array([
        cf_model(uid, iid)
        for uid, iid in zip(X_test['user_id'], X_test['item_id'])
    ])
    print(predictions)

    # Ratings the users actually gave
    actual = np.array(X_test['rating'])
    print('true ratings: ')
    print(actual)

    # Final RMSE score
    return rmse(actual, predictions)



In [None]:
# RMSE of the constant baseline on the test set; reference value for the other models
score(baseline)

In [None]:
# BUILDING RATINGS MATRIX
# Rows are users, columns are recipes, and each cell holds the training rating
# (NaN where the user has no rating for that recipe in the training split).

ratings_matrix = X_train.pivot_table(index='user_id', columns='item_id', values='rating')

ratings_matrix.head()

In [None]:
# Collaborative filter using the plain mean of the ratings a recipe has received
def cf_user_mean(user_id, item_id):
    """Predict a rating as the mean of all training ratings given to the recipe.

    ``user_id`` is not used: every user gets the same prediction for a given
    recipe. If ``item_id`` is not a column of the module-level
    ``ratings_matrix``, the default rating 3.0 is returned.
    """
    # Recipe absent from the training matrix: no information, use the default
    if item_id not in ratings_matrix:
        return 3.0

    # Mean over the users who rated the recipe (NaN entries are skipped by .mean())
    return ratings_matrix[item_id].mean()

In [None]:
# RMSE of the mean-rating model on the test set
score(cf_user_mean)

In [None]:
# Dummy ratings matrix with every missing rating imputed as 0.
# fillna returns a new frame, so ratings_matrix keeps its NaN entries.
r_matrix_dummy = ratings_matrix.fillna(0)

In [None]:

# Pairwise cosine similarity between the user rows of the dummy ratings matrix.
# With a single argument, cosine_similarity compares the rows of X with each other.
cosine_sim = cosine_similarity(r_matrix_dummy)

In [None]:
# Wrap the similarity array in a DataFrame labelled by user_id on both axes,
# so similarities can be looked up by user id.
cosine_sim = pd.DataFrame(
    data=cosine_sim,
    columns=ratings_matrix.index,
    index=ratings_matrix.index,
)

cosine_sim.head(10)

In [None]:
# User-based collaborative filter using similarity-weighted mean ratings
def cf_user_wmean(user_id, item_id):
    """Predict a rating as the similarity-weighted mean of other users' ratings.

    Reads the module-level ``ratings_matrix`` (users x recipes, NaN where a
    user has not rated a recipe) and ``cosine_sim`` (user x user similarity,
    labelled by user id on both axes).

    Parameters
    ----------
    user_id : label of the user to predict for.
    item_id : label of the recipe to predict.

    Returns
    -------
    float
        * 3.0 if the recipe is not a column of ``ratings_matrix`` (or has no
          ratings at all);
        * the plain mean of the recipe's ratings if the user is not in
          ``cosine_sim``, or if the similarities to every user who rated
          the recipe sum to zero;
        * otherwise ``sum(sim * rating) / sum(sim)`` over the users who rated
          the recipe.
    """
    # Default used when nothing at all is known about the recipe
    default_rating = 3.0

    # Unknown recipe: it is not a column of the ratings matrix
    if item_id not in ratings_matrix:
        return default_rating

    # Ratings given to this recipe, restricted to the users who rated it
    item_ratings = ratings_matrix[item_id].dropna()
    if item_ratings.empty:
        return default_rating

    # Unknown user: no similarity scores exist, so use the unweighted mean
    # instead of failing with a KeyError on the lookup below.
    if user_id not in cosine_sim:
        return item_ratings.mean()

    # Similarity between this user and each user who rated the recipe
    sim_scores = cosine_sim[user_id].loc[item_ratings.index]

    total_similarity = sim_scores.sum()
    if total_similarity == 0:
        # No user similar to this one has rated the recipe, so the weighted
        # mean is undefined. Dividing by a substitute denominator of 1 would
        # give a prediction of 0.0, which is not a valid rating; use the
        # unweighted mean of the available ratings instead.
        return item_ratings.mean()

    # Weighted mean of the ratings, with the similarities as weights
    return np.dot(sim_scores, item_ratings) / total_similarity

In [None]:
# RMSE of the similarity-weighted model on the test set
score(cf_user_wmean)