In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv

#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

# Import the cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# Read the tab-separated ratings file, naming the three columns explicitly
# and skipping malformed lines instead of failing on them.
df = pd.read_csv(
    "user-item-rating.csv",
    sep='\t',
    names=['user_id', 'item_id', 'rating'],
    on_bad_lines='skip',
)
df

Unnamed: 0,user_id,item_id,rating
0,455,50,3.0
1,455,457,4.0
2,455,28,5.0
3,455,458,3.0
4,455,459,5.0
...,...,...,...
50676,84839,131,3.0
50677,84839,109,5.0
50678,84839,145,5.0
50679,84839,133,5.0


In [24]:
# Percentage of missing values per column, relative to the number of rows.
# DataFrame.isnull() is an alias of DataFrame.isna(), so one call is enough.
print("Percentage null or na values in Dataset\n-------------------------------------")
(df.isna().sum() * 100 / len(df)).round(2)

Percentage null or na values in Dataset
-------------------------------------


user_id    0.0
item_id    0.0
rating     0.0
dtype: float64

In [25]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

In [26]:

# y holds the user ids (used as the stratification key when the data is split)
y = df['user_id']
# X is a copy of the full ratings frame, so later changes to X leave df untouched
X = df.copy()

print(y)

0          455
1          455
2          455
3          455
4          455
         ...  
50676    84839
50677    84839
50678    84839
50679    84839
50680    84839
Name: user_id, Length: 50681, dtype: int64


In [27]:
# Hold out 25% of the ratings for testing, stratified on user id,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [28]:

def rmse(y_true, y_pred):
    """Return the root mean squared error between actual and predicted values.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [29]:
def baseline(user_id, item_id):
    """Reference model: ignore both arguments and always predict a rating of 3.0."""
    constant_rating = 3.0
    return constant_rating

In [30]:
def score(cf_model):
    """Evaluate a rating model on the held-out ``X_test`` frame.

    Parameters
    ----------
    cf_model : callable
        Called as ``cf_model(user_id, item_id)`` for every row of ``X_test``;
        must return the predicted rating for that pair.

    Returns
    -------
    float
        RMSE between the ratings in ``X_test`` and the model's predictions.

    The predictions and the true ratings are also printed.
    """
    # Ask the model for a prediction for each (user, item) pair of the test set
    predictions = []
    for user, item in zip(X_test['user_id'], X_test['item_id']):
        predictions.append(cf_model(user, item))
    y_pred = np.array(predictions)
    print(y_pred)

    # Ratings actually given, in the same row order as the predictions
    y_true = np.array(X_test['rating'])
    print('true ratings: ')
    print(y_true)

    return rmse(y_true, y_pred)



In [31]:
# Evaluate the constant-3.0 baseline on the test set; the cell displays the RMSE returned by score()
score(baseline)

[3. 3. 3. ... 3. 3. 3.]
true ratings: 
[4. 4. 4. ... 3. 5. 5.]


1.6191040450231147

In [32]:
# Build the user x item ratings matrix from the training split:
# one row per user_id, one column per item_id, cell = rating
# (pairs with no rating in the training data show up as NaN).
ratings_matrix = X_train.pivot_table(
    index='user_id',
    columns='item_id',
    values='rating',
)

ratings_matrix.head()

item_id,4,5,17,19,22,23,28,30,31,34,...,9098,9319,9661,9757,11194,11572,11895,13339,14078,15746
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,,,,,,,5.0,,,5.0,...,,,,,,,,,,
559,,,,,,,,,,,...,,,,,,,,,,
833,,,,,,,,,,,...,,,,,,,,,,
1155,,,,,,,,,,,...,,,,,,,,,,
1299,,,,,,,,,,,...,,,,,,,,,,


In [33]:
def cf_user_mean(user_id, item_id):
    """Predict a rating as the plain mean of the item's ratings in ``ratings_matrix``.

    Parameters
    ----------
    user_id
        Accepted so the function matches the ``model(user_id, item_id)``
        signature; it is not used in the computation.
    item_id
        Column label of the item in ``ratings_matrix``.

    Returns
    -------
    float
        Mean of the item's column (NaN entries are skipped by ``mean()``),
        or 3.0 when the item has no column in ``ratings_matrix``.
    """
    # No column for this item: fall back to a default rating of 3.0
    if item_id not in ratings_matrix:
        return 3.0

    # Average every rating recorded for the item
    return ratings_matrix[item_id].mean()

In [34]:
# Evaluate the mean-rating model on the test set; the cell displays the RMSE returned by score()
score(cf_user_mean)

[4.71153846 4.67088608 4.30769231 ... 4.42857143 4.29473684 4.68421053]
true ratings: 
[4. 4. 4. ... 3. 5. 5.]


0.8063760052885885

In [35]:
# Zero-filled version of the ratings matrix (every NaN becomes 0).
# fillna() returns a new DataFrame, so ratings_matrix itself keeps its NaNs.
r_matrix_dummy = ratings_matrix.fillna(0)

In [36]:

# Pairwise cosine similarity between the rows (users) of the zero-filled ratings matrix
cosine_sim = cosine_similarity(X=r_matrix_dummy, Y=r_matrix_dummy)

In [37]:
# Wrap the similarity values in a DataFrame labelled by user_id on both axes,
# so similarities can be looked up by user id.
cosine_sim = pd.DataFrame(
    data=cosine_sim,
    index=ratings_matrix.index,
    columns=ratings_matrix.index,
)

cosine_sim.head(10)

user_id,455,559,833,1155,1299,1381,1537,1646,1873,2020,...,84374,84543,84570,84572,84583,84767,84778,84780,84790,84839
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
455,1.0,0.008569,0.066906,0.0,0.23862,0.031098,0.106433,0.0,0.0,0.085897,...,0.0,0.035244,0.054074,0.098218,0.049604,0.0,0.130223,0.099014,0.032229,0.021552
559,0.008569,1.0,0.015303,0.0,0.016893,0.0,0.0,0.0,0.0,0.0,...,0.0,0.052397,0.08039,0.099148,0.0,0.0,0.0,0.0,0.047914,0.02136
833,0.066906,0.015303,1.0,0.1386,0.165588,0.0,0.030412,0.094756,0.09472,0.185697,...,0.067479,0.115814,0.030902,0.099613,0.120478,0.023367,0.0,0.075344,0.115115,0.145744
1155,0.0,0.0,0.1386,1.0,0.058287,0.057825,0.054596,0.0,0.038646,0.0,...,0.0,0.056045,0.0,0.0,0.0,0.125845,0.051384,0.041405,0.024799,0.149245
1299,0.23862,0.016893,0.165588,0.058287,1.0,0.029428,0.050359,0.0,0.0,0.089128,...,0.026486,0.0,0.0,0.047811,0.0,0.068787,0.0,0.0,0.071163,0.036257
1381,0.031098,0.0,0.0,0.057825,0.029428,1.0,0.04594,0.0,0.054198,0.036588,...,0.045304,0.030426,0.03501,0.019627,0.0,0.0,0.032428,0.0,0.062601,0.0
1537,0.106433,0.0,0.030412,0.054596,0.050359,0.04594,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.058624,0.080539,0.121345,0.0,0.087606,0.033961
1646,0.0,0.0,0.094756,0.0,0.0,0.0,0.0,1.0,0.0,0.051222,...,0.0,0.049915,0.076582,0.0,0.070252,0.0,0.0,0.012194,0.0,0.0
1873,0.0,0.0,0.09472,0.038646,0.0,0.054198,0.0,0.0,1.0,0.0,...,0.0,0.0,0.045236,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020,0.085897,0.0,0.185697,0.0,0.089128,0.036588,0.0,0.051222,0.0,1.0,...,0.052689,0.044232,0.0,0.047556,0.077817,0.054736,0.0,0.0,0.0,0.157778


In [38]:
def cf_user_wmean(user_id, item_id):
    """Predict a rating as the similarity-weighted mean of the item's ratings.

    Every user who rated ``item_id`` in ``ratings_matrix`` contributes their
    rating, weighted by their cosine similarity to ``user_id`` taken from
    ``cosine_sim``.

    Parameters
    ----------
    user_id
        Label of the target user in ``cosine_sim``.
    item_id
        Column label of the item in ``ratings_matrix``.

    Returns
    -------
    float
        * the weighted mean rating when it can be computed;
        * the item's plain mean rating when the user is not present in
          ``cosine_sim`` or has zero total similarity to everyone who rated
          the item (previously this produced 0.0, or a ``KeyError`` for an
          unknown user);
        * 3.0 when the item has no recorded ratings at all.
    """
    default_rating = 3.0

    # Item has no column in the ratings matrix: nothing to base a prediction on
    if item_id not in ratings_matrix:
        return default_rating

    # Keep only the ratings that were actually given to this item
    m_ratings = ratings_matrix[item_id].dropna()
    if m_ratings.empty:
        return default_rating

    # Unweighted mean, used whenever similarities carry no information
    item_mean = m_ratings.mean()

    # User without a similarity column: weights are unavailable
    if user_id not in cosine_sim:
        return item_mean

    # Similarities between the target user and exactly those users who rated the item
    sim_scores = cosine_sim[user_id].loc[m_ratings.index]

    total_similarity = sim_scores.sum()
    if total_similarity == 0:
        # All weights are zero: a weighted mean is undefined, and dividing by a
        # substitute value would yield a rating of 0, so use the item mean instead
        return item_mean

    return np.dot(sim_scores, m_ratings) / total_similarity

In [39]:
# Evaluate the similarity-weighted model on the test set; the cell displays the RMSE returned by score()
score(cf_user_wmean)

[4.67636158 4.57767246 4.34059002 ... 4.39870768 4.29136452 4.78673074]
true ratings: 
[4. 4. 4. ... 3. 5. 5.]


0.8127061141239906