In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import math

DATASET: MovieLens 100K Ratings (https://grouplens.org/datasets/movielens/100k/)

In [2]:
# u1.base is a tab-separated list of: user id | item id | rating | timestamp.
# Only the first three fields are loaded; the timestamp column is not read.
col_names = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv(
    'ml-100k//u1.base',
    sep='\t',
    names=col_names,
    usecols=[0, 1, 2],
)

In [3]:
# Preview the first five rows of the loaded ratings.
ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [4]:
# The feature matrix has one row per user and one column per item. Ids are
# 1-based, so the largest id seen in each column gives the required size.
n_users = ratings['user_id'].max()
n_items = ratings['item_id'].max()
s = (n_users, n_items)
print(s)
# Start from all zeros; later cells treat a zero entry as "not rated".
feature_matrix = np.zeros(s)

(943, 1682)


In [5]:
# Filling values in the feature matrix.
# Ids start at 1 while array indices start at 0, hence the -1.
# One fancy-indexed assignment writes every (user, item) rating at once,
# instead of walking the frame row by row with iterrows(), which is slow and
# would hand back float indices if any column were not integer-typed.
# Assumes each (user, item) pair appears at most once in `ratings`.
user_idx = ratings['user_id'].values - 1
item_idx = ratings['item_id'].values - 1
feature_matrix[user_idx, item_idx] = ratings['rating'].values

In [6]:
# Ratings rows of the first five users.
feature_matrix[:5, :]

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
# Per-user statistics and normalized ratings.
# For each user (one row of feature_matrix) this cell records:
#   u_mean[i]                - mean of the user's non-zero ratings
#   u_variance[i]            - variance of those ratings, E[r^2] - mean^2
#   user_movie_normalized[i] - (row - mean) / variance, evaluated for every
#                              entry of the row (unrated zeros included)
# Note that the scaling factor is the variance itself, not its square root.
user_movie_normalized = []
u_mean = []
u_variance = []
for user_rating in feature_matrix:

    # Number of movies rated by the user (non-zero ratings)
    rated_count = (user_rating > 0).sum()

    if rated_count == 0:
        # A user without any rating has no defined mean or variance (0 / 0).
        # Record neutral values instead of NaN so later cells stay finite.
        u_mean.append(0.0)
        u_variance.append(0.0)
        user_movie_normalized.append(np.zeros_like(user_rating))
        continue

    # Average rating given by the user; unrated entries are 0 and therefore
    # add nothing to the sum.
    rating_mean = user_rating.sum() / rated_count
    u_mean.append(rating_mean)

    # Variance of the user's ratings: mean of squares minus squared mean.
    rating_variance = (np.square(user_rating).sum() / rated_count) - (rating_mean ** 2)
    u_variance.append(rating_variance)

    # A user who gave every movie the same rating has zero variance; dividing
    # by it would produce NaN/inf that spreads into every similarity involving
    # this user. Scale by 1 in that case so the rated entries normalize to 0.
    scale = rating_variance if rating_variance > 0 else 1.0

    # Normalized ratings for the user.
    user_movie_normalized.append((user_rating - rating_mean) / scale)

In [8]:
# Normalized rating vectors of the first five users.
user_movie_normalized[0:5]

[array([ 0.8116598 , -0.41950956,  0.19607512, ..., -2.2662636 ,
        -2.2662636 , -2.2662636 ]),
 array([ 0.20833333, -3.95833333, -3.95833333, ..., -3.95833333,
        -3.95833333, -3.95833333]),
 array([-1.82608696, -1.82608696, -1.82608696, ..., -1.82608696,
        -1.82608696, -1.82608696]),
 array([-4.61621622, -4.61621622, -4.61621622, ..., -4.61621622,
        -4.61621622, -4.61621622]),
 array([-1.5299375, -1.5299375, -1.5299375, ..., -1.5299375, -1.5299375,
        -1.5299375])]

In [9]:
# Calculating the user-to-user similarity.
#   user_similarity[u, v] = (sum, over the movies rated by both u and v, of
#                            normalized[u, m] * normalized[v, m])
#                           / (number of movies rated by both)
# Pairs with no movie in common, and the diagonal (u == v), are left at 0.
#
# The sums and the counts for all pairs come from two matrix products instead
# of an explicit Python loop over every (u, v) pair. The matrix is sized from
# feature_matrix, so it does not depend on what `ratings` currently holds.
normalized = np.asarray(user_movie_normalized)

# True where the user rated the movie. Assumes ratings are positive and 0
# marks "not rated", in which case this matches testing that the product of
# two users' raw ratings is > 0.
rated = feature_matrix > 0

# Zero out unrated entries so they contribute nothing to the pairwise sums.
rated_normalized = np.where(rated, normalized, 0.0)
rated_as_float = rated.astype(float)

# num_common_movies[u, v]: number of movies rated by both u and v
num_common_movies = rated_as_float.dot(rated_as_float.T)
# sum_prod_norm_rat[u, v]: sum of the products of the normalized ratings of
# u and v over their common movies
sum_prod_norm_rat = rated_normalized.dot(rated_normalized.T)

# Divide only where at least one common movie exists; other entries stay 0.
user_similarity = np.zeros_like(sum_prod_norm_rat)
np.divide(sum_prod_norm_rat, num_common_movies,
          out=user_similarity, where=num_common_movies > 0)
# A user's similarity with themself is not used; keep it at 0.
np.fill_diagonal(user_similarity, 0.0)

In [10]:
# Similarity rows of the first five users.
user_similarity[:5, :]

array([[ 0.        ,  0.22451108,  0.1646846 , ...,  0.42424583,
        -0.23412292,  0.15863642],
       [ 0.22451108,  0.        ,  0.08454106, ...,  0.03686327,
         0.26604887,  0.41697591],
       [ 0.1646846 ,  0.08454106,  0.        , ...,  0.39491782,
         0.45902404,  0.22769103],
       [ 0.39165728, -0.21283784,  0.        , ...,  2.18262445,
         0.57717678, -0.3396904 ],
       [ 0.08946611,  0.00473958,  0.64392391, ...,  0.63157934,
         0.20304425,  0.18622065]])

In [11]:
def predict_rating(user_id, movie_id, u_mean, u_variance, user_movie_normalized, user_similarity, feature_matrix):
    """Predict the rating of one user for one movie.

    Every user with a non-zero entry for the movie in ``feature_matrix``
    contributes their normalized rating, weighted by their similarity to the
    target user. The average of these contributions is scaled by the target
    user's variance and added to the target user's mean.

    Parameters
    ----------
    user_id : int
        0-based row index of the target user.
    movie_id : int
        0-based column index of the movie.
    u_mean, u_variance : sequence
        Per-user mean and variance, indexed by ``user_id``.
    user_movie_normalized : sequence of arrays
        Normalized ratings, indexed as ``[user][movie]``.
    user_similarity : 2-D array
        Similarities, indexed as ``[user_id][other_user]``.
    feature_matrix : 2-D array
        Raw ratings, indexed as ``[user][movie]``; 0 means no rating.

    Returns
    -------
    number
        0 when no user has rated the movie; otherwise the prediction, replaced
        by 0 when it is below 0 and by 5 when it is above 5.
    """
    # Users that have a recorded (non-zero) rating for this movie.
    raters = [
        other
        for other in range(len(user_similarity))
        if not feature_matrix[other][movie_id] == 0
    ]

    # None of the users has rated the movie.
    if len(raters) == 0:
        return 0

    # Sum of the raters' normalized ratings, weighted by similarity to the user.
    weighted_total = 0.0
    for other in raters:
        weighted_total += (user_similarity[user_id][other] * user_movie_normalized[other][movie_id])

    # Average contribution per rater, mapped back to the user's rating scale.
    weighted_rating = weighted_total / len(raters)
    estimate = u_mean[user_id] + (weighted_rating * u_variance[user_id])

    # Keep the prediction inside the 0..5 range.
    if estimate < 0:
        return 0
    if estimate > 5:
        return 5
    return estimate

In [12]:
# Checking on the test set.
# u1.test is read with the same layout as the training file: tab-separated,
# keeping only the first three columns (user id, item id, rating).
# NOTE: this rebinds `ratings` from the training split to the test split, so
# re-running an earlier cell that reads `ratings` after this one would operate
# on the test data.
col_names = ['user_id', 'item_id', 'rating']
# Forward slash rather than a backslash separator, so the path also resolves
# on non-Windows systems.
ratings = pd.read_csv('ml-100k/u1.test',
                         sep='\t', names=col_names, usecols=range(3))

In [13]:
# Predict a rating for every (user, item) pair of the test set.
# Ids are 1-based in the file and 0-based in the matrices, hence the -1.
predicted_rating = np.array([
    predict_rating(user - 1, item - 1, u_mean, u_variance,
                   user_movie_normalized, user_similarity, feature_matrix)
    for user, item in zip(ratings['user_id'], ratings['item_id'])
])
# Actual ratings, in the same row order as the predictions.
actual_rating = np.array(ratings['rating'])

In [14]:
# Root-mean-square error between predicted and actual test ratings.
squared_errors = np.square(predicted_rating - actual_rating)
mean_squared_error = sum(squared_errors) / len(predicted_rating)
rms = math.sqrt(mean_squared_error)
print(rms)

1.0265019301563407


In [15]:
# Rating for user 2 (index 2-1) and movie 21 (index 21-1)
example_user_index = 2 - 1
example_movie_index = 21 - 1
predict_rating(example_user_index, example_movie_index, u_mean, u_variance, user_movie_normalized, user_similarity, feature_matrix)

3.6043684332196193