In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import math

Dataset: MovieLens 100K ratings — https://grouplens.org/datasets/movielens/100k/

In [2]:
# u1.base is tab-separated: user id | item id | rating | timestamp.
# Only the first three fields are read (usecols), so the timestamp is dropped.
col_names = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv(
    'ml-100k//u1.base',
    sep='\t',
    names=col_names,
    usecols=[0, 1, 2],
)

In [3]:
# Preview the first rows to check that the three columns were parsed as expected.
ratings.head()

Unnamed: 0,user_id,item_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [4]:
# The feature matrix has one row per user and one column per item.
# Its dimensions are taken from the largest ID seen in each column.
n_users = ratings['user_id'].max()
n_items = ratings['item_id'].max()
s = (n_users, n_items)
print(s)

# Start from an all-zero matrix; cells are overwritten with ratings later.
feature_matrix = np.zeros(s)

(943, 1682)


In [5]:
# Filling values in the feature matrix.
# IDs start from 1 while array indices start from 0, hence the -1.
# A single fancy-index assignment replaces the row-by-row iterrows() loop,
# which built a pandas Series for every rating and was needlessly slow.
# NOTE(review): assumes each (user_id, item_id) pair occurs at most once in
# `ratings`; with duplicates the surviving value is not guaranteed — confirm.
user_idx = ratings['user_id'].values - 1
item_idx = ratings['item_id'].values - 1
feature_matrix[user_idx, item_idx] = ratings['rating'].values

In [6]:
# Show the first five user rows; entries never assigned a rating keep their initial 0.
feature_matrix[:5]

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
def normalize_user_ratings(user_rating):
    """Normalize one user's rating vector by that user's mean and variance.

    Parameters
    ----------
    user_rating : np.ndarray
        One row of the feature matrix. Entries > 0 are counted as rated
        items; every other entry is treated as "not rated".

    Returns
    -------
    mean : float
        Sum of the row divided by the number of rated items
        (0.0 when the user has no rated items).
    variance : float
        Mean of the squared ratings minus the squared mean, both taken over
        the rated items (0.0 when the user has no rated items).
    normalized : np.ndarray
        ``(user_rating - mean) / variance``. When the variance is exactly 0
        the row is only mean-centered, and when the user has no rated items
        an all-zero row is returned, so no NaN/inf is ever produced.
    """
    # Number of movies rated by the user (non-zero ratings)
    rated_count = (user_rating > 0).sum()
    if rated_count == 0:
        # Nothing to average: avoid 0/0 and return a neutral row.
        return 0.0, 0.0, np.zeros_like(user_rating, dtype=float)

    # Average rating given by the user (unrated zeros add nothing to the sum)
    mean = user_rating.sum() / rated_count

    # Variance via E[r^2] - E[r]^2 over the rated items
    variance = np.square(user_rating).sum() / rated_count - mean ** 2

    # A user who gave every movie the same rating has variance 0; dividing by
    # it would yield NaN/inf, so fall back to a unit scale in that case.
    scale = variance if variance != 0 else 1.0

    # NOTE(review): this divides by the variance, not the standard deviation,
    # and the mean is subtracted from unrated (0) entries as well, so they
    # become negative. Both are kept as-is because later cells may rely on
    # them — confirm that this is the intended normalization.
    return mean, variance, (user_rating - mean) / scale


user_movie_normalized = []
u_mean = []
u_variance = []
for user_rating in feature_matrix:
    user_mean_rat, user_variance_rat, user_rat_norm = normalize_user_ratings(user_rating)
    u_mean.append(user_mean_rat)
    u_variance.append(user_variance_rat)
    user_movie_normalized.append(user_rat_norm)

In [8]:
# Inspect the normalized vectors of the first five users.
# Unrated entries are no longer 0 here because the user mean was subtracted from every column.
user_movie_normalized[:5]

[array([ 0.8116598 , -0.41950956,  0.19607512, ..., -2.2662636 ,
        -2.2662636 , -2.2662636 ]),
 array([ 0.20833333, -3.95833333, -3.95833333, ..., -3.95833333,
        -3.95833333, -3.95833333]),
 array([-1.82608696, -1.82608696, -1.82608696, ..., -1.82608696,
        -1.82608696, -1.82608696]),
 array([-4.61621622, -4.61621622, -4.61621622, ..., -4.61621622,
        -4.61621622, -4.61621622]),
 array([-1.5299375, -1.5299375, -1.5299375, ..., -1.5299375, -1.5299375,
        -1.5299375])]