In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Root directory holding the MovieLens CSV files, plus a sub-directory for saved models.
path = "data/"
model_path = path + 'models/'
# makedirs also creates a missing parent `data/` directory (os.mkdir would raise
# FileNotFoundError there), and exist_ok=True keeps the cell safe to re-run.
os.makedirs(model_path, exist_ok=True)


In [3]:
def similarity(util, person1, person2):
    """Return the cosine similarity between the rating rows of two users.

    Parameters
    ----------
    util : numpy.ndarray or numpy.ma.MaskedArray, 2-D
        Utility matrix with one row per user. Masked cells are treated as 0.
    person1, person2 : int
        1-based row numbers of the users to compare (user ``k`` is row ``k - 1``).

    Returns
    -------
    float
        ``dot(row1, row2) / (|row1| * |row2|)``. Returns ``0.0`` when either row
        has zero length, because the cosine is undefined there and the plain
        formula would divide by zero.

    Raises
    ------
    IndexError
        If a user number is outside ``1 .. util.shape[0]``.
    """
    rows = []
    for person in (person1, person2):
        if not 1 <= person <= util.shape[0]:
            raise IndexError(
                "user number %r is outside 1..%d" % (person, util.shape[0])
            )
        # 1-based user number -> 0-based row; masked cells are filled with 0.
        row = np.ma.filled(util[person - 1:person], 0)
        rows.append(np.asarray(row, dtype=float).ravel())
    first, second = rows

    denominator = np.linalg.norm(first) * np.linalg.norm(second)
    if denominator == 0:
        return 0.0
    # float() replaces np.asscalar, which is deprecated and removed in current NumPy.
    return float(np.dot(first, second) / denominator)

In [4]:
def calculateRating(ratingMatrix, similarityMatrix, movieId=4, numberOfSimilarUsers=2):
    """Predict a rating as the similarity-weighted mean of the nearest users' ratings.

    The entries of ``similarityMatrix`` are ranked, the single largest one is
    skipped (presumably the base user compared with itself -- confirm with the
    caller), and the next ``numberOfSimilarUsers`` entries are used as weights.

    Parameters
    ----------
    ratingMatrix : numpy.ndarray, 2-D
        Ratings indexed as ``ratingMatrix[user_row, movie_column]``.
    similarityMatrix : 1-D array-like
        One similarity value per row of ``ratingMatrix``.
    movieId : int, default 4
        Column position inside ``ratingMatrix`` (it is used as an index).
    numberOfSimilarUsers : int, default 2
        How many neighbours contribute to the prediction.

    Returns
    -------
    float
        ``sum(rating * similarity) / sum(similarity)`` over the chosen neighbours,
        or ``nan`` when the similarities sum to zero (including the case where
        no neighbour is available).

    Raises
    ------
    ValueError
        If ``numberOfSimilarUsers`` is smaller than 1.
    """
    if numberOfSimilarUsers < 1:
        raise ValueError("numberOfSimilarUsers must be at least 1")

    similarities = np.asarray(similarityMatrix, dtype=float)
    # argsort is ascending: take the last (k + 1) positions and drop the very
    # last one, i.e. the entry with the highest similarity.
    neighbours = np.argsort(similarities)[-(numberOfSimilarUsers + 1):-1]

    weights = similarities[neighbours]
    total_weight = weights.sum()
    if total_weight == 0:
        # The weighted mean is undefined; avoid a division by zero.
        return np.nan
    return np.dot(weights, ratingMatrix[neighbours, movieId]) / total_weight

## Initial Setup
Let's read the ratings and the movies datasets from the MovieLens data set.

Source: https://grouplens.org/datasets/movielens/
Small: 100,000 ratings and 1,300 tag applications applied to 9,000 movies by 700 users. Last updated 10/2016.

In [5]:
# Load the MovieLens ratings table and preview its first five rows.
ratings = pd.read_csv(path + 'ratings.csv')
ratings.head(n=5)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
# Lookup table mapping each movieId to its title.
movie_names = (
    pd.read_csv(path + 'movies.csv')
    .set_index('movieId')
    ['title']
    .to_dict()
)

In [7]:
# Distinct user ids and movie ids that occur in the ratings table.
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()
# Number of distinct movies.
len(movies)

9066

In [None]:
# (Disabled) Remap the movie and user ids so that they are contiguous integers
#userid2idx = {o:i for i,o in enumerate(users)}
#movieid2idx = {o:i for i,o in enumerate(movies)}
#ratings.movieId = ratings.movieId.apply(lambda x: movieid2idx[x])
#ratings.userId = ratings.userId.apply(lambda x: userid2idx[x])

## Subset
We will work on a small subset of the data: the 15 users who rated the most movies and the 15 most-rated movies.


In [8]:
# Count the ratings given by each user and keep the 15 most active users.
g = ratings.groupby('userId')['rating'].count()
topUsers = g.sort_values(ascending=False).head(15)



In [67]:
# Show the 15 most active users and their rating counts
# (the name was misspelt as `topsers`, which raises NameError on a fresh kernel).
topUsers

userId
547    2391
564    1868
624    1735
15     1700
73     1610
452    1340
468    1291
380    1063
311    1019
30     1011
294     947
509     923
580     922
213     910
212     876
Name: rating, dtype: int64

In [70]:
# Count the ratings received by each movie and keep the 15 most-rated movies.
g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False).head(15)

In [71]:
# Show the 15 most-rated movies and their rating counts.
topMovies

movieId
356     341
296     324
318     311
593     304
260     291
480     274
2571    259
1       247
527     244
589     237
1196    234
110     228
1270    226
608     224
1198    220
Name: rating, dtype: int64

In [72]:
# Keep only the ratings given by one of the top users to one of the top movies.
# A boolean filter replaces the two inner joins against the count Series: those
# joins also appended the (unused) counts, both under the colliding name `rating_r`.
top_r = ratings[
    ratings.userId.isin(topUsers.index) & ratings.movieId.isin(topMovies.index)
]

In [73]:
# User x movie table of ratings; user/movie pairs without a rating become 0.
# The string 'sum' selects the same aggregation as np.sum without the
# FutureWarning recent pandas emits when a NumPy callable is passed as aggfunc.
utility = pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc='sum').fillna(value=0)


In [74]:
# Mask the zero placeholders so they are ignored when averaging.
utility1 = np.ma.masked_where(utility == 0, utility)
# Mean of the unmasked entries of each row, kept as a column (keepdims);
# a fully masked row is reported as 0.
averages = utility1.mean(axis=1, keepdims=True).filled(0)

In [76]:
# Show the user x movie rating table (0 = no rating).
utility

movieId,1,110,260,296,318,356,480,527,589,593,608,1196,1198,1270,2571
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
15,2.0,3.0,5.0,5.0,2.0,1.0,3.0,4.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0
30,4.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,3.0
73,5.0,4.0,4.5,5.0,5.0,5.0,4.0,5.0,3.0,4.5,4.0,5.0,5.0,5.0,4.5
212,3.0,5.0,4.0,4.0,4.5,4.0,3.0,5.0,3.0,4.0,0.0,0.0,3.0,3.0,5.0
213,3.0,2.5,5.0,0.0,0.0,2.0,5.0,0.0,4.0,2.5,2.0,5.0,3.0,3.0,4.0
294,4.0,3.0,4.0,0.0,3.0,4.0,4.0,4.0,3.0,0.0,0.0,4.0,4.5,4.0,4.5
311,3.0,3.0,4.0,3.0,4.5,5.0,4.5,5.0,4.5,2.0,4.0,3.0,4.5,4.5,4.0
380,4.0,5.0,4.0,5.0,4.0,5.0,4.0,0.0,4.0,5.0,4.0,4.0,0.0,3.0,5.0
452,3.5,4.0,4.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,2.0
468,4.0,3.0,3.5,3.5,3.5,3.0,2.5,0.0,0.0,3.0,4.0,3.0,3.5,3.0,3.0


In [57]:
# Show the per-user mean rating, one row per user.
averages

array([[ 3.86666667],
       [ 4.46666667],
       [ 4.56666667],
       [ 3.88461538],
       [ 3.41666667],
       [ 3.83333333],
       [ 3.9       ],
       [ 4.30769231],
       [ 4.16666667],
       [ 3.26923077],
       [ 4.13333333],
       [ 3.75      ],
       [ 3.92857143],
       [ 3.96666667],
       [ 4.16666667]])

In [58]:
# Subtract each user's mean rating from that user's row; masked cells stay masked.
intermediate = utility1 - averages

In [59]:
# Similarity of the base user to every user row (its own row included), in row order.
baseUser = 4  # 1-based row number, the convention similarity() expects
count = intermediate.shape[0]
out = np.fromiter(
    (similarity(intermediate, baseUser, other) for other in range(1, count + 1)),
    dtype=float,
    count=count,
)

In [60]:
# Spot check: similarity between user rows 4 and 1 (1-based row numbers).
similarity(intermediate, 4, 1)

0.03734940533550573

In [78]:
# Centred ratings in 0-based row 3 (1-based user 4), with masked cells filled with 0.
np.ma.filled(intermediate[3:4,],0)

array([[-0.88461538,  1.11538462,  0.11538462,  0.11538462,  0.61538462,
         0.11538462, -0.88461538,  1.11538462, -0.88461538,  0.11538462,
         0.        ,  0.        , -0.88461538, -0.88461538,  1.11538462]])

In [62]:
# Plain ndarray of the utility table (zeros still stand for "no rating").
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0; `.values`
# returns the same array on both old and new pandas.
utility2 = utility.values

In [63]:
# Predict a rating for column 9 of utility2 (a column position, not a MovieLens id)
# from the 2 users ranked just below the highest similarity in `out`.
calculateRating(utility2, out, movieId=9, numberOfSimilarUsers=2)

4.4729406079615543