In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [2]:
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('data/u.user', sep = '|', names = user_cols)
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('data/u.data', sep = '\t', names = rating_cols)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
item_cols = ['movie_id', 'title', 'release_date','video_release_date', 'imdb_url', 'unknown', 'action', 
          'adventure', 'animation', 'child', 'comedy', 'crime', 'documentary', 'drama', 'fantasy',
          'film_noir', 'horror', 'musical', 'mystery', 'romance', 'sci_fi', 'thriller', 'war', 'western']
items = pd.read_csv('data/u.item', sep = '|', names = item_cols, encoding = 'iso8859_15')
items.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,action,adventure,animation,child,...,fantasy,film_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
print(users.shape)
print(ratings.shape)
print(items.shape)

(943, 5)
(100000, 4)
(1682, 24)


In [6]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
user_id       943 non-null int64
age           943 non-null int64
gender        943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(2), object(3)
memory usage: 36.9+ KB


In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
user_id      100000 non-null int64
movie_id     100000 non-null int64
rating       100000 non-null int64
timestamp    100000 non-null int64
dtypes: int64(4)
memory usage: 3.1 MB


In [8]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
movie_id              1682 non-null int64
title                 1682 non-null object
release_date          1681 non-null object
video_release_date    0 non-null float64
imdb_url              1679 non-null object
unknown               1682 non-null int64
action                1682 non-null int64
adventure             1682 non-null int64
animation             1682 non-null int64
child                 1682 non-null int64
comedy                1682 non-null int64
crime                 1682 non-null int64
documentary           1682 non-null int64
drama                 1682 non-null int64
fantasy               1682 non-null int64
film_noir             1682 non-null int64
horror                1682 non-null int64
musical               1682 non-null int64
mystery               1682 non-null int64
romance               1682 non-null int64
sci_fi                1682 non-null int64
thriller 

In [9]:
items = items.drop(['video_release_date'], axis = 1)
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 23 columns):
movie_id        1682 non-null int64
title           1682 non-null object
release_date    1681 non-null object
imdb_url        1679 non-null object
unknown         1682 non-null int64
action          1682 non-null int64
adventure       1682 non-null int64
animation       1682 non-null int64
child           1682 non-null int64
comedy          1682 non-null int64
crime           1682 non-null int64
documentary     1682 non-null int64
drama           1682 non-null int64
fantasy         1682 non-null int64
film_noir       1682 non-null int64
horror          1682 non-null int64
musical         1682 non-null int64
mystery         1682 non-null int64
romance         1682 non-null int64
sci_fi          1682 non-null int64
thriller        1682 non-null int64
war             1682 non-null int64
western         1682 non-null int64
dtypes: int64(20), object(3)
memory usage: 302.3+ KB


In [12]:
ratings_train = pd.read_csv('data/ua.base', sep = '\t', names = rating_cols)
ratings_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [13]:
ratings_test = pd.read_csv('data/ua.test', sep = '\t', names = rating_cols)
ratings_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,20,4,887431883
1,1,33,4,878542699
2,1,61,4,878542420
3,1,117,3,874965739
4,1,155,2,878542201


In [14]:
ratings_test_y = np.array(ratings_test['rating'] ) # storing test ratings separately
ratings_test_y

array([4, 4, 4, ..., 1, 4, 2])

In [15]:
ratings_test = ratings_test.drop(['rating'], axis = 1)

In [16]:
ratings_test.head()

Unnamed: 0,user_id,movie_id,timestamp
0,1,20,887431883
1,1,33,878542699
2,1,61,878542420
3,1,117,874965739
4,1,155,878542201


In [17]:
n_users = ratings.user_id.unique().shape[0] #number of uniqye users in ratings data
n_users

943

In [18]:
n_items = ratings.movie_id.unique().shape[0] #number of unique movies in ratings data
n_items                                         

1682

In [19]:
user_item = np.zeros((n_users,n_items))    #creating n_users * n_itens user-item matrix 
for row in ratings.itertuples():
    user_item[row[1] - 1, row[2] - 1] = row[3]

In [20]:
user_similarity = cosine_similarity(user_item)     #cosine similarity between user pairs
item_similarity = cosine_similarity(user_item.T)   #cosine similarity between item pairs

In [21]:
user_similarity

array([[1.        , 0.16693098, 0.04745954, ..., 0.14861694, 0.17950788,
        0.39817474],
       [0.16693098, 1.        , 0.11059132, ..., 0.16148478, 0.17226781,
        0.10579788],
       [0.04745954, 0.11059132, 1.        , ..., 0.10124256, 0.13341615,
        0.02655587],
       ...,
       [0.14861694, 0.16148478, 0.10124256, ..., 1.        , 0.1016418 ,
        0.09511958],
       [0.17950788, 0.17226781, 0.13341615, ..., 0.1016418 , 1.        ,
        0.18246466],
       [0.39817474, 0.10579788, 0.02655587, ..., 0.09511958, 0.18246466,
        1.        ]])

In [22]:
item_similarity

array([[1.        , 0.40238218, 0.33024479, ..., 0.        , 0.04718307,
        0.04718307],
       [0.40238218, 1.        , 0.27306918, ..., 0.        , 0.07829936,
        0.07829936],
       [0.33024479, 0.27306918, 1.        , ..., 0.        , 0.        ,
        0.09687505],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.04718307, 0.07829936, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.04718307, 0.07829936, 0.09687505, ..., 0.        , 0.        ,
        1.        ]])

In [62]:
def predict(rating_matrix, base):
    mean_user_rating = rating_matrix.sum(axis = 1) / (rating_matrix > 0).sum(axis = 1) #row wise mean
    ratings_diff = np.where(rating_matrix > 0, rating_matrix - mean_user_rating[:,np.newaxis], 0)
    if base == 'user':        
        similarity = cosine_similarity(ratings_diff)
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis = 1)]).T
    elif base == 'item':
        similarity = cosine_similarity(ratings_diff.T)        
#         pred = rating_matrix.dot(similarity)/(np.array([np.abs(similarity).sum(axis = 1)]))
        pred = rating_matrix.dot(similarity)/(np.array([similarity.sum(axis = 1)]))
    
    return pred
    

In [63]:
user_pred = predict(user_item, 'user')
item_pred = predict(user_item, 'item')

In [64]:
ratings_test.head()

Unnamed: 0,user_id,movie_id,timestamp
0,1,20,887431883
1,1,33,878542699
2,1,61,878542420
3,1,117,874965739
4,1,155,878542201


In [55]:
test_user_pred = np.array([user_pred[row[1]-1][row[2]-1] for row in ratings_test.itertuples()])

In [56]:
test_item_pred = np.array([item_pred[row[1]-1][row[2]-1] for row in ratings_test.itertuples()])

In [65]:
ratings_test_y

array([4, 4, 4, ..., 1, 4, 2])

In [66]:
test_user_pred

array([3.61823696, 3.60507223, 3.65173014, ..., 3.2943471 , 3.39409699,
       3.36903631])

In [67]:
sum(ratings_test_y == np.round(test_user_pred))/9430

0.43043478260869567