# Collaborative Filtering: User Based Recommendations

In [38]:
import pandas as pd
import numpy as np

### Load Movies Data

In [39]:
movies = pd.read_csv('movie_genres_final.csv')

In [40]:
movies.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,The Shawshank Redemption,01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [41]:
movies.shape

(1682, 24)

In [42]:
movies.columns

Index(['movie_id', 'movie title', 'release date', 'video release date',
       'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [43]:
movies = movies[['movie_id', 'movie title']]

In [44]:
movies.head()

Unnamed: 0,movie_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,The Shawshank Redemption
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [45]:
movies.shape

(1682, 2)

## Load Ratings Data

In [46]:
ratings = pd.read_csv('ratings.csv')

In [47]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [48]:
ratings.shape

(100000, 4)

In [49]:
ratings.drop(['unix_timestamp'], axis=1, inplace=True)

In [50]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [51]:
ratings.describe()

Unnamed: 0,user_id,movie_id,rating
count,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986
std,266.61442,330.798356,1.125674
min,1.0,1.0,1.0
25%,254.0,175.0,3.0
50%,447.0,322.0,4.0
75%,682.0,631.0,4.0
max,943.0,1682.0,5.0


In [52]:
n_users = ratings.user_id.nunique()

n_items = ratings.movie_id.nunique()

In [53]:
n_users

943

In [54]:
n_items

1682

## Dividing the dataset into train and test

In [55]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(ratings, test_size=0.30, random_state=31)

In [56]:
print(train.shape)
print(test.shape)

(70000, 3)
(30000, 3)


In [57]:
ratings.shape

(100000, 3)

In [58]:
train.head()

Unnamed: 0,user_id,movie_id,rating
68796,429,200,3
1933,29,332,4
95396,901,211,4
75818,593,405,3
19356,453,246,5


In [59]:
test.head()

Unnamed: 0,user_id,movie_id,rating
25988,378,496,3
56260,186,820,2
44918,588,781,2
73041,761,275,4
11692,76,960,3


## Create empty data matrix: user*movie

In [60]:
data_matrix = np.zeros((n_users, n_items))


In [61]:
data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [62]:
data_matrix.shape   

(943, 1682)

## Fill user*movie Train matrix with rating values

In [63]:
for line in train.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [64]:
data_matrix

array([[0., 3., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [65]:
data_matrix.shape

(943, 1682)

## Create Data Matrix with Test Data

In [66]:
data_matrix_test = np.zeros((n_users, n_items))

In [67]:
data_matrix_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [68]:
for line in test.itertuples():
    data_matrix_test[line[1]-1, line[2]-1] = line[3]

In [69]:
data_matrix_test.shape

(943, 1682)

In [70]:
data_matrix_test

array([[5., 0., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Create Data Matrix with full data

In [71]:
data_matrix_full = np.zeros((n_users, n_items))

In [72]:
for line in ratings.itertuples():
    data_matrix_full[line[1]-1, line[2]-1] = line[3]

In [73]:
data_matrix_full

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [74]:
data_matrix_full.shape

(943, 1682)

# Pairwise Distance

In [75]:
from sklearn.metrics.pairwise import pairwise_distances 

In [76]:
user_similarity = 1- pairwise_distances(data_matrix, metric='cosine')

In [77]:
user_similarity

array([[1.        , 0.09148626, 0.00635462, ..., 0.09022171, 0.14281216,
        0.2873638 ],
       [0.09148626, 1.        , 0.04769444, ..., 0.08312922, 0.12820962,
        0.09161472],
       [0.00635462, 0.04769444, 1.        , ..., 0.03055483, 0.08078487,
        0.        ],
       ...,
       [0.09022171, 0.08312922, 0.03055483, ..., 1.        , 0.08995831,
        0.12271921],
       [0.14281216, 0.12820962, 0.08078487, ..., 0.08995831, 1.        ,
        0.1642334 ],
       [0.2873638 , 0.09161472, 0.        , ..., 0.12271921, 0.1642334 ,
        1.        ]])

In [78]:
user_similarity.shape

(943, 943)

In [79]:
data_matrix.shape

(943, 1682)

In [80]:
data_matrix.T

array([[0., 4., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 5.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [81]:
data_matrix.T.shape

(1682, 943)

## Movies pairwise similarity distance

In [82]:
item_similarity = 1-pairwise_distances(data_matrix.T, metric='cosine')

In [83]:
item_similarity

array([[1.        , 0.26059722, 0.20639028, ..., 0.        , 0.        ,
        0.05646699],
       [0.26059722, 1.        , 0.15588599, ..., 0.        , 0.        ,
        0.09184152],
       [0.20639028, 0.15588599, 1.        , ..., 0.        , 0.        ,
        0.11572751],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.05646699, 0.09184152, 0.11572751, ..., 0.        , 0.        ,
        1.        ]])

In [84]:
item_similarity.shape

(1682, 1682)

## Dot product of Data Matrix with similarity matrix

In [85]:
item_prediction = np.dot(user_similarity, data_matrix_test)

In [86]:
item_prediction

array([[115.4914122 ,  35.10515846,  22.91211018, ...,   0.        ,
          0.94057039,   0.        ],
       [ 73.7527643 ,  15.29083028,  11.89784905, ...,   0.        ,
          0.41068318,   0.        ],
       [ 28.7384985 ,   6.78198001,   4.745609  , ...,   0.        ,
          0.26058737,   0.        ],
       ...,
       [ 69.47986613,  12.23076401,   9.25426779, ...,   0.        ,
          0.41005349,   0.        ],
       [ 75.2248144 ,  20.56954557,  11.40065483, ...,   0.        ,
          0.60628368,   0.        ],
       [105.166655  ,  34.15296187,  19.72145208, ...,   0.        ,
          1.12177594,   0.        ]])

In [87]:
item_prediction.shape

(943, 1682)

In [88]:
prediction_df = pd.DataFrame(item_prediction)

In [89]:
prediction_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,115.491412,35.105158,22.91211,56.221213,16.278129,10.554989,103.887714,62.738227,66.306938,34.09023,...,0.0,0.861579,0.0,0.0,0.0,0.0,0.0,0.0,0.94057,0.0
1,73.752764,15.29083,11.897849,25.898152,7.425375,3.375928,70.838939,30.204968,56.441903,19.727097,...,0.0,0.162145,0.0,0.0,0.0,0.0,0.0,0.0,0.410683,0.0
2,28.738498,6.78198,4.745609,10.482964,4.772594,1.175917,24.175527,11.224088,21.857049,7.032099,...,0.0,0.096273,0.0,0.0,0.0,0.0,0.0,0.0,0.260587,0.0
3,42.468076,9.888666,5.729946,15.339814,5.551546,1.434959,34.412207,16.362609,26.902871,8.470336,...,0.0,0.345312,0.0,0.0,0.0,0.0,0.0,0.0,0.254328,0.0
4,101.03541,34.532779,16.658801,49.461321,14.751983,5.544494,89.735865,56.419328,50.539925,26.966412,...,0.0,0.997951,0.0,0.0,0.0,0.0,0.0,0.0,0.765834,0.0


In [90]:
prediction_df.shape

(943, 1682)

## Get recommended movies for user 117

In [91]:
prediction_df.iloc[117]

0       64.571594
1       20.945403
2       12.654755
3       35.717007
4       12.796215
          ...    
1677     0.000000
1678     0.000000
1679     0.000000
1680     0.648160
1681     0.000000
Name: 117, Length: 1682, dtype: float64

In [92]:
prediction_df.iloc[117].sort_values(ascending=False)[:10]

49     99.102508
99     93.800499
97     88.091699
180    86.148372
173    80.003919
55     74.792302
126    69.090818
6      68.430103
182    67.370384
171    65.797124
Name: 117, dtype: float64

In [93]:
recommended_movie_df = pd.DataFrame(prediction_df.iloc[117].sort_values(ascending=False))

In [94]:
recommended_movie_df.head()

Unnamed: 0,117
49,99.102508
99,93.800499
97,88.091699
180,86.148372
173,80.003919


In [95]:
recommended_movie_df.reset_index(inplace=True)

In [96]:
recommended_movie_df.head()

Unnamed: 0,index,117
0,49,99.102508
1,99,93.800499
2,97,88.091699
3,180,86.148372
4,173,80.003919


In [97]:
recommended_movie_df.columns = ['movie_id', 'score']

In [98]:
recommended_movie_df.head(10)

Unnamed: 0,movie_id,score
0,49,99.102508
1,99,93.800499
2,97,88.091699
3,180,86.148372
4,173,80.003919
5,55,74.792302
6,126,69.090818
7,6,68.430103
8,182,67.370384
9,171,65.797124


## Merge movie_id with movie title

In [99]:
merged = pd.merge(recommended_movie_df, movies, on='movie_id', how='left')

In [100]:
merged.head(10)

Unnamed: 0,movie_id,score,movie title
0,49,99.102508,I.Q. (1994)
1,99,93.800499,Snow White and the Seven Dwarfs (1937)
2,97,88.091699,Dances with Wolves (1990)
3,180,86.148372,Apocalypse Now (1979)
4,173,80.003919,"Princess Bride, The (1987)"
5,55,74.792302,"Professional, The (1994)"
6,126,69.090818,"Spitfire Grill, The (1996)"
7,6,68.430103,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
8,182,67.370384,GoodFellas (1990)
9,171,65.797124,Delicatessen (1991)


# Top 10 recommended movies for user 117

In [101]:
merged['movie title'].head(10)

0                                          I.Q. (1994)
1               Snow White and the Seven Dwarfs (1937)
2                            Dances with Wolves (1990)
3                                Apocalypse Now (1979)
4                           Princess Bride, The (1987)
5                             Professional, The (1994)
6                           Spitfire Grill, The (1996)
7    Shanghai Triad (Yao a yao yao dao waipo qiao) ...
8                                    GoodFellas (1990)
9                                  Delicatessen (1991)
Name: movie title, dtype: object