In [1]:
#in this notebook we will implement 3 recommendation systems:
#1. Item item collaborative filtering
#2. Matrix factorization
#3. SLIM which is a technique shopify uses

import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np



In [22]:
movies =pd.read_csv('ml-latest-small/movies.csv')
#movies_more_info=pd.read_csv('tmdb-movie-metadata/tmdb_5000_movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
links = pd.read_csv('ml-latest-small/links.csv')

In [121]:
print (movies)
print (movies.shape)
print (movies.movieId.value_counts())

      movieId                                              title  \
0           1                                   Toy Story (1995)   
1           2                                     Jumanji (1995)   
2           3                            Grumpier Old Men (1995)   
3           4                           Waiting to Exhale (1995)   
4           5                 Father of the Bride Part II (1995)   
5           6                                        Heat (1995)   
6           7                                     Sabrina (1995)   
7           8                                Tom and Huck (1995)   
8           9                                Sudden Death (1995)   
9          10                                   GoldenEye (1995)   
10         11                     American President, The (1995)   
11         12                 Dracula: Dead and Loving It (1995)   
12         13                                       Balto (1995)   
13         14                                   

In [65]:
print (ratings.head())
print (ratings.shape)
print (ratings.movieId.value_counts())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
(100836, 4)
356       329
318       317
296       307
593       279
2571      278
260       251
480       238
110       237
589       224
527       220
2959      218
1         215
1196      211
2858      204
50        204
47        203
780       202
150       201
1198      200
4993      198
1210      196
858       192
457       190
592       189
5952      188
2028      188
7153      185
588       183
608       181
2762      179
         ... 
4451        1
31086       1
62718       1
188797      1
31150       1
33201       1
94735       1
80454       1
94799       1
2656        1
26732       1
8293        1
65601       1
6178        1
7459        1
179511      1
5602        1
128542      1
44597       1
7843        1
42740       1
1824        1
5922        1
1533

In [33]:
#getting the genres from the movies dataframe
movies['genres_list']= movies['genres'].str.split('|')

In [35]:
dummy_genre = pd.get_dummies(movies['genres_list'].apply(pd.Series).stack()).sum(level=0)

In [36]:
movies = pd.concat([movies,dummy_genre], axis=1)

In [63]:
#Now implementing item item collaborative filtering

#Creating a dataframe where user is row and movies is column

movie_user_df =pd.pivot_table(ratings, index='movieId', columns='userId', 
                              values='rating', aggfunc='mean').reset_index().rename_axis(None, axis=1)
movie_user_df.fillna(0,inplace=True)
#movie_user_df.reset_index(inplace=True)

In [66]:
#movie_user_df.reset_index(inplace=True,drop=True)
print (movie_user_df)
print (movie_user_df.shape)

      movieId    1    2    3    4    5    6    7    8    9  ...  601  602  \
0           1  4.0  0.0  0.0  0.0  4.0  0.0  4.5  0.0  0.0  ...  4.0  0.0   
1           2  0.0  0.0  0.0  0.0  0.0  4.0  0.0  4.0  0.0  ...  0.0  4.0   
2           3  4.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  ...  0.0  0.0   
3           4  0.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  ...  0.0  0.0   
4           5  0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  ...  0.0  0.0   
5           6  4.0  0.0  0.0  0.0  0.0  4.0  0.0  0.0  0.0  ...  0.0  3.0   
6           7  0.0  0.0  0.0  0.0  0.0  4.0  0.0  0.0  0.0  ...  0.0  0.0   
7           8  0.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  ...  0.0  0.0   
8           9  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
9          10  0.0  0.0  0.0  0.0  0.0  3.0  0.0  2.0  0.0  ...  0.0  3.0   
10         11  0.0  0.0  0.0  0.0  0.0  4.0  0.0  4.0  0.0  ...  0.0  3.0   
11         12  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

In [98]:
print (movie_user_df[movie_user_df['movieId']==18].index.item())

17


In [68]:
#Now calculating the similarity using pairwise cosine sim
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(movie_user_df.loc[:,movie_user_df.columns!='movieId'])


In [120]:
#creating a function which takes a movie and outputs the 5 similar movies
print ((similarities))
print (np.argsort(similarities,axis=1)[:,::-1][:,:6])
similarity = np.argsort(similarities,axis=1)[:,::-1][:,1:6]

[[1.         0.41056206 0.2969169  ... 0.         0.         0.        ]
 [0.41056206 1.         0.28243799 ... 0.         0.         0.        ]
 [0.2969169  0.28243799 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
[[   0 2353  418  615  224  314]
 [   1  322  436  325  418  504]
 [   2 2578  607  622  594    4]
 ...
 [9525 9358 9442 9439 9155 9556]
 [9525 9358 9442 9439 9155 9556]
 [9723 9665 9603 9691 7870 2957]]


In [127]:
def top5(input_movie):
    movieId=movies[movies['title']==input_movie]['movieId'].item()
    #print (movieId)
    row=movie_user_df[movie_user_df['movieId']==movieId].index.item()
    #print (row)
    for i in similarity[row]:
        movie_id=movie_user_df.loc[i,'movieId'].item()
        print (movies[movies['movieId']==movie_id]['title'].item())

In [133]:
top5('Heat (1995)')
#looks dope 

Rock, The (1996)
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Léon: The Professional (a.k.a. The Professional) (Léon) (1994)
Casino (1995)
Fargo (1996)


In [139]:
#Now using this similarity matrix to create the SLIM method of recommendation used
#by Shopify
#here the idea is - 
#User*Movie(sparse) matrix X movie*movie(similarity matrix)= user*movie matrix (dense)

user_movie_ratings = pd.pivot_table(ratings, index='userId', columns='movieId',
                                   values='rating', aggfunc='mean').reset_index().rename_axis(None, axis=1)
user_movie_ratings.fillna(0,inplace=True)


     1       2       3       4       5       6       7       8       9       \
0       4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0     0.0   
1       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4       4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5       0.0     4.0     5.0     3.0     5.0     4.0     4.0     3.0     0.0   
6       4.5     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
7       0.0     4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
8       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
9       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
10      0.0     0.0     0.0     0.0     0.0     5.0     0.0     0.0     0.0   
11      0.0     0.0     0.0     0.0     0.0     0.0 

In [143]:
user_movie_ratings_new = np.matmul(user_movie_ratings.iloc[:,1:], similarities)

In [148]:
top10user = user_movie_ratings_new.argsort()[:,::-1][:,:10]

In [160]:
#creating a function which takes user as input and outputs the recommended movies
def top5movies_user(userId):
    row=user_movie_ratings[user_movie_ratings['userId']==userId].index.item()
    for i in top10user[row]:
        movie_id = user_movie_ratings.columns[i]
        print (movie_id)
        print (movies[movies['movieId']==movie_id]['title'].item())

In [201]:
top5movies_user(2)

79091
Despicable Me (2010)
58554
Class, The (Klass) (2007)
99112
Jack Reacher (2012)
68135
17 Again (2009)
74452
Wolfman, The (2010)
91500
The Hunger Games (2012)
48416
School for Scoundrels (2006)
122627
Oblivion 2: Backlash (1996)
106766
Inside Llewyn Davis (2013)
6873
Intolerable Cruelty (2003)


In [173]:
#lst = (list(ratings[(ratings.userId==1) & (ratings.rating==5.0)]['movieId']))

In [190]:
#Using matrix factorization

from scipy.sparse.linalg import svds
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(user_movie_ratings.iloc[:,1:], k = NUMBER_OF_FACTORS_MF)
sigma = np.diag(sigma)

In [191]:
all_user_predicted_ratings_MF = np.dot(np.dot(U, sigma), Vt) 

In [192]:
print (all_user_predicted_ratings_MF.shape)

(610, 9724)


In [197]:
top10user_MF = all_user_predicted_ratings_MF.argsort()[:,::-1][:,:10]

In [193]:
print (np.sort(ratings['movieId'].unique()))

[     1      2      3 ... 193585 193587 193609]


In [195]:
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_MF, columns = np.sort(ratings['movieId'].unique()), index=np.sort(ratings['userId'].unique()))
cf_preds_df.head(10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,2.483136,1.358859,1.106374,-0.02592,0.174511,1.614364,0.167845,0.043874,0.206104,1.823636,...,-0.003493,-0.002994,-0.003992,-0.003992,-0.003493,-0.003992,-0.003493,-0.003493,-0.003493,-0.02476
2,0.042254,0.010381,0.032058,0.002357,0.030592,-0.001568,0.013115,0.000889,0.019802,-0.139452,...,0.009957,0.008534,0.011379,0.011379,0.009957,0.011379,0.009957,0.009957,0.009957,0.013161
3,0.013802,0.02864,0.031904,-0.002689,-0.022141,0.064227,0.000262,0.009224,0.012006,0.038832,...,0.000942,0.000808,0.001077,0.001077,0.000942,0.001077,0.000942,0.000942,0.000942,-0.001751
4,2.082146,0.162685,0.102233,0.053761,0.146481,0.4548,0.30422,-0.061288,-0.030823,-0.003292,...,-0.003688,-0.003162,-0.004215,-0.004215,-0.003688,-0.004215,-0.003688,-0.003688,-0.003688,-0.014209
5,1.383537,0.957738,0.406646,0.129537,0.536569,0.756295,0.610276,0.130696,0.103396,1.16321,...,-0.000702,-0.000602,-0.000802,-0.000802,-0.000702,-0.000802,-0.000702,-0.000702,-0.000702,0.000739
6,3.083983,3.400135,1.883538,0.405848,1.867406,2.189975,2.306211,0.605855,0.471779,3.9169,...,0.005509,0.004722,0.006296,0.006296,0.005509,0.006296,0.005509,0.005509,0.005509,-0.011051
7,1.995476,0.789781,-0.082301,-0.031872,0.097296,0.860021,0.001919,-0.084484,-0.036732,1.176269,...,-0.010775,-0.009236,-0.012314,-0.012314,-0.010775,-0.012314,-0.010775,-0.010775,-0.010775,0.00582
8,1.672878,1.220942,0.575672,0.16343,0.7052,1.024762,0.788211,0.164804,0.153066,1.60397,...,-0.001049,-0.000899,-0.001199,-0.001199,-0.001049,-0.001199,-0.001049,-0.001049,-0.001049,0.002347
9,0.366844,0.042292,-0.025592,-0.011815,-0.004028,-0.002142,-0.044961,-0.022251,-0.020231,0.078649,...,-0.000115,-9.9e-05,-0.000131,-0.000131,-0.000115,-0.000131,-0.000115,-0.000115,-0.000115,0.002419
10,1.160564,0.399342,-0.066738,0.012906,0.217126,-0.53314,0.172168,-0.008901,-0.065745,-0.199155,...,0.010719,0.009187,0.01225,0.01225,0.010719,0.01225,0.010719,0.010719,0.010719,0.018345


In [198]:
#creating a function which takes user as input and outputs the recommended movies
def top5movies_user_MF(userId):
    row=user_movie_ratings[user_movie_ratings['userId']==userId].index.item()
    for i in top10user_MF[row]:
        movie_id = user_movie_ratings.columns[i]
        print (movie_id)
        print (movies[movies['movieId']==movie_id]['title'].item())

In [200]:
print (top5movies_user_MF(2))

79091
Despicable Me (2010)
58554
Class, The (Klass) (2007)
2956
Someone to Watch Over Me (1987)
317
Santa Clause, The (1994)
68135
17 Again (2009)
48416
School for Scoundrels (2006)
355
Flintstones, The (1994)
295
Pyromaniac's Love Story, A (1995)
91500
The Hunger Games (2012)
2328
Vampires (1998)
None
