In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
from scipy.sparse import csr_matrix as sparse_matrix

from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV

from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

# Ignore warnings :
import warnings
warnings.filterwarnings('ignore')

In [55]:
# Reading ratings, movies, tags file
# Ignore the timestamp column

ratings = pd.read_csv('ratings.csv', sep=',', usecols=['userId', 'movieId', 'rating'])
movies = pd.read_csv('movies.csv', sep=',', usecols=[ 'movieId','title', 'genres'])
links = pd.read_csv('links.csv', sep=',', usecols=['movieId', 'imdbId', 'tmdbId'])

###  Making some recommendations using SVD

In [89]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [56]:
collab = pd.merge(ratings, movies, on = 'movieId')
collab_data= pd.merge(links, collab, on = 'movieId')
collab_data

Unnamed: 0,movieId,imdbId,tmdbId,userId,rating,title,genres
0,1,114709,862.0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,114709,862.0,5,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,1,114709,862.0,7,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,1,114709,862.0,15,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,1,114709,862.0,17,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...,...
100831,193581,5476944,432131.0,184,4.0,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
100832,193583,5914996,445030.0,184,3.5,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
100833,193585,6397426,479308.0,184,3.5,Flint (2017),Drama
100834,193587,8391976,483455.0,184,3.5,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [51]:
matrix = collab_data.pivot_table(index = 'userId', columns = 'title', values = ['rating','tag'])
matrix.head(20)

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
title,(500) Days of Summer (2009),...And Justice for All (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),"11'09""01 - September 11 (2002)",12 Angry Men (1957),127 Hours (2010),13 Going on 30 (2004),...,Young Frankenstein (1974),Z (1969),Zack and Miri Make a Porno (2008),Zelary (2003),Zelig (1983),Zero Dark Thirty (2012),Zombieland (2009),Zoolander (2001),Zulu (1964),eXistenZ (1999)
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,,,...,5.0,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,3.0,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,5.0,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [52]:
user_matrix = matrix.copy()

# We will fill the row wise NaN's with the corresponding user's mean ratings, so that we can carry out Pearson correlation.
# Here we assume avg ratings for the movie that is not rated.
user_matrix = user_matrix.apply(lambda row: row.fillna(row.mean()), axis=1)
user_matrix.head(5)

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
title,(500) Days of Summer (2009),...And Justice for All (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),"11'09""01 - September 11 (2002)",12 Angry Men (1957),127 Hours (2010),13 Going on 30 (2004),...,Young Frankenstein (1974),Z (1969),Zack and Miri Make a Porno (2008),Zelary (2003),Zelig (1983),Zero Dark Thirty (2012),Zombieland (2009),Zoolander (2001),Zulu (1964),eXistenZ (1999)
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.403509,4.403509,4.403509,4.403509,4.403509,4.403509,4.403509,4.403509,4.403509,4.403509,...,5.0,4.403509,4.403509,4.403509,4.403509,4.403509,4.403509,4.403509,4.403509,4.403509
2,4.15,4.15,4.15,4.15,4.15,4.15,4.15,4.15,4.15,4.15,...,4.15,4.15,4.15,4.15,4.15,4.15,3.0,4.15,4.15,4.15
3,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,3.688525,3.688525,3.688525,3.688525,3.688525,3.688525,3.688525,5.0,3.688525,3.688525,...,3.688525,3.688525,3.688525,3.688525,3.688525,3.688525,3.688525,3.688525,3.688525,3.688525
5,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359,...,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359,3.74359


In [53]:
user_matrix.T.corr()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,-0.003424,,0.086193,0.038452,-0.093565,-0.024790,0.077054,4.751886e-02,0.002685,...,0.021855,0.013574,-0.002845,-9.074793e-02,1.059845e-02,0.040294,0.058590,0.121309,-0.047981,0.029090
2,-0.003424,1.000000,,-0.022109,0.049159,-0.046600,0.020600,-0.090953,1.248747e-15,0.011828,...,-0.106930,-0.058841,-0.004010,-9.206747e-16,-6.086303e-17,0.013590,-0.046480,-0.020786,-0.148054,0.073223
3,,,,,,,,,,,...,,,,,,,,,,
4,0.086193,-0.022109,,1.000000,-0.043078,0.054840,0.123121,0.017259,-1.223449e-02,0.133164,...,-0.060644,0.024707,0.053723,-2.948084e-02,6.782328e-02,0.038916,0.036490,-0.088129,-0.040539,-0.013142
5,0.038452,0.049159,,-0.043078,1.000000,-0.033540,0.016047,0.005517,-1.131009e-16,-0.049522,...,0.012946,0.006491,0.039920,5.237805e-02,-8.064051e-02,0.022047,0.043201,0.010971,0.174031,-0.005793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.040294,0.013590,,0.038916,0.022047,-0.064853,0.023329,0.040879,7.888350e-02,-0.051889,...,0.089497,0.006587,0.167864,6.249548e-02,1.408441e-02,1.000000,0.032427,0.108089,0.068225,0.093292
607,0.058590,-0.046480,,0.036490,0.043201,0.101653,0.069164,0.060232,-1.910450e-02,-0.050376,...,0.070187,0.068020,0.097181,-1.048557e-02,-3.354458e-02,0.032427,1.000000,0.058826,0.034337,-0.010357
608,0.121309,-0.020786,,-0.088129,0.010971,-0.039235,0.028102,0.108033,5.882416e-02,-0.084997,...,0.112378,0.082668,0.046518,2.884344e-02,-2.334787e-02,0.108089,0.058826,1.000000,0.072377,0.035296
609,-0.047981,-0.148054,,-0.040539,0.174031,0.147709,0.030085,0.103667,2.977811e-16,-0.070750,...,0.069796,0.108731,0.015039,1.106573e-01,-1.227985e-01,0.068225,0.034337,0.072377,1.000000,-0.053233


In [54]:
# NaN values are generated in corr() as the std dev is zero, which is required in calculating Pearson Similarity.
corr_matrix = user_matrix.T.corr()
corr_matrix.dropna(inplace = True)

In [55]:
collab_data[collab_data['userId'] == 1]

Unnamed: 0,movieId,tag,userId,rating,title,genres
3691,110,beautiful scenery,1,4.0,Braveheart (1995),Action|Drama|War
3928,110,epic,1,4.0,Braveheart (1995),Action|Drama|War
4165,110,historical,1,4.0,Braveheart (1995),Action|Drama|War
4402,110,inspirational,1,4.0,Braveheart (1995),Action|Drama|War
4639,110,Medieval,1,4.0,Braveheart (1995),Action|Drama|War
...,...,...,...,...,...,...
205854,3386,president,1,5.0,JFK (1991),Drama|Mystery|Thriller
206357,3489,Peter Pan,1,4.0,Hook (1991),Adventure|Comedy|Fantasy
228465,3671,dark humor,1,5.0,Blazing Saddles (1974),Comedy|Western
228527,3671,easygoing,1,5.0,Blazing Saddles (1974),Comedy|Western


In [57]:
collab_data[(collab_data['movieId'] == 45)]

Unnamed: 0,movieId,tag,userId,rating,title,genres
160498,45,Journalism,4,3.0,To Die For (1995),Comedy|Drama|Thriller
160499,45,Journalism,6,3.0,To Die For (1995),Comedy|Drama|Thriller
160500,45,Journalism,64,3.5,To Die For (1995),Comedy|Drama|Thriller
160501,45,Journalism,84,4.0,To Die For (1995),Comedy|Drama|Thriller
160502,45,Journalism,109,3.0,To Die For (1995),Comedy|Drama|Thriller
160503,45,Journalism,132,2.5,To Die For (1995),Comedy|Drama|Thriller
160504,45,Journalism,156,4.0,To Die For (1995),Comedy|Drama|Thriller
160505,45,Journalism,181,3.0,To Die For (1995),Comedy|Drama|Thriller
160506,45,Journalism,182,3.0,To Die For (1995),Comedy|Drama|Thriller
160507,45,Journalism,217,2.0,To Die For (1995),Comedy|Drama|Thriller


In [71]:
ratings_matrix = ratings.pivot_table(index=['movieId'],columns=['userId'],values='rating').reset_index(drop=True)
ratings_matrix.fillna( 0, inplace = True )

movie_similarity = 1 - pairwise_distances(ratings_matrix.to_numpy(), metric="cosine" )
np.fill_diagonal( movie_similarity, 0 ) 
ratings_matrix = pd.DataFrame( movie_similarity )


#Recommender

try:
    #user_inp=input('Enter the reference movie title based on which recommendations are to be made: ')
    user_inp="Speed (1994)"
    inp=movies[movies['title']==user_inp].index.tolist()
    inp=inp[0]
    
    movies['similarity'] = ratings_matrix.iloc[inp]
    movies.columns = ['movie_id', 'title', 'release_date','similarity']
    movies.head(5)
    
except:
    print("Sorry, the movie is not in the database!")
    
print("Recommended movies based on your choice of ",user_inp ,": \n", movies.sort_values( ["similarity"], ascending = False )[1:10])

Recommended movies based on your choice of  Speed (1994) : 
      movie_id                              title  \
418       480               Jurassic Park (1993)   
398       457               Fugitive, The (1993)   
138       165  Die Hard: With a Vengeance (1995)   
507       589  Terminator 2: Judgment Day (1991)   
436       500              Mrs. Doubtfire (1993)   
509       592                      Batman (1989)   
314       356                Forrest Gump (1994)   
514       597                Pretty Woman (1990)   
9          10                   GoldenEye (1995)   

                         release_date  similarity  
418  Action|Adventure|Sci-Fi|Thriller    0.661732  
398                          Thriller    0.648475  
138             Action|Crime|Thriller    0.634087  
507                     Action|Sci-Fi    0.630092  
436                      Comedy|Drama    0.613426  
509             Action|Crime|Thriller    0.577031  
314          Comedy|Drama|Romance|War    0.576160  
51

In [72]:
#reading in the range of ratings score
reader = Reader(rating_scale = (1,5))
#loading the dataframe into surprise
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

### Singular Value Decomposition (SVD)

In [73]:
method = SVD()
cross_validate(algo=method, data=data, measures=['RMSE'], cv=3)

{'test_rmse': array([0.88206018, 0.87779352, 0.87869081]),
 'fit_time': (0.5159218311309814, 0.515190839767456, 0.527357816696167),
 'test_time': (0.2272353172302246, 0.09750962257385254, 0.1082308292388916)}

In [74]:
param_grid = {'n_factors' : [50, 75], 'lr_all' : [0.5, 0.05], 'reg_all' : [0.06, 0.04]}

grid = GridSearchCV(algo_class=SVD, measures=['RMSE'], param_grid=param_grid,n_jobs = -1)
grid.fit(data)

# Best RMSE score
print('Best Score :', round(grid.best_score['rmse'],2)*100)

# Combination of parameters that gave the best RMSE score
print('Best Parameters :', grid.best_params['rmse'])

Best Score : 86.0
Best Parameters : {'n_factors': 75, 'lr_all': 0.05, 'reg_all': 0.06}


In [75]:
dataset = data.build_full_trainset()

In [76]:
svd = SVD(n_factors= 100, n_epochs = 60, reg_all=0.1, lr_all=0.01)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x139f4b1ad10>

In [77]:
svd.predict(1, 90)

Prediction(uid=1, iid=90, r_ui=None, est=4.278132715643666, details={'was_impossible': False})

For movie with ID 90, we get an estimated prediction of 4.27. One negative of this recommender system is that it doesn't care what the movie is (or what it contains). It works purely on the basis of an assigned movie ID and tries to predict based on how the other users have rated the movie.