#Simple Movie Recommendation System

Imports

In [9]:
import numpy as np
import pandas as pd
from surprise import Reader
from surprise import KNNWithMeans
from surprise import Dataset
import pickle
from surprise.model_selection import cross_validate
from surprise import accuracy, Dataset, SVD
from sklearn.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import SVD
import random

Loading the Data

In [10]:
movie_df_csv = pd.read_csv("/content/watched_rated_df.csv")
df_sub = movie_df_csv[['userid','movieid','rating']]
train, test = train_test_split(df_sub, test_size=0.2)
print(df_sub.head)

<bound method NDFrame.head of         userid                           movieid  rating
0       122183              raising+arizona+1987       4
1        79445                    the+piano+1993       3
2        67863                 forrest+gump+1994       4
3       179877               101+dalmatians+1996       5
4       224353      the+empire+strikes+back+1980       4
...        ...                               ...     ...
153036  182952                  unbreakable+2000       5
153037   40018  theres+something+about+mary+1998       4
153038  210126           revolutionary+road+2008       1
153039   39098                 forrest+gump+1994       5
153040    8828                        42+up+1998       4

[153041 rows x 3 columns]>


Data Preprocessing

In [11]:
reader = Reader(rating_scale=(1, 5))    
data_train = Dataset.load_from_df(train[['userid','movieid','rating']], reader)
trainingSet = data_train.build_full_trainset()

Training the model

In [12]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

param_grid = {
"n_epochs": [5, 10],
"lr_all": [0.002, 0.005],
"reg_all": [0.4, 0.6]
}
# Get the best params using GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=4)
gs.fit(data_train)
best_params = gs.best_params["rmse"]
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])
# Extract and train model with best params
svd_algo = SVD(n_epochs=best_params['n_epochs'],
               lr_all=best_params['lr_all'],
               reg_all=best_params['reg_all'])
svd_algo.fit(trainingSet)
predictions = []
actuals = []
rmse_val = []
for col, row in test.iterrows():
   predictions.append(svd_algo.predict(row.userid, row.movieid).est)
   actuals.append(row.rating)
rmse_val = rmse(np.array(predictions), np.array(actuals))
print("Test RMSE for SVD : " + str(rmse_val))

0.9662466663081891
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
Test RMSE for SVD : 0.9571522718708014


Saving the model

In [13]:
filename = 'SVD_14k'
outfile = open(filename,'wb')
pickle.dump(svd_algo,outfile)
outfile.close()

Predicting a movie list for users

In [14]:
"""
The following function fetches a list of movies a user has not watched and uses the model to predict what rating 
they would give to that movie. We then take the top 20 predicted ratings of unwatched movies and recommend them to 
a particular user. 
"""
master_movie_list = movie_df_csv['movieid'].unique().tolist()
top_20 = movie_df_csv.groupby(['movieid']).mean().rating.sort_values(0, ascending = False)[0:20]
top_20_tup = [(top_20.index[i], top_20.values[i]) for i in range(len(top_20.values))]
def get_recommendations(userId,model):
    if(userId not in movie_df_csv['userid'].values.tolist()):
        return random.sample(top_20_tup, len(top_20_tup))
    watched_list = movie_df_csv[movie_df_csv['userid'] == userId]['movieid'].unique().tolist()
    pred_movies = [i for i in master_movie_list if i not in watched_list]
    pred_dict = dict()
    for i in pred_movies:
        pred_dict[i] = model.predict(userId, i).est
    pred_dict = sorted(pred_dict.items(), key=lambda x: -x[1])
    if(len(pred_dict) < 20):
        return pred_dict
    else:
        return pred_dict[:20]
    return pred_dict
print(get_recommendations(122183,svd_algo))

[('the+shawshank+redemption+1994', 4.098211685955927), ('life+is+beautiful+1997', 4.044715808805278), ('the+lives+of+others+2006', 4.041316968038501), ('schindlers+list+1993', 4.038423651077876), ('spirited+away+2001', 4.022538018135133), ('the+deer+hunter+1978', 4.021288879683404), ('seven+samurai+1954', 4.0144943531754915), ('once+upon+a+time+in+the+west+1968', 4.013404728771617), ('midnight+cowboy+1969', 4.0067535199336), ('the+big+sleep+1946', 3.9892455937723073), ('the+dark+knight+2008', 3.9875545487275876), ('harry+potter+and+the+deathly+hallows+part+1+2010', 3.9814972740673564), ('modern+times+1936', 3.979222436618549), ('the+great+escape+1963', 3.974457933972328), ('scarface+1983', 3.9692729725556006), ('midnight+express+1978', 3.967695073262296), ('raise+the+red+lantern+1991', 3.9662465903791806), ('raiders+of+the+lost+ark+1981', 3.9659944527916), ('the+godfather+1972', 3.9649279392386867), ('the+good_+the+bad+and+the+ugly+1966', 3.962613285766666)]


  top_20 = movie_df_csv.groupby(['movieid']).mean().rating.sort_values(0, ascending = False)[0:20]
