# Two Stage Recommender System Testing
*CS247 Project Winter 2022 - Ryan Li, Osama Hassen, Sam Anderson*

## Recommender Systems on MovieLens 1M Dataset

### Loading and Formatting the Dataset

In [1]:
#primary source: https://www.kaggle.com/scratchpad/notebookbdf4340e3d/edit
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [18, 8]

In [2]:
#Load m1-1m dataset into Colab
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
#!unzip ml-1m.zip      #this works in colab but not windows/jupyter

#Use this for windows/jupyter to unzip
import zipfile
with zipfile.ZipFile("ml-1m.zip", "r") as zip_ref:
    zip_ref.extractall()

--2022-03-08 22:50:28--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip.1’


2022-03-08 22:50:37 (726 KB/s) - ‘ml-1m.zip.1’ saved [5917549/5917549]



In [3]:
#extract data into dataframes
reviews = pd.read_csv('ml-1m/ratings.dat', names=['userId', 'movieId', 'rating', 'time'], delimiter='::', engine='python', encoding="iso-8859-1")
movies = pd.read_csv('ml-1m/movies.dat', names=['movieId', 'movie_names', 'genres'], delimiter='::', engine='python', encoding="iso-8859-1")
users = pd.read_csv('ml-1m/users.dat', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python', encoding="iso-8859-1")

print('Reviews shape:', reviews.shape)
print('Users shape:', users.shape)
print('Movies shape:', movies.shape)



Reviews shape: (1000209, 4)
Users shape: (6040, 5)
Movies shape: (3883, 3)


In [4]:
#reformat the dataset to retain/enhance useful features then merge the tables

reviews.drop(['time'], axis=1, inplace=True)
users.drop(['zip'], axis=1, inplace=True)

movies['release_year'] = movies['movie_names'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)

ages_map = {1: 'Under 18',
            18: '18 - 24',
            25: '25 - 34',
            35: '35 - 44',
            45: '45 - 49',
            50: '50 - 55',
            56: '56+'}

occupations_map = {0: 'Not specified',
                   1: 'Academic / Educator',
                   2: 'Artist',
                   3: 'Clerical / Admin',
                   4: 'College / Grad Student',
                   5: 'Customer Service',
                   6: 'Doctor / Health Care',
                   7: 'Executive / Managerial',
                   8: 'Farmer',
                   9: 'Homemaker',
                   10: 'K-12 student',
                   11: 'Lawyer',
                   12: 'Programmer',
                   13: 'Retired',
                   14: 'Sales / Marketing',
                   15: 'Scientist',
                   16: 'Self-Employed',
                   17: 'Technician / Engineer',
                   18: 'Tradesman / Craftsman',
                   19: 'Unemployed',
                   20: 'Writer'}

gender_map = {'M': 'Male', 'F': 'Female'}

users['age'] = users['age'].map(ages_map)
users['occupation'] = users['occupation'].map(occupations_map)
users['gender'] = users['gender'].map(gender_map)

final_df = reviews.merge(movies, on='movieId', how='left').merge(users, on='userId', how='left')

print('final_df shape:', final_df.shape)
final_df

final_df shape: (1000209, 9)


Unnamed: 0,userId,movieId,rating,movie_names,genres,release_year,gender,age,occupation
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama,1975,Female,Under 18,K-12 student
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical,1996,Female,Under 18,K-12 student
2,1,914,3,My Fair Lady (1964),Musical|Romance,1964,Female,Under 18,K-12 student
3,1,3408,4,Erin Brockovich (2000),Drama,2000,Female,Under 18,K-12 student
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy,1998,Female,Under 18,K-12 student
...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,Weekend at Bernie's (1989),Comedy,1989,Male,25 - 34,Doctor / Health Care
1000205,6040,1094,5,"Crying Game, The (1992)",Drama|Romance|War,1992,Male,25 - 34,Doctor / Health Care
1000206,6040,562,5,Welcome to the Dollhouse (1995),Comedy|Drama,1995,Male,25 - 34,Doctor / Health Care
1000207,6040,1096,4,Sophie's Choice (1982),Drama,1982,Male,25 - 34,Doctor / Health Care


In [5]:
n_users = final_df['userId'].nunique()
n_movies = final_df['movieId'].nunique()

print('Number of users:', n_users)
print('Number of movies:', n_movies)

Number of users: 6040
Number of movies: 3706


In [6]:
final_df_matrix = final_df.pivot(index='userId',
                                 columns='movieId',
                                 values='rating').fillna(0)

In [7]:
#save final_df to csv for nominators
final_df.to_csv('final_df')


### Begin Nominator Stage - KNNBasic Nominators
#### (1) Generate n nominator dfs from full dataset

In [8]:
final_df = pd.read_csv('./final_df')
final_df=final_df.drop(['Unnamed: 0'], axis=1)
final_df.head()

Unnamed: 0,userId,movieId,rating,movie_names,genres,release_year,gender,age,occupation
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama,1975,Female,Under 18,K-12 student
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical,1996,Female,Under 18,K-12 student
2,1,914,3,My Fair Lady (1964),Musical|Romance,1964,Female,Under 18,K-12 student
3,1,3408,4,Erin Brockovich (2000),Drama,2000,Female,Under 18,K-12 student
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy,1998,Female,Under 18,K-12 student


In [9]:
#enter df of whole dataset, number of nominators, and fraction of sampling
#(frac=1/4 means each nominator will randomly sample 25% of the df)
#allow for some overlap. n_noms should be > denominator of frac.
def nom_sampling(df, n_noms, frac):
    nom_dfs = {}
    k = 0
    while k < n_noms:
        # dynamically create key
        nom_dfs[k] = df.sample(frac=frac, replace=False, axis=0) 
        k += 1
    print(len(nom_dfs), 'dfs of ', nom_dfs[0].shape[0], 'randomly chosen movies have been created')
    return nom_dfs

In [19]:
#!pip install surprise #
from surprise import Reader, Dataset, KNNBasic, SVD, SVDpp, accuracy

#### (2) Using dictionaries for the nominators and duplicating the training code for each item as individual functions

In [35]:

#fitting nominators
def fit_noms(nom_dfs):  #expects dictionary of nominator dfs
    reader = Reader(rating_scale=(1, 5))
    nom_datasets = {key:Dataset.load_from_df(nom_dfs[key][['userId', 'movieId', 'rating']], reader=reader) for key in nom_dfs}
    nom_trainsets = {key:nom_datasets[key].build_full_trainset() for key in nom_datasets}
    nom_knns = {key:KNNBasic(k=100).fit(nom_trainsets[key]) for key in nom_trainsets}
    print(len(nom_dfs), 'nominators have been trained')
    return nom_knns  #returns dictionary of trained models


def Build_Testset4User(df, u_id):
    
    global user_id
    user_id=u_id
    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader=reader)
    trainset = dataset.build_full_trainset()  #whole dataset is used for trainig
    fill = trainset.global_mean
    testset = list()
    u = trainset.to_inner_uid(user_id)
    
    # ur == users ratings
    user_items = set([item_inner_id for (item_inner_id, rating) in trainset.ur[u]])
    
    testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
                            i in trainset.all_items() if i not in user_items]
    
    return testset

#enter testset (from Build_Testset4User), nom_knns (from fit_noms), # items per nominator to be passed to Ranker
def Nominator_KNN(testset, nom_knns, num_nomin=100):  
    nom_predict = {key:nom_knns[key].test(testset) for key in nom_knns}  # we can change to SVD++ later
    
    nom_recomm, recommendation = [], {}
    recommendation = {}
    
    for key in nom_predict:
        for userID, movieID, actualRating, estimatedRating, _ in nom_predict[key]:
            intMovieID = int(movieID)
            nom_recomm.append((intMovieID, estimatedRating))
        recommendation[key] = nom_recomm

        recommendation[key].sort(key=lambda x: x[1], reverse=True)  #sort nominated movies by rating desc
    
    #generate list of nominated movie_ids from all nominators
    nominated_mov_ids= []
    for key in recommendation:
        for i in range(num_nomin):
            nominated_mov_ids.append(recommendation[key][i][0])  
        
    nom_movies_df=final_df.loc[final_df['movieId'].isin(nominated_mov_ids)]
    
    return nom_movies_df



#### (3) Train Ranker with input from nominators

In [25]:
#create and train an SVD Ranker stage 
def fit_ranker_SVD(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    ranker_svd = SVD(n_factors=50)
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_svd.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_svd, ranker_trainset


def Rank_SVD(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_svd.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)



#### (4) Test

In [43]:
%%time
# Train Nominator
n_noms = 8
u_id = 1920
nom_dfs = nom_sampling(final_df, n_noms, 1/5)
nom_knns = fit_noms(nom_dfs)
testset = Build_Testset4User(final_df, u_id)
nom_movies_df = Nominator_KNN(testset, nom_knns, num_nomin=500)

8 dfs of  200042 randomly chosen movies have been created
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
8 nominators have been trained
CPU times: user 29.7 s, sys: 3.87 s, total: 33.6 s
Wall time: 36.3 s


In [44]:
%%time
#return top 10 recommended movies by predicted average rating
ranker_svd, ranker_trainset_svd = fit_ranker_SVD(nom_movies_df)
res_svd = Rank_SVD(num_recommend=10)
ranker_testset_svd = ranker_trainset_svd.build_anti_testset()
predictions_svd = ranker_svd.test(ranker_testset_svd)
print('KNNBasic/SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))
print('KNNBasic/SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))

Ranker has been trained
KNNBasic/SVD - RMSE: 0.45117106133882995
KNNBasic/SVD - MAE: 0.34666435392468176
CPU times: user 15.7 s, sys: 11.2 s, total: 26.9 s
Wall time: 36.3 s


#### SVD++ Ranker:

In [28]:
#create and train an SVD Ranker stage 
from surprise import SVDpp

def fit_ranker_SVDpp(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    global ranker_svdpp
    ranker_svdpp = SVDpp(n_factors=50)
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_svdpp.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_svdpp, ranker_trainset


def Rank_SVDpp(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_svdpp.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)



In [49]:
%%time
ranker_svdpp, ranker_trainset_svdpp = fit_ranker_SVDpp(nom_movies_df)
res_svdpp = Rank_SVDpp(num_recommend=10)
ranker_testset_svdpp = ranker_trainset_svdpp.build_anti_testset()
predictions_svdpp = ranker_svdpp.test(ranker_testset_svdpp)
print('KNNBasic/SVDpp - RMSE:', accuracy.rmse(predictions_svdpp, verbose=False))
print('KNNBasic/SVDpp - MAE:', accuracy.mae(predictions_svdpp, verbose=False))

Ranker has been trained
KNNBasic/SVDpp - RMSE: 0.4684584732705012
KNNBasic/SVDpp - MAE: 0.3603568878186442
CPU times: user 36.3 s, sys: 1.64 s, total: 38 s
Wall time: 42.4 s


### NMF Ranker:

In [50]:
#create and train an NMF Ranker stage 
from surprise import NMF

def fit_ranker_NMF(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    global ranker_nmf
    ranker_nmf = NMF(n_factors=50)
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_nmf.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_nmf, ranker_trainset


def Rank_NMF(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_nmf.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)

In [51]:
%%time
ranker_nmf, ranker_trainset_nmf = fit_ranker_NMF(nom_movies_df)
res_nmf = Rank_NMF(num_recommend=10)
ranker_testset_nmf = ranker_trainset_nmf.build_anti_testset()
predictions_nmf = ranker_nmf.test(ranker_testset_nmf)
print('KNNBasic/NMF - RMSE:', accuracy.rmse(predictions_nmf, verbose=False))
print('KNNBasic/NMF - MAE:', accuracy.mae(predictions_nmf, verbose=False))

Ranker has been trained
KNNBasic/NMF - RMSE: 0.8427751491238175
KNNBasic/NMF - MAE: 0.6438389811203957
CPU times: user 14.2 s, sys: 1.53 s, total: 15.7 s
Wall time: 17.2 s


### KNNBasic Ranker:

In [52]:
#create and train an KNNBasic Ranker stage 
from surprise import KNNBasic

def fit_ranker_KNN(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    global ranker_knn
    ranker_knn = KNNBasic(n_factors=50)
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_knn.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_knn, ranker_trainset


def Rank_KNN(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_knn.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)

In [54]:
%%time
ranker_knn, ranker_trainset_knn = fit_ranker_KNN(nom_movies_df)
res_knn = Rank_KNN(num_recommend=10)
ranker_testset_knn = ranker_trainset_knn.build_anti_testset()
predictions_knn = ranker_knn.test(ranker_testset_knn)
print('KNNBasic/KNNBasic - RMSE:', accuracy.rmse(predictions_knn, verbose=False))
print('KNNBasic/KNNBasic - MAE:', accuracy.mae(predictions_knn, verbose=False))

Computing the msd similarity matrix...
Done computing similarity matrix.
Ranker has been trained
KNNBasic/KNNBasic - RMSE: 0.7617291553875398
KNNBasic/KNNBasic - MAE: 0.5489114922496527
CPU times: user 1min 21s, sys: 3.9 s, total: 1min 25s
Wall time: 1min 33s


### KNN with Means Ranker:

In [55]:
#create and train an KNN With Means Ranker stage 
from surprise import KNNWithMeans

def fit_ranker_KNNWM(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    global ranker_knnwm
    ranker_knnwm = KNNWithMeans(n_factors=50)
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_knnwm.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_knnwm, ranker_trainset


def Rank_KNNWM(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_knnwm.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)

In [56]:
%%time
ranker_knnwm, ranker_trainset_knnwm = fit_ranker_KNNWM(nom_movies_df)
res_knnwm = Rank_KNNWM(num_recommend=10)
ranker_testset_knnwm = ranker_trainset_knnwm.build_anti_testset()
predictions_knnwm = ranker_knnwm.test(ranker_testset_knnwm)
print('KNNBasic/KNNWM - RMSE:', accuracy.rmse(predictions_knnwm, verbose=False))
print('KNNBasic/KNNWM - MAE:', accuracy.mae(predictions_knnwm, verbose=False))

Computing the msd similarity matrix...
Done computing similarity matrix.
Ranker has been trained
KNNBasic/KNNWM - RMSE: 0.8610385706961676
KNNBasic/KNNWM - MAE: 0.6343361568982232
CPU times: user 1min 13s, sys: 2.88 s, total: 1min 16s
Wall time: 1min 21s


### KNN with ZScore Ranker:

In [57]:
#create and train an KNN With ZScore Ranker stage 
from surprise import KNNWithZScore

def fit_ranker_KNNZS(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    global ranker_knnzs
    ranker_knnzs = KNNWithZScore(n_factors=50)
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_knnzs.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_knnwm, ranker_trainset


def Rank_KNNZS(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_knnzs.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)

In [59]:
%%time
ranker_knnzs, ranker_trainset_knnzs = fit_ranker_KNNZS(nom_movies_df)
res_knnzs = Rank_KNNZS(num_recommend=10)
ranker_testset_knnzs = ranker_trainset_knnzs.build_anti_testset()
predictions_knnzs = ranker_knnzs.test(ranker_testset_knnzs)
print('KNNBasic/KNNZS - RMSE:', accuracy.rmse(predictions_knnzs, verbose=False))
print('KNNBasic/KNNZS - MAE:', accuracy.mae(predictions_knnzs, verbose=False))

Computing the msd similarity matrix...
Done computing similarity matrix.
Ranker has been trained
KNNBasic/KNNZS - RMSE: 0.8610385706961676
KNNBasic/KNNZS - MAE: 0.6343361568982232
CPU times: user 1min 22s, sys: 2.24 s, total: 1min 25s
Wall time: 1min 34s


### KNNBaseline Ranker:

In [60]:
#create and train an KNN Baseline Ranker stage 
from surprise import KNNBaseline

def fit_ranker_KNNBL(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    global ranker_knnbl
    ranker_knnbl = KNNBaseline(n_factors=50)
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_knnbl.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_knnbl, ranker_trainset


def Rank_KNNBL(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_knnbl.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)

In [61]:
%%time
ranker_knnbl, ranker_trainset_knnbl = fit_ranker_KNNBL(nom_movies_df)
res_knnbl = Rank_KNNBL(num_recommend=10)
ranker_testset_knnbl = ranker_trainset_knnbl.build_anti_testset()
predictions_knnbl = ranker_knnbl.test(ranker_testset_knnbl)
print('KNNBasic/KNNBL - RMSE:', accuracy.rmse(predictions_knnbl, verbose=False))
print('KNNBasic/KNNBL - MAE:', accuracy.mae(predictions_knnbl, verbose=False))

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Ranker has been trained
KNNBasic/KNNBL - RMSE: 0.7104599984272624
KNNBasic/KNNBL - MAE: 0.5209843842687514
CPU times: user 1min 25s, sys: 7.18 s, total: 1min 32s
Wall time: 1min 43s


### SlopeOne Ranker:

In [63]:
#create and train an SlopeOne Ranker stage 
from surprise import SlopeOne

def fit_ranker_SO(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    global ranker_so
    ranker_so = SlopeOne()
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_so.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_so, ranker_trainset


def Rank_SO(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_so.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)

In [65]:
%%time
ranker_so, ranker_trainset_so = fit_ranker_SO(nom_movies_df)
res_so = Rank_SO(num_recommend=10)
ranker_testset_so = ranker_trainset_so.build_anti_testset()
predictions_so = ranker_so.test(ranker_testset_so)
print('KNNBasic/SO - RMSE:', accuracy.rmse(predictions_so, verbose=False))
print('KNNBasic/SO - MAE:', accuracy.mae(predictions_so, verbose=False))

Ranker has been trained
KNNBasic/SO - RMSE: 1.0000093128965781
KNNBasic/SO - MAE: 0.7491493367396678
CPU times: user 14.2 s, sys: 3.9 s, total: 18.1 s
Wall time: 19.6 s


### CoClustering Ranker:

In [66]:
#create and train an CoClustering Ranker stage 
from surprise import CoClustering

def fit_ranker_CC(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    global ranker_cc
    ranker_cc = CoClustering()
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_cc.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_cc, ranker_trainset


def Rank_CC(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_cc.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)

In [67]:
%%time
ranker_cc, ranker_trainset_cc = fit_ranker_CC(nom_movies_df)
res_cc = Rank_CC(num_recommend=10)
ranker_testset_cc = ranker_trainset_cc.build_anti_testset()
predictions_cc = ranker_cc.test(ranker_testset_cc)
print('KNNBasic/CC - RMSE:', accuracy.rmse(predictions_cc, verbose=False))
print('KNNBasic/CC - MAE:', accuracy.mae(predictions_cc, verbose=False))

Ranker has been trained
KNNBasic/CC - RMSE: 0.9491439142210314
KNNBasic/CC - MAE: 0.7147106648730456
CPU times: user 8.72 s, sys: 452 ms, total: 9.17 s
Wall time: 9.49 s


### Below is the baseline of a single stage

In [68]:
%%time
knnss=KNNBasic(k=40)
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(final_df[['userId', 'movieId', 'rating']], reader=reader)
trainset = dataset.build_full_trainset()
knnss.fit(trainset)
testset = trainset.build_anti_testset()
predictions_knnss = knnss.test(testset)
print('KNN - RMSE:', accuracy.rmse(predictions_knnss, verbose=False))
print('KNN - MAE:', accuracy.mae(predictions_knnss, verbose=False))

Computing the msd similarity matrix...
Done computing similarity matrix.
KNN - RMSE: 0.7612654931634538
KNN - MAE: 0.5800607958862584
CPU times: user 1h 30min 51s, sys: 2min 58s, total: 1h 33min 49s
Wall time: 1h 46min 26s
