# Two Stage Recommender System Testing
*CS247 Project Winter 2022 - Ryan Li, Osama Hassen, Sam Anderson*

## Recommender Systems on MovieLens 1M Dataset

### Loading and Formatting the Dataset

In [1]:
#primary source: https://www.kaggle.com/scratchpad/notebookbdf4340e3d/edit
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [18, 8]

In [2]:
#Load m1-1m dataset into Colab
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
#!unzip ml-1m.zip      #this works in colab but not windows/jupyter

#Use this for windows/jupyter to unzip
import zipfile
with zipfile.ZipFile("ml-1m.zip", "r") as zip_ref:
    zip_ref.extractall()

--2022-03-08 22:50:28--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip.1’


2022-03-08 22:50:37 (726 KB/s) - ‘ml-1m.zip.1’ saved [5917549/5917549]



In [13]:
#extract data into dataframes
reviews = pd.read_csv('ml-1m/ratings.dat', names=['userId', 'movieId', 'rating', 'time'], delimiter='::', engine='python', encoding="iso-8859-1")
movies = pd.read_csv('ml-1m/movies.dat', names=['movieId', 'movie_names', 'genres'], delimiter='::', engine='python', encoding="iso-8859-1")
users = pd.read_csv('ml-1m/users.dat', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python', encoding="iso-8859-1")

print('Reviews shape:', reviews.shape)
print('Users shape:', users.shape)
print('Movies shape:', movies.shape)



Reviews shape: (1000209, 4)
Users shape: (6040, 5)
Movies shape: (3883, 3)


In [14]:
#reformat the dataset to retain/enhance useful features then merge the tables

reviews.drop(['time'], axis=1, inplace=True)
users.drop(['zip'], axis=1, inplace=True)

movies['release_year'] = movies['movie_names'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)

ages_map = {1: 'Under 18',
            18: '18 - 24',
            25: '25 - 34',
            35: '35 - 44',
            45: '45 - 49',
            50: '50 - 55',
            56: '56+'}

occupations_map = {0: 'Not specified',
                   1: 'Academic / Educator',
                   2: 'Artist',
                   3: 'Clerical / Admin',
                   4: 'College / Grad Student',
                   5: 'Customer Service',
                   6: 'Doctor / Health Care',
                   7: 'Executive / Managerial',
                   8: 'Farmer',
                   9: 'Homemaker',
                   10: 'K-12 student',
                   11: 'Lawyer',
                   12: 'Programmer',
                   13: 'Retired',
                   14: 'Sales / Marketing',
                   15: 'Scientist',
                   16: 'Self-Employed',
                   17: 'Technician / Engineer',
                   18: 'Tradesman / Craftsman',
                   19: 'Unemployed',
                   20: 'Writer'}

gender_map = {'M': 'Male', 'F': 'Female'}

users['age'] = users['age'].map(ages_map)
users['occupation'] = users['occupation'].map(occupations_map)
users['gender'] = users['gender'].map(gender_map)

final_df = reviews.merge(movies, on='movieId', how='left').merge(users, on='userId', how='left')

print('final_df shape:', final_df.shape)
final_df

final_df shape: (1000209, 9)


Unnamed: 0,userId,movieId,rating,movie_names,genres,release_year,gender,age,occupation
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama,1975,Female,Under 18,K-12 student
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical,1996,Female,Under 18,K-12 student
2,1,914,3,My Fair Lady (1964),Musical|Romance,1964,Female,Under 18,K-12 student
3,1,3408,4,Erin Brockovich (2000),Drama,2000,Female,Under 18,K-12 student
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy,1998,Female,Under 18,K-12 student
...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,Weekend at Bernie's (1989),Comedy,1989,Male,25 - 34,Doctor / Health Care
1000205,6040,1094,5,"Crying Game, The (1992)",Drama|Romance|War,1992,Male,25 - 34,Doctor / Health Care
1000206,6040,562,5,Welcome to the Dollhouse (1995),Comedy|Drama,1995,Male,25 - 34,Doctor / Health Care
1000207,6040,1096,4,Sophie's Choice (1982),Drama,1982,Male,25 - 34,Doctor / Health Care


In [11]:
n_users = final_df['userId'].nunique()
n_movies = final_df['movieId'].nunique()

print('Number of users:', n_users)
print('Number of movies:', n_movies)

Number of users: 6040
Number of movies: 3706


In [36]:
final_df_matrix = final_df.pivot(index='userId',
                                 columns='movieId',
                                 values='rating').fillna(0)

In [37]:
#save final_df to csv for nominators
final_df.to_csv('final_df')


### Begin Nominator Stage - SVD Nominators
#### (1) Generate n nominator dfs from full dataset

In [2]:
final_df = pd.read_csv('./final_df')
final_df=final_df.drop(['Unnamed: 0'], axis=1)
final_df.head()

Unnamed: 0,userId,movieId,rating,movie_names,genres,release_year,gender,age,occupation
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama,1975,Female,Under 18,K-12 student
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical,1996,Female,Under 18,K-12 student
2,1,914,3,My Fair Lady (1964),Musical|Romance,1964,Female,Under 18,K-12 student
3,1,3408,4,Erin Brockovich (2000),Drama,2000,Female,Under 18,K-12 student
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy,1998,Female,Under 18,K-12 student


In [4]:
#enter df of whole dataset, number of nominators, and fraction of sampling
#(frac=1/4 means each nominator will randomly sample 25% of the df)
#allow for some overlap. n_noms should be > denominator of frac.
def nom_sampling(df, n_noms, frac):
    nom_dfs = {}
    k = 0
    while k < n_noms:
        # dynamically create key
        nom_dfs[k] = df.sample(frac=frac, replace=False, axis=0) 
        k += 1
    print(len(nom_dfs), 'dfs of ', nom_dfs[0].shape[0], 'randomly chosen movies have been created')
    return nom_dfs

In [5]:
#!pip install surprise #
from surprise import Reader, Dataset, SVD, SVDpp, accuracy

#### (2) Using dictionaries for the nominators and duplicating the training code for each item as individual functions

In [6]:

#fitting nominators
def fit_noms(nom_dfs):  #expects dictionary of nominator dfs
    reader = Reader(rating_scale=(1, 5))
    nom_datasets = {key:Dataset.load_from_df(nom_dfs[key][['userId', 'movieId', 'rating']], reader=reader) for key in nom_dfs}
    nom_trainsets = {key:nom_datasets[key].build_full_trainset() for key in nom_datasets}
    nom_svds = {key:SVD(n_factors=50, n_epochs=20, random_state=42).fit(nom_trainsets[key]) for key in nom_trainsets}
    print(len(nom_dfs), 'nominators have been trained')
    return nom_svds  #returns dictionary of trained models


def Build_Testset4User(df, u_id):
    
    global user_id
    user_id=u_id
    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader=reader)
    trainset = dataset.build_full_trainset()  #whole dataset is used for trainig
    fill = trainset.global_mean
    testset = list()
    u = trainset.to_inner_uid(user_id)
    
    # ur == users ratings
    user_items = set([item_inner_id for (item_inner_id, rating) in trainset.ur[u]])
    
    testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
                            i in trainset.all_items() if i not in user_items]
    
    return testset

#enter testset (from Build_Testset4User), nom_svds (from fit_noms), # items per nominator to be passed to Ranker
def Nominator_SVD(testset, nom_knnbls, num_nomin=100):  
    nom_predict = {key:nom_svds[key].test(testset) for key in nom_svds}  # we can change to SVD++ later
    
    nom_recomm, recommendation = [], {}
    recommendation = {}
    
    for key in nom_predict:
        for userID, movieID, actualRating, estimatedRating, _ in nom_predict[key]:
            intMovieID = int(movieID)
            nom_recomm.append((intMovieID, estimatedRating))
        recommendation[key] = nom_recomm

        recommendation[key].sort(key=lambda x: x[1], reverse=True)  #sort nominated movies by rating desc
    
    #generate list of nominated movie_ids from all nominators
    nominated_mov_ids= []
    for key in recommendation:
        for i in range(num_nomin):
            nominated_mov_ids.append(recommendation[key][i][0])  
        
    nom_movies_df=final_df.loc[final_df['movieId'].isin(nominated_mov_ids)]
    
    return nom_movies_df



#### (3) Train Ranker with input from nominators

In [7]:
#create and train an SVD Ranker stage 
def fit_ranker_SVD(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    ranker_svd = SVD(n_factors=100,n_epochs=50,random_state=42)
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_svd.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_svd, ranker_trainset




#### (4) Test

### SVD Nominators:

In [33]:
%%time
# Train Nominator
n_noms = 10
u_id = 1920
nom_dfs = nom_sampling(final_df, n_noms, 1/2)
nom_svds = fit_noms(nom_dfs)
testset = Build_Testset4User(final_df, u_id)
nom_movies_df = Nominator_SVD(testset, nom_svds, num_nomin=500)

10 dfs of  500104 randomly chosen movies have been created
10 nominators have been trained
Wall time: 2min 57s


### KNNBasic Ranker:

In [9]:
#create and train an KNNBasic Ranker stage 
from surprise import KNNBasic

def fit_ranker_KNN(nom_movies_df):   #takes output of nominator stage as a df
    reader = Reader(rating_scale=(1, 5))
    ranker_dataset = Dataset.load_from_df(nom_movies_df[['userId', 'movieId', 'rating']], reader=reader)
    global ranker_knn
    ranker_knn = KNNBasic(n_factors=50, random_state=42)
    ranker_trainset = ranker_dataset.build_full_trainset()
    ranker_knn.fit(ranker_trainset)
    print('Ranker has been trained')
    return ranker_knn, ranker_trainset


def Rank_KNN(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_knn.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)

In [34]:
%%time
ranker_knn, ranker_trainset_knn = fit_ranker_KNN(nom_movies_df)
res_knn = Rank_KNN(num_recommend=10)
ranker_testset_knn = ranker_trainset_knn.build_anti_testset()
predictions_knn = ranker_knn.test(ranker_testset_knn)
print('SVD/KNNBasic - RMSE:', accuracy.rmse(predictions_knn, verbose=False))
print('SVD/KNNBasic - MAE:', accuracy.mae(predictions_knn, verbose=False))

Computing the msd similarity matrix...
Done computing similarity matrix.
Ranker has been trained
SVD/KNNBasic - RMSE: 0.3202063417827386
SVD/KNNBasic - MAE: 0.2535558833469828
Wall time: 6min 44s


### Below is the baseline of a single stage

In [None]:
%%time
svd=SVD(n_factors=50, n_epochs=20, random_state=42)
dataset = Dataset.load_from_df(final_df[['userId', 'movieId', 'rating']], reader=reader)
trainset = dataset.build_full_trainset()
svd.fit(trainset)
testset = trainset.build_anti_testset()
predictions_svd = svd.test(testset)
print('SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))
print('SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))

SVD - RMSE: 0.6998857258382658
SVD - MAE: 0.541771079017402


In [None]:
def Rank_SVD(num_recommend=10, latest=False):
    
    testset = Build_Testset4User(final_df, user_id)
    predict = ranker_svd.test(testset)  
    
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
        
    recommendation.sort(key=lambda x: x[1], reverse=True)
    
    id_2_names = dict()
    for idx, names in zip(movies['movieId'], movies['movie_names']):
        id_2_names[idx] = names
    
    movie_names, movie_ratings = [], []
    for name, ratings in recommendation[:20]:
        movie_names.append(id_2_names[name])
        movie_ratings.append(ratings)
        
    movie_dataframe =  pd.DataFrame({'movie_names': movie_names,
                                     'rating': movie_ratings}).merge(movies[['movie_names', 'release_year']],
                                            on='movie_names', how='left')
    
    if latest == True:
        return movie_dataframe.sort_values('release_year', ascending=False)[['movie_names', 'rating']].head(num_recommend)
    
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommend)

