In [2]:
#import math modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#import surprise modules
from surprise.prediction_algorithms.random_pred import NormalPredictor
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import GridSearch
from surprise import accuracy

#Graphing
%matplotlib inline

In [3]:
#Setup functions to split data for testing and training
from sklearn.model_selection import train_test_split

def pick_users_books(df, num_users, num_books):
    #Get the top num_users most prolific users
    user_counts = pd.DataFrame(df.user_id.value_counts()).sort_values('user_id', ascending=False)
    top_10K_users = list(user_counts[0:num_users].index)
    user_filtered_df = df[df.user_id.isin(top_10K_users)]
    #Get the top num_books most reviewed books by the selected users
    filtered_book_counts = pd.DataFrame(user_filtered_df.book_id.value_counts()).sort_values('book_id', ascending = False)
    top_100_filtered_books = list(filtered_book_counts[0:num_books].index)
    #Generate new filtered dataframe
    filtered_df = user_filtered_df[user_filtered_df.book_id.isin(top_100_filtered_books)]
    print("New dataframe has {} users, {} items, and a sparsity of {}".format(len(filtered_df.user_id.unique()),len(filtered_df.book_id.unique()),len(filtered_df)/(len(filtered_df.user_id.unique())*len(filtered_df.book_id.unique()))))
    #Split dataframe into training and test sets
    train, test = train_test_split(filtered_df, test_size = 0.2, random_state=42)
    return train, test

def get_all_subsets(df):
    #Generate different subsets for scaling purposes
    train_500_20, test_500_20 = pick_users_books(df, 500, 20)
    train_2000_50, test_2000_50 = pick_users_books(df, 2000, 50)
    train_10000_100, test_10000_100 = pick_users_books(df, 10000, 100)

    return train_500_20, test_500_20, train_2000_50, test_2000_50, train_10000_100, test_10000_100

In [8]:
#Set up functions to nicely display gridsearch data in a table
#Need to do results_df = pd.DataFrame.from_dict(grid_search.cv_results)

#Function for alternating least squares
def convert_grid_results_als(df):

    my_dict = df.to_dict('list')
    method = []
    reg_i = []
    reg_u = []
    n_epochs = []
    
    for x in range(len(my_dict['bsl_options'])):
        y = my_dict['bsl_options'][x]
        method.append(y['method'])
        reg_i.append(y['reg_i'])    
        reg_u.append(y['reg_u'])
        n_epochs.append(y['n_epochs'])

    del my_dict['params']
    del my_dict['scores']
    del my_dict['bsl_options']

    my_dict['method'] = method
    my_dict['reg_i'] = reg_i
    my_dict['reg_u'] = reg_u
    my_dict['n_epochs'] = n_epochs
    my_df = pd.DataFrame.from_dict(my_dict).sort_values('RMSE')

    return my_df

#Function for gradient descent
def convert_grid_results_sgd(df):

    my_dict = df.to_dict('list')
    method = []
    reg = []
    learning_rate = [] 
    n_epochs = []

    for x in range(len(my_dict['bsl_options'])):
        y = my_dict['bsl_options'][x]
        method.append(y['method'])
        reg.append(y['reg'])        
        learning_rate.append(y['learning_rate'])
        n_epochs.append(y['n_epochs'])

    del my_dict['params']
    del my_dict['scores']
    del my_dict['bsl_options']

    my_dict['method'] = method
    my_dict['reg'] = reg    
    my_dict['learning_rate'] = learning_rate
    my_dict['n_epochs'] = n_epochs
    my_df = pd.DataFrame.from_dict(my_dict).sort_values('RMSE')

    return my_df

In [6]:
#load ratings data
ratings = pd.read_csv('../ratings.csv')

In [7]:
#Split data into trainint and test sets of different sizes
train_500_20, test_500_20, train_2000_50, test_2000_50, train_10000_100, test_10000_100 = get_all_subsets(ratings)

New dataframe has 487 users, 20 items, and a sparsity of 0.44260780287474333
New dataframe has 1981 users, 50 items, and a sparsity of 0.3745583038869258
New dataframe has 9980 users, 100 items, and a sparsity of 0.2719659318637275


In [None]:
#Run the Normal predictor - no gridsearch needed
algo = NormalPredictor()
reader = Reader(rating_scale=(1,5))
train = Dataset.load_from_df(train_10000_100, reader)
test = Dataset.load_from_df(test_10000_100, reader)
train.split(n_folds=3)
perf = evaluate(algo, train, measures=['RMSE', 'MAE'])

In [None]:
#Grid search for baseline - gradient descent
param_grid = {'bsl_options':{'method': ['sgd'], 'reg':[.02,.01,.05], 'learning_rate':[.005,.001, .01], 'n_epochs':[10,20,40]}}
gridsearch = GridSearch(BaselineOnly, param_grid, measures=['RMSE', 'MAE'], verbose=0)
reader = Reader(rating_scale=(1,5))
train = Dataset.load_from_df(train_10000_100, reader)
test = Dataset.load_from_df(test_10000_100, reader)
train.split(n_folds=3)
gridsearch.evaluate(train)
convert_grid_results_sgd(pd.DataFrame.from_dict(gridsearch.cv_results))

In [None]:
#Grid search for baseline - alternating least squares
param_grid = {'bsl_options':{'method': ['als'], 'reg_i':[5,10,20], 'reg_u':[10,15,20], 'n_epochs':[5,10,20]}}
gridsearch = GridSearch(BaselineOnly, param_grid, measures=['RMSE', 'MAE'], verbose=0)
reader = Reader(rating_scale=(1,5))
train = Dataset.load_from_df(train_500_20, reader)
test = Dataset.load_from_df(test_500_20, reader)
train.split(n_folds=3)
gridsearch.evaluate(train)
convert_grid_results_als(pd.DataFrame.from_dict(gridsearch.cv_results))

In [12]:
import time

#Combine train and test sets
all_10000_100 = train_10000_100.append(test_10000_100)
all_500_20 = train_500_20.append(test_500_20)
all_2000_50 = train_2000_50.append(test_2000_50)

def runBaselineSGD(dataframe, num_users, num_books):
    print("\nEvaluating for {} users and {} books".format(num_users, num_books))
    
    #Capture start time
    start_time = time.time()
    
    #set up reader and get data from dataframe
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(dataframe[['user_id', 'book_id', 'rating']], reader)

    #Split data into training set, testing set, and the anti-test set (all the items neither in test nor train)
    trainset = data.build_full_trainset()
    testset = trainset.build_testset()
    antitestset = trainset.build_anti_testset()

    #Set up algorithm
    bsl_options = {'method': 'sgd', 'learning_rate':.005, 'n_epochs':20, 'reg':.01}
    algo = BaselineOnly(bsl_options = bsl_options)

    #Train algorithm
    algo.train(trainset)

    #Get predictions
    predictions = algo.test(testset)
    
    #Print algorithm time
    print("--- %s seconds ---" % (time.time() - start_time))

    #Evaluate accuracy measures
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)
    accuracy.fcp(predictions, verbose=True)
    
runBaselineSGD(all_500_20,500,20)
runBaselineSGD(all_2000_50,2000,50)
runBaselineSGD(all_10000_100,10000,100)


Evaluating for 500 users and 20 books
Estimating biases using sgd...
--- 0.08469676971435547 seconds ---
RMSE: 0.8418
MAE:  0.6653
FCP:  0.6223

Evaluating for 2000 users and 50 books
Estimating biases using sgd...
--- 0.8695199489593506 seconds ---
RMSE: 0.8432
MAE:  0.6586
FCP:  0.6421

Evaluating for 10000 users and 100 books
Estimating biases using sgd...
--- 8.371980905532837 seconds ---
RMSE: 0.8582
MAE:  0.6711
FCP:  0.6517
