In [13]:
#import math modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#import surprise modules
from surprise.prediction_algorithms.random_pred import NormalPredictor
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import GridSearch
from surprise import accuracy

#Graphing
%matplotlib inline

In [14]:
#Setup functions to split data for testing and training
from sklearn.model_selection import train_test_split

def pick_users_books(df, num_users, num_books):
    """Keep the busiest users and their most-rated books, then split train/test.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table; must expose ``user_id`` and ``book_id`` columns.
    num_users : int
        Number of most prolific users (by rating count) to keep.
    num_books : int
        Number of books to keep, ranked by how often the *selected* users
        rated them.

    Returns
    -------
    tuple
        ``(train, test)`` as returned by
        ``train_test_split(filtered, test_size=0.2, random_state=42)``.
    """
    #Get the top num_users most prolific users.
    #value_counts() already orders by descending count, so the head of its index
    #is the answer. Wrapping it in a DataFrame and calling sort_values('user_id')
    #depended on how pandas names the counts column and could end up ordering by
    #the id itself rather than by the count.
    top_users = df.user_id.value_counts().index[:num_users]
    user_filtered_df = df[df.user_id.isin(top_users)]

    #Get the top num_books most reviewed books by the selected users
    top_books = user_filtered_df.book_id.value_counts().index[:num_books]

    #Generate new filtered dataframe
    filtered_df = user_filtered_df[user_filtered_df.book_id.isin(top_books)]

    n_users = len(filtered_df.user_id.unique())
    n_items = len(filtered_df.book_id.unique())
    #The value labelled "sparsity" is ratings / (users * items), i.e. the filled
    #fraction of the user-item matrix. The message text is left unchanged.
    print("New dataframe has {} users, {} items, and a sparsity of {}".format(n_users, n_items, len(filtered_df)/(n_users*n_items)))

    #Split dataframe into training and test sets (fixed seed for repeatability)
    train, test = train_test_split(filtered_df, test_size = 0.2, random_state=42)
    return train, test

def get_all_subsets(df):
    """Build the three standard train/test subsets used for the scaling study.

    Calls ``pick_users_books`` for (500 users, 20 books), (2000, 50) and
    (10000, 100), in that order, and returns the six frames flattened as
    ``(train_500_20, test_500_20, train_2000_50, test_2000_50,
    train_10000_100, test_10000_100)``.
    """
    #Generate different subsets for scaling purposes
    subset_sizes = ((500, 20), (2000, 50), (10000, 100))

    flattened = []
    for user_count, book_count in subset_sizes:
        train_part, test_part = pick_users_books(df, user_count, book_count)
        flattened.append(train_part)
        flattened.append(test_part)

    return tuple(flattened)

In [15]:
#Set up functions to nicely display gridsearch data in a table
#Need to do results_df = pd.DataFrame.from_dict(grid_search.cv_results)

#Shared helper: both converters below differ only in which option names they expand
def _flatten_bsl_options(df, option_names):
    """Expand the ``bsl_options`` dict column of a grid-search table into flat columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``bsl_options`` column holding one dict per row and an
        ``RMSE`` column. ``params`` and ``scores`` columns are dropped when present.
    option_names : iterable of str
        Keys to pull out of every ``bsl_options`` dict, one new column per key.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns plus one column per option name,
        sorted by ascending ``RMSE``.

    Raises
    ------
    KeyError
        If ``bsl_options`` is missing, or a row's dict lacks one of ``option_names``.
    """
    columns = df.to_dict('list')
    option_dicts = columns.pop('bsl_options')

    #These two columns are not wanted in the display table; tolerate frames
    #from which they have already been removed.
    for unwanted in ('params', 'scores'):
        columns.pop(unwanted, None)

    for name in option_names:
        columns[name] = [options[name] for options in option_dicts]

    return pd.DataFrame.from_dict(columns).sort_values('RMSE')

#Function for alternating least squares
def convert_grid_results_als(df):
    """Flatten ALS grid-search results (``method``, ``reg_i``, ``reg_u``, ``n_epochs``), best RMSE first."""
    return _flatten_bsl_options(df, ('method', 'reg_i', 'reg_u', 'n_epochs'))

#Function for gradient descent
def convert_grid_results_sgd(df):
    """Flatten SGD grid-search results (``method``, ``reg``, ``learning_rate``, ``n_epochs``), best RMSE first."""
    return _flatten_bsl_options(df, ('method', 'reg', 'learning_rate', 'n_epochs'))

In [16]:
#Read the raw ratings table from the CSV one directory above the notebook.
#Later cells access its user_id, book_id and rating columns.
ratings = pd.read_csv(filepath_or_buffer='../ratings.csv')

In [55]:
#Split data into training and test sets of different sizes
train_500_20, test_500_20, train_2000_50, test_2000_50, train_10000_100, test_10000_100 = get_all_subsets(ratings)

#More datasets: for each extra size, re-join the train/test halves into one frame.
#pd.concat is used rather than DataFrame.append, which recent pandas releases
#no longer provide; the resulting frames are the same.
tr1, t1 = pick_users_books(ratings, 1000, 50)
all_1000_50 = pd.concat([tr1, t1])
tr2, t2 = pick_users_books(ratings, 5000, 75)
all_5000_75 = pd.concat([tr2, t2])
tr3, t3 = pick_users_books(ratings, 3000, 50)
all_3000_50 = pd.concat([tr3, t3])
tr4, t4 = pick_users_books(ratings, 7500, 100)
all_7500_100 = pd.concat([tr4, t4])

New dataframe has 487 users, 20 items, and a sparsity of 0.44260780287474333
New dataframe has 1981 users, 50 items, and a sparsity of 0.3745583038869258
New dataframe has 9980 users, 100 items, and a sparsity of 0.2719659318637275
New dataframe has 988 users, 50 items, and a sparsity of 0.3857085020242915
New dataframe has 4985 users, 75 items, and a sparsity of 0.317753259779338
New dataframe has 2970 users, 50 items, and a sparsity of 0.3695084175084175
New dataframe has 7488 users, 100 items, and a sparsity of 0.28153044871794874


In [18]:
#Run the Normal predictor - no gridsearch needed
algo = NormalPredictor()
reader = Reader(rating_scale=(1,5))
#Pick the columns explicitly, as runBaselineSGD does: load_from_df reads the
#frame by position (user, item, rating), so relying on the CSV's column order
#is fragile.
train = Dataset.load_from_df(train_10000_100[['user_id', 'book_id', 'rating']], reader)
#The held-out frame is loaded here but not used by the cross-validation below.
test = Dataset.load_from_df(test_10000_100[['user_id', 'book_id', 'rating']], reader)
#3-fold cross-validation on the training portion only
train.split(n_folds=3)
perf = evaluate(algo, train, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm NormalPredictor.

------------
Fold 1
RMSE: 1.3519
MAE:  1.0657
------------
Fold 2
RMSE: 1.3599
MAE:  1.0734
------------
Fold 3
RMSE: 1.3569
MAE:  1.0686
------------
------------
Mean RMSE: 1.3562
Mean MAE : 1.0693
------------
------------


In [19]:
#Grid search for baseline - gradient descent
param_grid = {'bsl_options':{'method': ['sgd'], 'reg':[.02,.01,.05], 'learning_rate':[.005,.001, .01], 'n_epochs':[10,20,40]}}
gridsearch = GridSearch(BaselineOnly, param_grid, measures=['RMSE', 'MAE'], verbose=0)
reader = Reader(rating_scale=(1,5))
#Pick the columns explicitly, as runBaselineSGD does: load_from_df reads the
#frame by position (user, item, rating), so relying on the CSV's column order
#is fragile.
train = Dataset.load_from_df(train_10000_100[['user_id', 'book_id', 'rating']], reader)
#The held-out frame is loaded here but not used by the grid search below.
test = Dataset.load_from_df(test_10000_100[['user_id', 'book_id', 'rating']], reader)
#3-fold cross-validation on the training portion only
train.split(n_folds=3)
gridsearch.evaluate(train)
#Last expression of the cell: show the flattened results, best RMSE first
convert_grid_results_sgd(pd.DataFrame.from_dict(gridsearch.cv_results))

[{'bsl_options': {'n_epochs': 10, 'method': 'sgd', 'learning_rate': 0.005, 'reg': 0.02}}, {'bsl_options': {'n_epochs': 10, 'method': 'sgd', 'learning_rate': 0.005, 'reg': 0.01}}, {'bsl_options': {'n_epochs': 10, 'method': 'sgd', 'learning_rate': 0.005, 'reg': 0.05}}, {'bsl_options': {'n_epochs': 10, 'method': 'sgd', 'learning_rate': 0.001, 'reg': 0.02}}, {'bsl_options': {'n_epochs': 10, 'method': 'sgd', 'learning_rate': 0.001, 'reg': 0.01}}, {'bsl_options': {'n_epochs': 10, 'method': 'sgd', 'learning_rate': 0.001, 'reg': 0.05}}, {'bsl_options': {'n_epochs': 10, 'method': 'sgd', 'learning_rate': 0.01, 'reg': 0.02}}, {'bsl_options': {'n_epochs': 10, 'method': 'sgd', 'learning_rate': 0.01, 'reg': 0.01}}, {'bsl_options': {'n_epochs': 10, 'method': 'sgd', 'learning_rate': 0.01, 'reg': 0.05}}, {'bsl_options': {'n_epochs': 20, 'method': 'sgd', 'learning_rate': 0.005, 'reg': 0.02}}, {'bsl_options': {'n_epochs': 20, 'method': 'sgd', 'learning_rate': 0.005, 'reg': 0.01}}, {'bsl_options': {'n_epo

Unnamed: 0,RMSE,learning_rate,method,n_epochs,reg
10,0.896817,0.005,sgd,20,0.01
20,0.896881,0.005,sgd,40,0.05
9,0.896881,0.005,sgd,20,0.02
11,0.897163,0.005,sgd,20,0.05
18,0.8972,0.005,sgd,40,0.02
19,0.897356,0.005,sgd,40,0.01
7,0.898463,0.01,sgd,10,0.01
6,0.898544,0.01,sgd,10,0.02
17,0.898547,0.01,sgd,20,0.05
15,0.898816,0.01,sgd,20,0.02


In [20]:
#Grid search for baseline - alternating least squares
param_grid = {'bsl_options':{'method': ['als'], 'reg_i':[5,10,20], 'reg_u':[10,15,20], 'n_epochs':[5,10,20]}}
gridsearch = GridSearch(BaselineOnly, param_grid, measures=['RMSE', 'MAE'], verbose=0)
reader = Reader(rating_scale=(1,5))
#NOTE(review): this search runs on the 500-user / 20-book subset, whereas the
#SGD search uses the 10000-user / 100-book subset - confirm that is intended
#before comparing the two RMSE tables.
#Pick the columns explicitly, as runBaselineSGD does: load_from_df reads the
#frame by position (user, item, rating), so relying on the CSV's column order
#is fragile.
train = Dataset.load_from_df(train_500_20[['user_id', 'book_id', 'rating']], reader)
#The held-out frame is loaded here but not used by the grid search below.
test = Dataset.load_from_df(test_500_20[['user_id', 'book_id', 'rating']], reader)
#3-fold cross-validation on the training portion only
train.split(n_folds=3)
gridsearch.evaluate(train)
#Last expression of the cell: show the flattened results, best RMSE first
convert_grid_results_als(pd.DataFrame.from_dict(gridsearch.cv_results))

[{'bsl_options': {'reg_i': 5, 'method': 'als', 'reg_u': 10, 'n_epochs': 5}}, {'bsl_options': {'reg_i': 5, 'method': 'als', 'reg_u': 10, 'n_epochs': 10}}, {'bsl_options': {'reg_i': 5, 'method': 'als', 'reg_u': 10, 'n_epochs': 20}}, {'bsl_options': {'reg_i': 5, 'method': 'als', 'reg_u': 15, 'n_epochs': 5}}, {'bsl_options': {'reg_i': 5, 'method': 'als', 'reg_u': 15, 'n_epochs': 10}}, {'bsl_options': {'reg_i': 5, 'method': 'als', 'reg_u': 15, 'n_epochs': 20}}, {'bsl_options': {'reg_i': 5, 'method': 'als', 'reg_u': 20, 'n_epochs': 5}}, {'bsl_options': {'reg_i': 5, 'method': 'als', 'reg_u': 20, 'n_epochs': 10}}, {'bsl_options': {'reg_i': 5, 'method': 'als', 'reg_u': 20, 'n_epochs': 20}}, {'bsl_options': {'reg_i': 10, 'method': 'als', 'reg_u': 10, 'n_epochs': 5}}, {'bsl_options': {'reg_i': 10, 'method': 'als', 'reg_u': 10, 'n_epochs': 10}}, {'bsl_options': {'reg_i': 10, 'method': 'als', 'reg_u': 10, 'n_epochs': 20}}, {'bsl_options': {'reg_i': 10, 'method': 'als', 'reg_u': 15, 'n_epochs': 5}},

Unnamed: 0,RMSE,method,n_epochs,reg_i,reg_u
18,0.941916,als,5,20,10
19,0.941916,als,10,20,10
20,0.941916,als,20,20,10
9,0.942123,als,5,10,10
10,0.942123,als,10,10,10
11,0.942123,als,20,10,10
0,0.942422,als,5,5,10
1,0.942422,als,10,5,10
2,0.942422,als,20,5,10
21,0.952976,als,5,20,15


In [56]:
import time

#Combine train and test sets back into one frame per subset size.
#pd.concat is used rather than DataFrame.append, which recent pandas releases
#no longer provide; the resulting frames are the same.
all_10000_100 = pd.concat([train_10000_100, test_10000_100])
all_500_20 = pd.concat([train_500_20, test_500_20])
all_2000_50 = pd.concat([train_2000_50, test_2000_50])


def runBaselineSGD(dataframe, num_users, num_books):
    """Fit BaselineOnly (SGD) on every rating in ``dataframe``; print timings and accuracy.

    The model is trained on the full trainset and scored on
    ``trainset.build_testset()``, which is built from that same trainset.
    The printed RMSE / MAE / FCP / Spearman therefore describe the fit on the
    training ratings, not held-out performance.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Ratings with ``user_id``, ``book_id`` and ``rating`` columns.
    num_users, num_books : int
        Used only to label the printed report.

    Returns
    -------
    None
        All results are printed.
    """
    print("\nEvaluating for {} users and {} books; number of ratings: {}".format(num_users, num_books, len(dataframe)))

    #set up reader and get data from dataframe
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(dataframe[['user_id', 'book_id', 'rating']], reader)

    #Train on everything, then score on the very same ratings.
    #(The anti-testset that used to be built here was never used and has been dropped.)
    trainset = data.build_full_trainset()
    testset = trainset.build_testset()

    #Set up algorithm with fixed SGD hyper-parameters
    bsl_options = {'method': 'sgd', 'learning_rate':.005, 'n_epochs':20, 'reg':.01}
    algo = BaselineOnly(bsl_options = bsl_options)

    #Time the training step; perf_counter is a monotonic, high-resolution clock
    start_time = time.perf_counter()
    algo.train(trainset)
    print("---Training: %s seconds ---" % (time.perf_counter() - start_time))

    #Time the prediction step
    start_time = time.perf_counter()
    predictions = algo.test(testset)
    print("---Testing: %s seconds ---" % (time.perf_counter() - start_time))

    #Evaluate accuracy measures
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)
    accuracy.fcp(predictions, verbose=True)
    #spearman() is defined in a separate cell further down the notebook; that
    #cell has to be executed before this function is called.
    print("SPEARMAN:  {}".format(spearman(predictions)))
    
#Run the SGD baseline on every combined subset, smallest to largest
for _frame, _n_users, _n_books in [
    (all_500_20, 500, 20),
    (all_1000_50, 1000, 50),
    (all_2000_50, 2000, 50),
    (all_3000_50, 3000, 50),
    (all_5000_75, 5000, 75),
    (all_7500_100, 7500, 100),
    (all_10000_100, 10000, 100),
]:
    runBaselineSGD(_frame, _n_users, _n_books)


Evaluating for 500 users and 20 books; number of ratings: 4311
Estimating biases using sgd...
---Training: 0.015040397644042969 seconds ---
---Testing: 0.0210568904876709 seconds ---
RMSE: 0.8418
MAE:  0.6653
FCP:  0.6223


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


SPEARMAN:  0.26246319425252296

Evaluating for 1000 users and 50 books; number of ratings: 19054
Estimating biases using sgd...
---Training: 0.12639832496643066 seconds ---
---Testing: 0.10679054260253906 seconds ---
RMSE: 0.8491
MAE:  0.6632
FCP:  0.6359
SPEARMAN:  0.2826033295098762

Evaluating for 2000 users and 50 books; number of ratings: 37100
Estimating biases using sgd...
---Training: 0.2462773323059082 seconds ---
---Testing: 0.2077312469482422 seconds ---
RMSE: 0.8432
MAE:  0.6586
FCP:  0.6421
SPEARMAN:  0.2833207893730011

Evaluating for 3000 users and 50 books; number of ratings: 54872
Estimating biases using sgd...
---Training: 0.4576988220214844 seconds ---
---Testing: 0.5168583393096924 seconds ---
RMSE: 0.8409
MAE:  0.6564
FCP:  0.6451
SPEARMAN:  0.28965952881578805

Evaluating for 5000 users and 75 books; number of ratings: 118800
Estimating biases using sgd...
---Training: 0.9623401165008545 seconds ---
---Testing: 0.9470932483673096 seconds ---
RMSE: 0.8468
MAE:  0.6

In [42]:
from scipy.stats import spearmanr
def spearman(predictions):
    """Mean per-user Spearman rank correlation between true and estimated ratings.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        ``(uid, iid, true_r, est, details)``; only ``uid``, ``true_r`` and
        ``est`` are used.

    Returns
    -------
    float
        Average of the per-user correlations. A user contributes only when
        both the true ratings and the estimates take at least two distinct
        values (the correlation is undefined otherwise). NaN is returned when
        no user qualifies.

    Notes
    -----
    As before, a pair is skipped when either value is 0 or NaN.
    """
    ratings_by_user = {}
    for uid, _iid, true_r, est, _ in predictions:
        if float(true_r) and float(est) and not np.isnan(true_r) and not np.isnan(est):
            truths, estimates = ratings_by_user.setdefault(uid, ([], []))
            truths.append(true_r)
            estimates.append(est)

    correlations = []
    for truths, estimates in ratings_by_user.values():
        # A constant vector (this also covers a single rating) has no rank
        # correlation; skipping it up front gives the same result as discarding
        # the NaN afterwards, without the runtime warnings spearmanr emits.
        if len(set(truths)) < 2 or len(set(estimates)) < 2:
            continue
        rho = spearmanr(truths, estimates)[0]
        if not np.isnan(rho):  # defensive: never average a NaN
            correlations.append(rho)

    if not correlations:
        # np.mean([]) would warn; return NaN explicitly instead.
        return float('nan')
    return np.mean(correlations)