In [46]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import accuracy
from sklearn import cross_validation as cv

%matplotlib inline

In [47]:
# Load the Toronto reviews from a CSV file located in the working directory.
reviews_csv_path = './toronto_reviews.csv'
df = pd.read_csv(reviews_csv_path)

In [48]:
# Keep only the user / item / rating columns; these are the three columns
# later handed to Dataset.load_from_df.
df2 = df.loc[:, ['user_id', 'business_id', 'stars']]

In [49]:
# Hold out 40% of the ratings as the test split.
# random_state pins the shuffle so the same rows land in train/test on every
# run; without it each kernel restart produces a different split.
train_df, test_df = cv.train_test_split(df2, test_size=0.4, random_state=42)

In [50]:
# Declare the range in which ratings are accepted: 1 (lowest) to 5 (highest).
rating_bounds = (1, 5)
reader = Reader(rating_scale=rating_bounds)

In [51]:
# Wrap each pandas split in a surprise Dataset (the original author notes that
# surprise can also read datasets straight from disk without going through pandas).
# Only the train and test splits are built; no validation split is created here.
rating_columns = ['user_id', 'business_id', 'stars']
train_data = Dataset.load_from_df(train_df[rating_columns], reader)
test_data = Dataset.load_from_df(test_df[rating_columns], reader)

In [52]:
# surprise requires the datasets to be converted explicitly. The test split is
# never trained on, but it is still passed through build_full_trainset()
# because build_testset() is called on that result in the next step.
# Note: both names are rebound to the objects returned by build_full_trainset(),
# so this cell cannot be re-run without re-running the previous one.
test_data = test_data.build_full_trainset()
train_data = train_data.build_full_trainset()

In [53]:
# Finally, build the rating lists on which the error will be measured.
# train_data itself is kept (it is still needed for fitting), so its
# test-form goes to train_data_2; test_data is overwritten in place.
test_data = test_data.build_testset()
train_data_2 = train_data.build_testset()

In [54]:
# Global mean rating of the training set; printed for reference and reused
# later when reconstructing the rating matrix.
mean = train_data.global_mean
print(mean)

3.605220625763645


In [55]:
# Baseline SVD model: 5 factors, 200 epochs, bias terms enabled,
# learning rate 0.005 and regularisation 0 for all parameters.
svd_params = dict(
    n_factors=5,
    n_epochs=200,
    biased=True,
    lr_all=0.005,
    reg_all=0,
    init_mean=0,
    init_std_dev=0.01,
    verbose=False,
)
algo = SVD(**svd_params)

In [56]:
# Train the model on the training set.
# The call is left as the cell's last expression, so its return value is displayed.
algo.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x106b467f0>

In [57]:
# Rebuild the full predicted rating matrix from the fitted model:
#   r_hat[u, i] = mean + bu[u] + bi[i] + pu[u] . qi[i]
# The factors and biases are read from `algo` directly; the bare names
# pu / qi / bu / bi are not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
# bu is added down the rows (one value per user) and bi across the columns
# (one value per item), hence the explicit new axes for broadcasting.
# NOTE(review): the result is a dense (n_users, n_items) array indexed by
# surprise's inner ids; confirm it fits in memory for this dataset.
reconstruct = (
    algo.pu.dot(algo.qi.T)
    + algo.bu[:, np.newaxis]
    + algo.bi[np.newaxis, :]
    + mean
)

In [58]:
# Some reconstructed ratings fall outside the accepted range, so they are
# clamped to the interval [1, 5].
reconstruct = np.clip(reconstruct, a_min=1, a_max=5)

## Evaluación de predicciones

A partir del modelo entrenado, se mide el error (RMSE) del modelo en los conjuntos de entrenamiento y prueba (el conjunto de validación no se construye en este notebook).

In [65]:
# Predict every rating of the training rating list with the fitted model.
predictions_train = algo.test(testset=train_data_2)

In [None]:
# Predict every rating of the held-out test rating list
# (no validation predictions are computed).
predictions_test = algo.test(testset=test_data)

In [None]:
# RMSE on the training predictions (printed because verbose is on).
accuracy.rmse(predictions_train, verbose=True)

In [None]:
# RMSE on the test predictions (printed because verbose is on).
accuracy.rmse(predictions_test, verbose=True)

## Selección de hiper-parámetros

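Se entrenan modelos SVD con distintas combinaciones de hiperparámetros (número de factores, épocas, tasa de aprendizaje y regularización) para luego seleccionar la combinación con menor RMSE.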

In [64]:
# Exhaustive grid search over the SVD hyper-parameters.
# `algos` maps a running configuration index to the fitted model and `desc`
# maps the same index to a human-readable description of that configuration.
algos = {}
desc = {}
ii = [1, 5, 20, 50, 500]            # values tried for n_factors
ee = [10, 50, 200]                  # values tried for n_epochs
lrs = [0, 0.005, 0.01, 0.1, 1, 5]   # values tried for lr_all
regs = [0, 0.1, 1, 5, 10]           # values tried for reg_all
counter = 0
for i in ii:
    for e in ee:
        for lr in lrs:
            for reg in regs:
                desc[counter] = 'k = {}, e = {}, lr = {}, reg = {}'.format(i, e, lr, reg)
                print(desc[counter])
                algos[counter] = SVD(
                    n_factors=i,
                    n_epochs=e,
                    biased=True,
                    lr_all=lr,
                    reg_all=reg,
                    init_mean=0,
                    init_std_dev=0.01,
                    verbose=False,
                )
                # fit() replaces the older train() call so every model in the
                # notebook is trained through the same entry point as `algo`.
                algos[counter].fit(train_data)
                counter += 1


k = 1, e = 10, lr = 0, reg = 0




k = 1, e = 10, lr = 0, reg = 0.1
k = 1, e = 10, lr = 0, reg = 1
k = 1, e = 10, lr = 0, reg = 5
k = 1, e = 10, lr = 0, reg = 10
k = 1, e = 10, lr = 0.005, reg = 0
k = 1, e = 10, lr = 0.005, reg = 0.1
k = 1, e = 10, lr = 0.005, reg = 1
k = 1, e = 10, lr = 0.005, reg = 5
k = 1, e = 10, lr = 0.005, reg = 10
k = 1, e = 10, lr = 0.01, reg = 0
k = 1, e = 10, lr = 0.01, reg = 0.1
k = 1, e = 10, lr = 0.01, reg = 1
k = 1, e = 10, lr = 0.01, reg = 5
k = 1, e = 10, lr = 0.01, reg = 10
k = 1, e = 10, lr = 0.1, reg = 0
k = 1, e = 10, lr = 0.1, reg = 0.1
k = 1, e = 10, lr = 0.1, reg = 1
k = 1, e = 10, lr = 0.1, reg = 5
k = 1, e = 10, lr = 0.1, reg = 10
k = 1, e = 10, lr = 1, reg = 0
k = 1, e = 10, lr = 1, reg = 0.1
k = 1, e = 10, lr = 1, reg = 1
k = 1, e = 10, lr = 1, reg = 5
k = 1, e = 10, lr = 1, reg = 10
k = 1, e = 10, lr = 5, reg = 0
k = 1, e = 10, lr = 5, reg = 0.1
k = 1, e = 10, lr = 5, reg = 1
k = 1, e = 10, lr = 5, reg = 5
k = 1, e = 10, lr = 5, reg = 10
k = 1, e = 50, lr = 0, reg = 0
k = 1, 

k = 20, e = 200, lr = 0.005, reg = 0
k = 20, e = 200, lr = 0.005, reg = 0.1
k = 20, e = 200, lr = 0.005, reg = 1
k = 20, e = 200, lr = 0.005, reg = 5
k = 20, e = 200, lr = 0.005, reg = 10
k = 20, e = 200, lr = 0.01, reg = 0
k = 20, e = 200, lr = 0.01, reg = 0.1
k = 20, e = 200, lr = 0.01, reg = 1
k = 20, e = 200, lr = 0.01, reg = 5
k = 20, e = 200, lr = 0.01, reg = 10
k = 20, e = 200, lr = 0.1, reg = 0
k = 20, e = 200, lr = 0.1, reg = 0.1
k = 20, e = 200, lr = 0.1, reg = 1
k = 20, e = 200, lr = 0.1, reg = 5
k = 20, e = 200, lr = 0.1, reg = 10
k = 20, e = 200, lr = 1, reg = 0
k = 20, e = 200, lr = 1, reg = 0.1
k = 20, e = 200, lr = 1, reg = 1
k = 20, e = 200, lr = 1, reg = 5
k = 20, e = 200, lr = 1, reg = 10
k = 20, e = 200, lr = 5, reg = 0
k = 20, e = 200, lr = 5, reg = 0.1
k = 20, e = 200, lr = 5, reg = 1
k = 20, e = 200, lr = 5, reg = 5
k = 20, e = 200, lr = 5, reg = 10
k = 50, e = 10, lr = 0, reg = 0
k = 50, e = 10, lr = 0, reg = 0.1
k = 50, e = 10, lr = 0, reg = 1
k = 50, e = 10, l

In [69]:
# Evaluate every model of the grid search on the training and test rating
# lists. All four dicts are keyed by the same configuration index as `algos`.
predictions_train = {}
predictions_test = {}
rmses_train = {}
rmses_test = {}
# Iterate over every trained model. The previous hard-coded range(0, 449)
# stopped at index 448 and silently skipped the last configuration
# (the grid produces indices 0..449).
for i in range(len(algos)):
    predictions_train[i] = algos[i].test(train_data_2)
    predictions_test[i] = algos[i].test(test_data)
    rmses_train[i] = accuracy.rmse(predictions_train[i], verbose=True)
    rmses_test[i] = accuracy.rmse(predictions_test[i], verbose=True)

RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.1185
RMSE: 1.1947
RMSE: 1.1210
RMSE: 1.1955
RMSE: 1.1515
RMSE: 1.2141
RMSE: 1.2185
RMSE: 1.2589
RMSE: 1.2469
RMSE: 1.2764
RMSE: 1.0611
RMSE: 1.1829
RMSE: 1.0643
RMSE: 1.1826
RMSE: 1.1068
RMSE: 1.2027
RMSE: 1.1984
RMSE: 1.2550
RMSE: 1.2362
RMSE: 1.2747
RMSE: 0.7882
RMSE: 1.2799
RMSE: 0.8426
RMSE: 1.2306
RMSE: 0.9900
RMSE: 1.2165
RMSE: 1.1757
RMSE: 1.2606
RMSE: 1.2357
RMSE: 1.2817


  est = self.estimate(iuid, iiid)


RMSE: 1.9096
RMSE: 1.9051
RMSE: 1.9123
RMSE: 1.9065
RMSE: 1.9109
RMSE: 1.9066
RMSE: 1.9118
RMSE: 1.9068
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 0.9727
RMSE: 1.1811
RMSE: 0.9816
RMSE: 1.1787
RMSE: 1.0471
RMSE: 1.1974
RMSE: 1.1787
RMSE: 1.2539
RMSE: 1.2292
RMSE: 1.2744
RMSE: 0.8663
RMSE: 1.2075
RMSE: 0.9177
RMSE: 1.1889
RMSE: 1.0059
RMSE: 1.1998
RMSE: 1.1700
RMSE: 1.2535
RMSE: 1.2273
RMSE: 1.2741
RMSE: 0.7126
RMSE: 1.3583
RMSE: 0.7237
RMSE: 1.2969
RMSE: 0.9813
RMSE: 1.2203
RMSE: 1.1757
RMSE: 1.2606
RMSE: 1.2357
RMSE: 1.2817
RMSE: 1.9121
RMSE: 1.9060
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9090
RMSE: 1.9058
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118

RMSE: 0.4153
RMSE: 1.2094
RMSE: 0.9794
RMSE: 1.2052
RMSE: 1.1686
RMSE: 1.2538
RMSE: 1.2276
RMSE: 1.2744
RMSE: 0.0195
RMSE: 1.2638
RMSE: 0.3001
RMSE: 1.2158
RMSE: 0.9688
RMSE: 1.2089
RMSE: 1.1681
RMSE: 1.2535
RMSE: 1.2272
RMSE: 1.2741
RMSE: 0.0074
RMSE: 1.2616
RMSE: 0.3006
RMSE: 1.2146
RMSE: 0.9807
RMSE: 1.2203
RMSE: 1.1757
RMSE: 1.2606
RMSE: 1.2357
RMSE: 1.2817
RMSE: 1.9117
RMSE: 1.9063
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9094
RMSE: 1.9057
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.1119
RMSE: 1.1948
RMSE: 1.1165
RMSE: 1.1955
RMSE: 1.1502
RMSE: 1.2141
RMSE: 1.2182
RMSE: 1.2589
RMSE: 1.2469
RMSE: 1.2764
RMSE: 1.0397
RMSE: 1.1829
RMSE: 1.0555
RMSE: 1.1827
RMSE: 1.1054
RMSE: 1.2027
RMSE: 1.1982
RMSE: 1.2551
RMSE: 1.2361

  est = self.estimate(iuid, iiid)


RMSE: 1.9106
RMSE: 1.9068
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 1.3075
RMSE: 1.3062
RMSE: 0.7202
RMSE: 1.1822
RMSE: 0.9372
RMSE: 1.1784
RMSE: 1.0460
RMSE: 1.1974
RMSE: 1.1786
RMSE: 1.2539
RMSE: 1.2291
RMSE: 1.2744
RMSE: 0.2890
RMSE: 1.1954
RMSE: 0.6285
RMSE: 1.1862
RMSE: 1.0052
RMSE: 1.1998
RMSE: 1.1700
RMSE: 1.2535
RMSE: 1.2273
RMSE: 1.2741
RMSE: 0.0184
RMSE: 1.2050
RMSE: 0.2977
RMSE: 1.2055
RMSE: 0.9808
RMSE: 1.2203
RMSE: 1.1757
RMSE: 1.2606
RMSE: 1.2357
RMSE: 1.2817
RMSE: 1.9118
RMSE: 1.9063
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9096
RMSE: 1.9057
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.9118
RMSE: 1.9065
RMSE: 1.3075

In [90]:
# Index of the configuration with the lowest test RMSE.
# NOTE(review): the selection is made on the test split itself, so the
# winning configuration's test RMSE is an optimistic estimate.
min_config = min(rmses_test.items(), key=lambda entry: entry[1])[0]

In [91]:
# Show the human-readable description of the selected configuration.
best_description = desc[min_config]
print(best_description)

k = 500, e = 50, lr = 0.005, reg = 0.1
