In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import accuracy
from sklearn import cross_validation as cv

%matplotlib inline

In [None]:
# Read the reviews CSV stored next to the notebook into a DataFrame.
reviews_csv = './toronto_reviews.csv'
df = pd.read_csv(reviews_csv)

In [None]:
# Keep only the (user, item, rating) columns; these are the three columns
# later handed to Surprise.
rating_columns = ['user_id', 'business_id', 'stars']
df2 = df[rating_columns]

In [None]:
# Hold out 40% of the ratings for testing. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition on every run.
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20;
# on newer versions `sklearn.model_selection.train_test_split` is the
# drop-in replacement (same arguments).
train_df, test_df = cv.train_test_split(df2, test_size=0.4, random_state=42)

In [None]:
# Declare the range in which ratings are accepted (1 to 5).
lowest_rating, highest_rating = 1, 5
reader = Reader(rating_scale=(lowest_rating, highest_rating))

In [None]:
# Wrap both splits as Surprise datasets. (Surprise can also read datasets
# directly from disk without going through pandas.)
surprise_columns = ['user_id', 'business_id', 'stars']
train_data = Dataset.load_from_df(train_df[surprise_columns], reader)
test_data = Dataset.load_from_df(test_df[surprise_columns], reader)

In [None]:
# Surprise requires every dataset to be converted explicitly into a training
# set or a test set. Although the test split is never trained on, Surprise
# requires it to be turned into a training set first so that it can then be
# turned into a test set.
# NOTE(review): both names are rebound to the result of build_full_trainset(),
# so re-running this cell without re-running the previous one will likely fail.
train_data = train_data.build_full_trainset()
test_data = test_data.build_full_trainset()

In [None]:
# Finally, build test sets from both splits, since the error is measured on
# each of them. The training split's test-set form is stored under a new name
# (`train_data_2`) because `train_data` itself is still passed to fit() later.
train_data_2 = train_data.build_testset()
test_data = test_data.build_testset()

In [None]:
# `global_mean` of the training set; it is reused below as the constant term
# added when the rating matrix is reconstructed.
mean = train_data.global_mean
print( mean )

In [None]:
# Baseline biased SVD: 5 latent factors, 200 epochs, learning rate 0.005 and
# regularisation switched off (reg_all = 0). Initial values are configured
# with mean 0 and standard deviation 0.01.
algo = SVD(
    n_factors=5,
    n_epochs=200,
    biased=True,
    lr_all=0.005,
    reg_all=0,
    init_mean=0,
    init_std_dev=0.01,
    verbose=False,
)

In [None]:
# Train the model on the training set built above.
algo.fit( train_data )

In [None]:
# Rebuild the dense matrix of predicted ratings from the fitted model:
#     r_hat[u, i] = mean + bu[u] + bi[i] + pu[u] . qi[i]
# The factor matrices and bias vectors are read from the fitted `algo`; they
# were previously referenced as bare names that no cell defines.
pu, qi = algo.pu, algo.qi
bu, bi = algo.bu, algo.bi

# `bu` and `bi` are 1-D, so each is given an explicit axis: user biases vary
# along rows, item biases along columns of the (users x items) product.
# NOTE(review): the result is a dense users x items array; confirm it fits in
# memory for the full review set.
reconstruct = pu.dot(qi.T) + bu[:, np.newaxis] + bi[np.newaxis, :] + mean

In [None]:
# Some reconstructed ratings fall outside the accepted range, so they are
# clamped to the interval [1, 5].
reconstruct = reconstruct.clip(min=1, max=5)

## Evaluación de predicciones

A partir del modelo entrenado, se mide el error (RMSE) del modelo en los conjuntos de entrenamiento y prueba.

In [None]:
# Predictions of the fitted model for the ratings of the training split.
predictions_train = algo.test( train_data_2 )

In [None]:
# Predictions of the fitted model for the held-out test ratings.
predictions_test = algo.test( test_data )

In [None]:
# RMSE of the baseline model on the training split.
accuracy.rmse( predictions_train, verbose = True )

In [None]:
# RMSE of the baseline model on the test split.
accuracy.rmse( predictions_test, verbose = True )

## Selección de hiper-parámetros

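Se entrenan modelos con distintos valores de los hiperparámetros `n_factors` (de 5 a 20) y `reg_all` (0.001, 0.1, 0.5, 1 y 10), y se compara el RMSE obtenido en los conjuntos de entrenamiento y prueba.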

In [None]:
# Fit one SVD model per latent-factor count (5 through 20), keyed by that
# count. All other hyper-parameters are the same as in the baseline model.
algos = {}
for i in range(5, 21):
    algos[i] = SVD( n_factors = i, n_epochs = 200, biased = True, lr_all = 0.005, reg_all = 0, init_mean = 0, init_std_dev = 0.01, verbose = False )
    # Use fit(), as the baseline cell does: train() is the deprecated
    # spelling and is no longer available in Surprise 1.1+.
    algos[i].fit(train_data)


In [None]:
# Score every model of the latent-factor sweep on the training and test
# splits. Predictions and RMSE values are stored per factor count.
predictions_train, predictions_test = {}, {}
rmses_train, rmses_test = {}, {}
for n_factors in range(5, 21):
    model = algos[n_factors]
    predictions_train[n_factors] = model.test(train_data_2)
    predictions_test[n_factors] = model.test(test_data)
    # verbose=True makes Surprise print each RMSE as it is computed
    # (training first, then test).
    rmses_train[n_factors] = accuracy.rmse(predictions_train[n_factors], verbose=True)
    rmses_test[n_factors] = accuracy.rmse(predictions_test[n_factors], verbose=True)

In [None]:
# Training RMSE as a function of the number of latent factors.
# The dict views are materialised as lists because some matplotlib releases
# on Python 3 do not accept dict views as plot data.
plt.plot(list(rmses_train.keys()), list(rmses_train.values()))
plt.xlabel('n_factors')
plt.ylabel('RMSE')
plt.title('Training RMSE vs. number of latent factors');

In [None]:
# Fit one SVD model per regularisation strength, keyed by that strength.
# All other hyper-parameters are the same as in the baseline model.
algosr = {}
for r in [0.001, 0.1, 0.5, 1, 10]:
    # verbose is off, as in the other SVD cells: with 200 epochs per model the
    # per-epoch log would otherwise flood the cell output.
    algosr[r] = SVD( n_factors = 5, n_epochs = 200, biased = True, lr_all = 0.005, reg_all = r, init_mean = 0, init_std_dev = 0.01, verbose = False )
    # Use fit(), as the baseline cell does: train() is the deprecated
    # spelling and is no longer available in Surprise 1.1+.
    algosr[r].fit(train_data)


In [None]:
# Score every model of the regularisation sweep on the training and test
# splits. Predictions and RMSE values are stored per reg_all value.
predictions_train, predictions_test = {}, {}
rmses_train, rmses_test = {}, {}
for reg in [0.001, 0.1, 0.5, 1, 10]:
    model = algosr[reg]
    predictions_train[reg] = model.test(train_data_2)
    predictions_test[reg] = model.test(test_data)
    # verbose=True makes Surprise print each RMSE as it is computed
    # (training first, then test).
    rmses_train[reg] = accuracy.rmse(predictions_train[reg], verbose=True)
    rmses_test[reg] = accuracy.rmse(predictions_test[reg], verbose=True)

In [None]:
# Training RMSE as a function of the regularisation strength.
# The dict views are materialised as lists because some matplotlib releases
# on Python 3 do not accept dict views as plot data.
plt.plot(list(rmses_train.keys()), list(rmses_train.values()))
plt.xlabel('reg_all')
plt.ylabel('RMSE')
plt.title('Training RMSE vs. regularisation (reg_all)');




In [None]:
# Test RMSE as a function of the regularisation strength.
# The dict views are materialised as lists because some matplotlib releases
# on Python 3 do not accept dict views as plot data.
plt.plot(list(rmses_test.keys()), list(rmses_test.values()))
plt.xlabel('reg_all')
plt.ylabel('RMSE')
plt.title('Test RMSE vs. regularisation (reg_all)');