# Sistema usuario-usuario

In [11]:
import pandas as pd, numpy as np, time, os
from IPython.display import display
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, SVD, NormalPredictor
import pickle
from surprise.model_selection import cross_validate, train_test_split, KFold
from surprise.prediction_algorithms.knns import *
from surprise.accuracy import rmse, mae
from collections import defaultdict


In [9]:
def cargar_archivo(archivo, pickle, columnas, fechas, sep = '\t'):

    try:
        t1 = time.clock()
        print('Cargando el pickle (si existe)...')
        tabla = pd.read_pickle(pickle)
        t2 = time.clock()
        print('Leer la tabla DATASET toma {:.2f} segundos'.format(t2 - t1))

    except FileNotFoundError:
        
        print('El Pickle NO existe. Creando el Pickle...')
        t1 = time.clock()
        tabla = pd.read_csv(archivo, sep = sep, names = columnas)
        t2 = time.clock()

        print('Leer la tabla toma {:.2f} segundos'.format(t2 - t1))

        tabla.to_pickle(pickle)

    return tabla

def cargar_dataset():
    fechas = ['TIMESTAMP']
    archivo = os.path.join('.', 'toronto_reviews_plus90.csv')
    pickle = './dataset_reviews.pickle'
    columnas = ['review', 'stars', 'user', 'business', 'useful', 'cool', 'funny']
    return cargar_archivo(archivo, pickle, columnas, fechas)

Cargue del dataset (cuando no existe el _Pickle_, toma cerca de tres minutos; cuando sí existe, toma cerca de 12 segundos):

In [10]:
dataset = cargar_dataset()

./toronto_reviews_plus90.csv
Cargando el pickle (si existe)...
El Pickle NO existe. Creando el Pickle...
Leer la tabla toma 0.11 segundos


In [15]:
try:

    t1 = time.clock()
    print('Cargando el pickle (si existe)...')
    dataset_2 = pickle.load(open('./toronto_reviews_array.pickle', 'rb'))
    t2 = time.clock()
    print('Leer la tabla DATASET toma {:.2f} segundos'.format(t2 - t1))

except FileNotFoundError:

    print('El Pickle NO existe. Creando el Pickle...')
    t1 = time.clock()
    dataset_2 = dataset[['user', 'business', 'stars']].pivot_table(index = 'user', columns = 'business', aggfunc = np.sum, fill_value = 0)
    t2 = time.clock()

    print('Leer la tabla toma {:.2f} segundos'.format(t2 - t1))

    pickle.dump(dataset_2, open('./toronto_reviews_array.pickle', 'wb'))

#dataset[['USERID', 'ARTIST_NAME', 'VALOR']].pivot_table(index = 'USERID', columns = 'ARTIST_NAME', aggfunc = lambda x: len(x.unique()), fill_value = 0)
display(dataset_2)

Cargando el pickle (si existe)...
Leer la tabla DATASET toma 0.05 segundos


Unnamed: 0_level_0,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars
business,--DaPTJW3-tB1vP-PfdTEg,--SrzpvFLwP_YFwB_Cetow,-0CCHBui57tZ_1y_14X-5Q,-0DwB6Swi349EKfbBAOF7A,-0M3o2uWBnQZwd3hmfEwuw,-0NhdsDJsdarxyDPR523ZQ,-0RRiWDtfnS16AKCtfvBZg,-0aOudcaAyac0VJbMX-L1g,-113IAvSQ4Nn_Jk7OrrPkg,-2EAyppKR_2xuLyvJZEbRQ,...,zy-JsMCeTlY71akNeBuLvg,zy5UZvaG4iOLSbxuophtTQ,zyEnTgWFUydT-Qz_3UK-IA,zy_NHTqtfSrfTGGPoqy4Mw,zyw5DjrRks7a8OhmBsgCQQ,zz-f4Xrs1OGOhybeQaYgFQ,zz3CqZhNx2rQ_Yp6zHze-A,zzUj3ej4vm_DtvRxNvWDEw,zze6IysT7bJFS8gvi6fZ2A,zzf3RkMI1Y2E1QaZqeU8yA
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
--Qh8yKWAvIP4V4K8ZPfHA,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
-KVxkJDSTjtPGsamMDG92Q,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-RCD8F7qbsLfzT3k1HtMxg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-_2h2cJlBOWAYrfplMU-Cg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-d2daWmftYumOaYpbD5D8Q,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-od707p4FHGul0gte29AoQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-pXs08gJq9ExIk275YLvPg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-qr-SDo-IHa5-YISjtmesQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-tdsrQ3QIkGmmP2n6-DTeg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02AOwJB1g9ETIDGZznyuAw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Convierto el dataset a un dataset de surprise.

In [16]:
surprise_ds = dataset_2.stack()
surprise_ds.reset_index(level = [0, 1], inplace = True)
surprise_ds = surprise_ds[surprise_ds.stars != 0]

In [17]:
kf = KFold(n_splits = 2)

In [18]:
reader = Reader(rating_scale = (1, 5))

In [23]:
data = Dataset.load_from_df(surprise_ds[['user', 'business', 'stars']], reader)

for trainset, testset in kf.split(data):
    sim_options = {'name': 'msd', 'user_based': False, 'min_k': 10, 'k': 15}
    algo = KNNBasic(sim_options = sim_options)
    algo.fit(trainset)
    
    test_set = testset

    predictions = algo.test(testset)
    rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0469
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0479


In [62]:
data = Dataset.load_from_df(surprise_ds[['user', 'business', 'stars']], reader)
train_data = data.build_full_trainset()
sim_options = {'name': 'msd', 'user_based': False, 'min_k': 10, 'k': 15}
algo = KNNBasic(sim_options = sim_options)
algo.fit(train_data)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x12d35dc50>

In [63]:
test = [x for x in train_data.build_testset() if x[0] == "--Qh8yKWAvIP4V4K8ZPfHA"]

In [64]:
len(test)

383

In [65]:
predictions = algo.test(test)

In [66]:
def get_top_n(predictions, n = 10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key = lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n = 10)

top_n = dict(top_n)
top_n['--Qh8yKWAvIP4V4K8ZPfHA']

[('HPr3UUlwzfmXpwhUSg6_-g', 4.620689655172414),
 ('Jn17OhYIWrSlXHI3bpZsLQ', 4.617142857142857),
 ('GdiKn-JExlIHCQUDxQ9xgQ', 4.613636363636363),
 ('-PLXDic2ALf15NLKHFHx3A', 4.605714285714285),
 ('fubrLSuFj9IHh_Jk9sTI8w', 4.605714285714285),
 ('FEE0dl5XqLS0RHKijqt-Lw', 4.601156069364162),
 ('YY23OABxBKOryNqtmMOKjA', 4.592178770949721),
 ('M30Ndb5zaFyU0m-qVLngiw', 4.586206896551723),
 ('uMOOP1DtKAAAiAtkyidDWA', 4.584527220630371),
 ('PgPw-6YU3Hny5nQx2qkzUg', 4.571108622620381)]