# Descripción paso-a-paso del Taller 1
El archivo donde se encuentra el dataset está en __[este enlace](http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-1K.html)__.<br />
Importo las librerías que se usarán en el taller.

In [53]:
import pandas as pd, numpy as np, time, os
from IPython.display import display
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, SVD, NormalPredictor
import joblib
from surprise.model_selection import cross_validate, train_test_split, KFold
from surprise.prediction_algorithms.knns import *
from surprise.accuracy import rmse, mae
from collections import defaultdict


En la primera ejecución debo cargar el dataset _**userid-timestamp-artid-artname-traid-traname.tsv**_ original. Este proceso es lento; sin embargo puedo generar un _Pickle_ al terminar de cargarlo para que en las ejecuciones posteriores pueda usar el _Pickle_ con el _Dataframe_ serializado en lugar del archivo original. Empíricamente esto agiliza más de 10 veces el cargue!

In [38]:
def cargar_archivo(archivo, pickle, columnas, fechas, sep = '\t'):

    try:

        t1 = time.clock()
        print('Cargando el pickle (si existe)...')
        tabla = pd.read_pickle(pickle)
        t2 = time.clock()
        print('Leer la tabla DATASET toma {:.2f} segundos'.format(t2 - t1))

    except FileNotFoundError:
        
        print('El Pickle NO existe. Creando el Pickle...')
        t1 = time.clock()
        tabla = pd.read_csv(archivo, sep = sep, names = columnas)
        t2 = time.clock()

        print('Leer la tabla toma {:.2f} segundos'.format(t2 - t1))

        tabla.to_pickle(pickle)

    return tabla

def cargar_dataset():
    fechas = ['TIMESTAMP']
    archivo = os.path.join('toronto', 'toronto_reviews_plus90.csv')
    print(archivo)
    pickle = 'toronto/dataset_reviews.pickle'
    columnas = ['review', 'stars', 'user', 'business', 'useful', 'cool', 'funny']
    return cargar_archivo(archivo, pickle, columnas, fechas)

Cargue del dataset (cuando no existe el _Pickle_, toma cerca de tres minutos; cuando sí existe, toma cerca de 12 segundos):

In [39]:
dataset = cargar_dataset()

toronto/toronto_reviews_plus90.csv
Cargando el pickle (si existe)...
El Pickle NO existe. Creando el Pickle...
Leer la tabla toma 0.13 segundos


Muestro las primeras 10 columnas del _Dataframe_ para hacerme una mejor idea de los datos

In [40]:
display(dataset.head(10))

Unnamed: 0,review,stars,user,business,useful,cool,funny
0,--4LgB2HgH5Cyf-I2NpWYg,4,oLLYmkej2TMHaTbudug5eA,isYj0rlYyUv3bUd4ZChxlQ,0,0,0
1,--7snIGa_xrKdbO8a-zSUw,4,tWBLn4k1M7PLBtAtwAg73g,F69E-AY9QDea9YPaxokILg,0,0,0
2,--Aja3FOqdak2zxSp4BAxg,3,KBh4r16e9Grc1HI9pG4wTg,CztDemHy1q1f2R3VV9jmvA,1,1,1
3,--caLpnr1SpZl1z7DANgJw,5,YBT3EKUNN4IP8m4x7sGu1g,TwVtAAGoD_AEGzjH--DAyw,0,0,1
4,--EswXhYO3vRWmWBhYakNA,4,hqmnMdDS-Opjp3BfBJA8qA,SYGoFUdLLPyh1lZ9_nwOUA,1,1,0
5,--H9_oEMAGUn-8hCMD9suw,2,FWDwuX8k8dwUICg8EG9dUQ,3qTSOuJ-ZyU1L7o62KHSIg,2,0,0
6,--jSYbaYQ7I21lqlO2SzuQ,4,U5YQX_vMl_xQy8EQDqlNQQ,WN0hOGQhLTJu5uGIv75_7w,2,2,0
7,--mvsRDRC6_Y4y4TKIgVMw,5,69xsJFjiOAd8Ld7WWk_tEw,BYDs_o6U4CLxJec8k1Q90Q,0,0,0
8,--qNZFpsjjRk6PUhReQqCQ,4,52MlnjkvSLLqKKSRrbrH0w,0FH5h58Qls1VIuXh5QORlg,0,0,0
9,--Sb1Lcw0aIVy-9reQ8Ovw,4,buv4fFmsJhDfFWvOgUWxtw,Fl2yDmC0B0TTVsNixxVXfA,1,0,0


_"Describo"_ el dataset de eventos

In [41]:
dataset.describe()

Unnamed: 0,stars,useful,cool,funny
count,75139.0,75139.0,75139.0,75139.0
mean,3.554599,2.30743,1.226939,1.011805
std,1.037383,3.789674,2.454386,2.633278
min,1.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0
50%,4.0,1.0,0.0,0.0
75%,4.0,3.0,2.0,1.0
max,5.0,169.0,62.0,193.0


In [42]:
try:

    t1 = time.clock()
    print('Cargando el pickle (si existe)...')
    dataset_2 = joblib.load('toronto/toronto_reviews_array.pickle')
    t2 = time.clock()
    print('Leer la tabla DATASET toma {:.2f} segundos'.format(t2 - t1))

except FileNotFoundError:

    print('El Pickle NO existe. Creando el Pickle...')
    t1 = time.clock()
    dataset_2 = dataset[['user', 'business', 'stars']].pivot_table(index = 'user', columns = 'business', aggfunc = np.sum, fill_value = 0)
    t2 = time.clock()

    print('Leer la tabla toma {:.2f} segundos'.format(t2 - t1))

    joblib.dump(dataset_2,'toronto/toronto_reviews_array.pickle')

#dataset[['USERID', 'ARTIST_NAME', 'VALOR']].pivot_table(index = 'USERID', columns = 'ARTIST_NAME', aggfunc = lambda x: len(x.unique()), fill_value = 0)
display(dataset_2)

Cargando el pickle (si existe)...
El Pickle NO existe. Creando el Pickle...
Leer la tabla toma 2.65 segundos


Unnamed: 0_level_0,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars
business,--DaPTJW3-tB1vP-PfdTEg,--SrzpvFLwP_YFwB_Cetow,-0CCHBui57tZ_1y_14X-5Q,-0DwB6Swi349EKfbBAOF7A,-0M3o2uWBnQZwd3hmfEwuw,-0NhdsDJsdarxyDPR523ZQ,-0RRiWDtfnS16AKCtfvBZg,-0aOudcaAyac0VJbMX-L1g,-113IAvSQ4Nn_Jk7OrrPkg,-2EAyppKR_2xuLyvJZEbRQ,...,zy-JsMCeTlY71akNeBuLvg,zy5UZvaG4iOLSbxuophtTQ,zyEnTgWFUydT-Qz_3UK-IA,zy_NHTqtfSrfTGGPoqy4Mw,zyw5DjrRks7a8OhmBsgCQQ,zz-f4Xrs1OGOhybeQaYgFQ,zz3CqZhNx2rQ_Yp6zHze-A,zzUj3ej4vm_DtvRxNvWDEw,zze6IysT7bJFS8gvi6fZ2A,zzf3RkMI1Y2E1QaZqeU8yA
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
--Qh8yKWAvIP4V4K8ZPfHA,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
-KVxkJDSTjtPGsamMDG92Q,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-RCD8F7qbsLfzT3k1HtMxg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-_2h2cJlBOWAYrfplMU-Cg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-d2daWmftYumOaYpbD5D8Q,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-od707p4FHGul0gte29AoQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-pXs08gJq9ExIk275YLvPg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-qr-SDo-IHa5-YISjtmesQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-tdsrQ3QIkGmmP2n6-DTeg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02AOwJB1g9ETIDGZznyuAw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Elimino todos los usuarios que han dado menos 10 ratings.

In [43]:
dataset_2.describe()

Unnamed: 0_level_0,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars,stars
business,--DaPTJW3-tB1vP-PfdTEg,--SrzpvFLwP_YFwB_Cetow,-0CCHBui57tZ_1y_14X-5Q,-0DwB6Swi349EKfbBAOF7A,-0M3o2uWBnQZwd3hmfEwuw,-0NhdsDJsdarxyDPR523ZQ,-0RRiWDtfnS16AKCtfvBZg,-0aOudcaAyac0VJbMX-L1g,-113IAvSQ4Nn_Jk7OrrPkg,-2EAyppKR_2xuLyvJZEbRQ,...,zy-JsMCeTlY71akNeBuLvg,zy5UZvaG4iOLSbxuophtTQ,zyEnTgWFUydT-Qz_3UK-IA,zy_NHTqtfSrfTGGPoqy4Mw,zyw5DjrRks7a8OhmBsgCQQ,zz-f4Xrs1OGOhybeQaYgFQ,zz3CqZhNx2rQ_Yp6zHze-A,zzUj3ej4vm_DtvRxNvWDEw,zze6IysT7bJFS8gvi6fZ2A,zzf3RkMI1Y2E1QaZqeU8yA
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,...,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,0.033937,0.090498,0.011312,0.036199,0.011312,0.061086,0.006787,0.006787,0.00905,0.015837,...,0.015837,0.054299,0.067873,0.015837,0.104072,0.033937,0.076923,0.190045,0.045249,0.020362
std,0.357907,0.580054,0.237826,0.379225,0.237826,0.439931,0.142695,0.142695,0.190261,0.237567,...,0.237567,0.443386,0.494789,0.20696,0.593259,0.317626,0.511511,0.805362,0.390055,0.304228
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4.0,5.0,5.0,4.0,5.0,4.0,3.0,3.0,4.0,4.0,...,4.0,5.0,5.0,3.0,5.0,3.0,5.0,5.0,4.0,5.0


Convierto el dataset a un dataset de surprise.

In [44]:
surprise_ds = dataset_2.stack()
surprise_ds.reset_index(level = [0, 1], inplace = True)
surprise_ds = surprise_ds[surprise_ds.stars != 0]

In [45]:
kf = KFold(n_splits = 2)

In [46]:
reader = Reader(rating_scale = (1, 5))

In [51]:
data = Dataset.load_from_df(surprise_ds[['user', 'business', 'stars']], reader)

for trainset, testset in kf.split(data):
    sim_options = {'name': 'msd', 'user_based': False, 'min_k': 10, 'k': 15}
    algo = KNNBasic(sim_options = sim_options)
    algo.fit(trainset)
    
    test_set = trainset.build_anti_testset()

    predictions = algo.test(test_set)
    rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.6128
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.6283


In [54]:
def get_top_n(predictions, n = 10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key = lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n = 10)

#for uid, user_ratings in top_n.items():
#    print(uid, [iid for (iid, _) in user_ratings])

top_n = dict(top_n)
top_n['zsZVg16yjZu5NIiS0ayjrQ']

[('q3AXOAPxwJsSzdUmQHmoYQ', 5),
 ('9148hBKC-ghWiU1uuVywYg', 5),
 ('u2Ro_DI21IsUPBOsdbWDdw', 5),
 ('I6qw6GX9FTMiglyzHqOkYg', 5),
 ('A_eLE3cYQOzUfkVK-k4UeA', 5),
 ('D4qFvxY2hIPrZ_o7bFhhWA', 5),
 ('xur6vu3ApyBU9ccz8_bRdg', 5),
 ('MWTnrW38EvSc652Nbox8mA', 5),
 ('zr1C68_HLmn1bBRu17p65A', 5),
 ('WVfsgZHihypIztOJGVJGFw', 5)]