In [33]:
import os
import numpy as np
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy
import random

# Fix the seeds of both random number generators so results are reproducible
seed = 10
np.random.seed(seed)
random.seed(seed)

In [34]:
# Relative path to the preprocessed (user, artist, rating) CSV that the cells below check and read
file_path = './data/preprocessed_user_item_rating.csv'

In [35]:
# Stop early when the ratings CSV is missing. Only the file's existence is
# verified here; the data itself is read in a later cell.
if os.path.exists(file_path):
  print("El archivo ha sido cargado")
else:
  raise ValueError('El archivo preprocessed_user_item_rating.csv no fue encontrado en el path')

El archivo ha sido cargado


In [36]:
# Read the CSV, replacing its header row (header=0) with explicit column names,
# then keep the three columns in a fixed order.
ratings = (
    pd.read_csv(file_path, sep=',', header=0, names=['userid', 'artist-name', 'rating'])
    .loc[:, ['userid', 'artist-name', 'rating']]
)
ratings

Unnamed: 0,userid,artist-name,rating
0,user_001000,Wilco,5.0
1,user_001000,Radiohead,4.9
2,user_001000,Animal Collective,4.7
3,user_001000,Girl Talk,4.6
4,user_001000,Aesop Rock,4.2
...,...,...,...
896795,user_000001,Steve Reich,0.8
896796,user_000001,Enya,0.8
896797,user_000001,Nitin Sawhney,0.8
896798,user_000001,Reel People,0.8


In [37]:
# Summary statistics of the rating column
ratings['rating'].describe()

count    38872.000000
mean         1.626001
std          0.994192
min          0.800000
25%          0.900000
50%          1.200000
75%          1.900000
max          5.000000
Name: rating, dtype: float64

In [38]:
# Creación del sistema de recomendación

In [39]:
# Ratings are interpreted on a 0-5 scale
reader = Reader(rating_scale=(0, 5))
# Build the Surprise dataset from the (user, item, rating) columns of the dataframe
surprise_dataset = Dataset.load_from_df(
    ratings.loc[:, ['userid', 'artist-name', 'rating']],
    reader=reader,
)

In [40]:
# Hold out 20% of the ratings for evaluation. Passing the notebook seed as
# random_state pins the shuffle, so re-running this cell always yields the same
# split instead of depending on the current state of the global NumPy RNG.
trainset, testset = train_test_split(surprise_dataset, test_size=.2, random_state=seed)

### Modelo basado en similitud coseno

In [51]:
# Neighbourhood model that uses cosine similarity.
sim_options = {'name': 'cosine',
               'user_based': True  # True => similarities are computed between users (user-user), not between items
               }
# k is the maximum and min_k the minimum number of neighbours aggregated per prediction
algo = KNNBasic(k=30, min_k=5, sim_options=sim_options)

### Modelo basado en índice de Jaccard

### Modelo basado en correlación de Pearson

In [41]:
# Alternative neighbourhood model using the 'pearson_baseline' similarity between users.
# It is stored under its own names so that running the notebook top to bottom does
# not overwrite the cosine `sim_options` / `algo` objects that the recommendation
# cells fit and query (their recorded output reports the cosine similarity matrix).
# To evaluate this model instead, use `algo_pearson` in those cells.
sim_options_pearson = {'name': 'pearson_baseline',
                       'user_based': True,  # similarities are computed between users
                       'shrinkage': 0  # no shrinkage
                       }
algo_pearson = KNNBasic(sim_options=sim_options_pearson)

### Recomendaciones

In [52]:
# Train the model on the training split, then predict every held-out rating
algo.fit(trainset)
predictions = algo.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [53]:
# Keep the test-set predictions for user_001000 that the model was able to compute
user_predictions = [
    pred for pred in predictions
    if pred.uid == 'user_001000' and not pred.details['was_impossible']
]

In [54]:
# Rank from highest to lowest estimated rating
user_predictions = sorted(user_predictions, key=lambda pred: pred.est, reverse=True)

In [55]:
# Keep only the ten best-ranked predictions
user_predictions = user_predictions[:10]

In [56]:
# Turn the ranked predictions into a two-column dataframe (artist, estimated rating)
labels = ['artist', 'estimation']
df_predictions = pd.DataFrame.from_records(
    [(pred.iid, pred.est) for pred in user_predictions],
    columns=labels,
)

### Recomendaciones para el usuario user_001000

In [57]:
# Display the top-ranked artists and their estimated ratings for user_001000
df_predictions

Unnamed: 0,artist,estimation
0,Animal Collective,2.152346
1,Aesop Rock,1.995802
2,Common,1.972778
3,Regina Spektor,1.879531
4,Devendra Banhart,1.826429
5,Iron & Wine,1.778812
6,Koop,1.566184
7,Fleet Foxes,1.522369
8,Arcade Fire,1.335965


In [58]:
# RMSE over `user_predictions` only, i.e. the (at most 10) highest-estimate test
# predictions kept for user_001000 — not over the full test set.
# NOTE(review): confirm that a per-user, top-N RMSE is what is intended here; the
# error of the model as a whole would be accuracy.rmse(predictions).
accuracy.rmse( user_predictions, verbose = True )

RMSE: 1.4552


1.4551636312360021

In [59]:
# Ask the fitted model for a single (user, artist) estimate
algo.predict(uid='user_001000', iid='Common')

Prediction(uid='user_001000', iid='Common', r_ui=None, est=1.9727783376215695, details={'actual_k': 11, 'was_impossible': False})

In [61]:
# Look up the rating that user_001000 actually gave to Arcade Fire
ratings_100 = ratings.loc[ratings['userid'] == 'user_001000']
ratings_100.loc[ratings_100['artist-name'] == 'Arcade Fire']

Unnamed: 0,userid,artist-name,rating
19,user_001000,Arcade Fire,2.1
