In [9]:
import os
import numpy as np
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy
import random

# Fix the seeds of both random number generators (NumPy's global one and
# Python's `random`) so that results can be reproduced between runs.
seed = 10
np.random.seed(seed)
random.seed(seed)

In [5]:
# Candidate input files: the full preprocessed ratings table and a shuffled
# first-half subset of it (judging by the file names — TODO confirm how the
# subset was produced).
file_path_big = './data/preprocessed_user_item_rating.csv'
file_path_first_half = './data/preprocessed_user_item_rating_shuffled_first_half.csv'
# The rest of the notebook reads `file_path`; here it points to the subset.
file_path = file_path_first_half

In [10]:
# Fail fast if the selected ratings file is missing, before pandas tries to
# read it. `isfile` (rather than `exists`) also rejects a directory that
# happens to have the same name.
if not os.path.isfile(file_path):
  # Report the path that was actually checked: `file_path` may point to any of
  # the candidate files, not only preprocessed_user_item_rating.csv.
  raise FileNotFoundError(f'El archivo {file_path} no fue encontrado en el path')
else:
  print("El archivo ha sido cargado")

El archivo ha sido cargado


In [11]:
# Read the ratings file. `header=0` together with `names` discards the header
# row stored in the file and applies the column names given here; the first
# (unnamed) column is the index that was written out with the CSV.
# Only the three columns needed by the recommender are kept.
ratings = pd.read_csv(
    file_path,
    sep=',',
    header=0,
    names=['', 'userid', 'artist-name', 'count', 'max', 'rating'],
)[['userid', 'artist-name', 'rating']]
ratings

Unnamed: 0,userid,artist-name,rating
0,user_000872,Kt Tunstall,0.0
1,user_000327,Interpol,0.8
2,user_000706,The Police,0.0
3,user_000807,Shankar-Ehsaan-Loy,0.0
4,user_000223,Timo Räisänen,0.0
...,...,...,...
894494,user_000172,Ocean'S Eleven Soundtrack,0.0
894495,user_000655,Pat Benatar,0.1
894496,user_000983,The Band,0.0
894497,user_000298,Outkast,0.0


In [12]:
# Summary statistics of the rating column (count, mean, quartiles, extremes).
ratings['rating'].describe()

count    894499.000000
mean          0.131956
std           0.389017
min           0.000000
25%           0.000000
50%           0.000000
75%           0.100000
max           5.000000
Name: rating, dtype: float64

In [13]:
# Creación del sistema de recomendación

In [14]:
# The reader tells Surprise the bounds of the rating scale (0 to 5 here).
reader = Reader(rating_scale=(0, 5))
# Build the Surprise dataset from the dataframe; the columns are passed in the
# order user, item, rating.
surprise_dataset = Dataset.load_from_df(
    ratings.loc[:, ['userid', 'artist-name', 'rating']],
    reader,
)

In [15]:
# Hold out 20% of the ratings for testing. Passing `random_state` explicitly
# makes the split depend only on `seed`, not on how much of the global NumPy
# RNG has been consumed, so re-running this cell yields the same split.
trainset, testset = train_test_split(surprise_dataset, test_size=.2, random_state=seed)

### Modelo basado en distancias coseno

In [16]:
# Similarity configuration for the KNN model: cosine similarity computed
# between users.
sim_options = {'name': 'cosine',
               'user_based': True  # user-user similarity (False would give item-item)
               }
# Basic KNN model with k=10 and min_k=5 as the neighbour-count settings.
algo = KNNBasic(k=10, min_k=5, sim_options=sim_options)

### Modelo basado en índice de Jaccard

### Modelo basado en correlación de Pearson

In [41]:
# Alternative model: Pearson-baseline similarity between users.
# It is stored under its own names so that running the notebook from top to
# bottom does not replace the cosine `sim_options`/`algo` defined earlier,
# which is the model the recorded outputs below were produced with.
# To evaluate this model instead, fit `algo_pearson` in place of `algo`.
sim_options_pearson = {'name': 'pearson_baseline',
                       'user_based': True,
                       'shrinkage': 0  # no shrinkage
                       }
algo_pearson = KNNBasic(sim_options=sim_options_pearson)

### Recomendaciones

In [17]:
# Train the model on the training split, then predict every rating of the
# held-out split.
algo.fit(trainset)
predictions = algo.test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [18]:
# Keep only the test-set predictions made for user user_001000.
user_predictions = [pred for pred in predictions if pred.uid == 'user_001000']

In [19]:
# Order the predictions from highest to lowest estimated rating.
user_predictions = sorted(user_predictions, key=lambda pred: pred.est, reverse=True)

In [20]:
# Keep the ten best-ranked predictions.
user_predictions = user_predictions[:10]

In [21]:
# Turn the selected predictions into a dataframe with one row per artist:
# the item id (artist name) and its estimated rating.
labels = ['artist', 'estimation']
df_predictions = pd.DataFrame.from_records(
    [(pred.iid, pred.est) for pred in user_predictions],
    columns=labels,
)

### Recomendaciones para el usuario user_001000

In [22]:
# Display the recommendation table (artist and estimated rating).
df_predictions

Unnamed: 0,artist,estimation
0,Kings Of Leon,1.241916
1,Belle And Sebastian,1.06658
2,Bloc Party,0.870899
3,Elliott Smith,0.770619
4,Animal Collective,0.709156
5,Friendly Fires,0.706049
6,The Shins,0.672026
7,Andrew Bird,0.619298
8,Arcade Fire,0.572819
9,Pavement,0.570884


In [23]:
# RMSE over `user_predictions`, i.e. only the ten highest-estimated predictions
# kept for user_001000 in the previous cells.
# NOTE(review): this is not the model's RMSE on the whole test set; if that is
# what is wanted, `predictions` should be passed instead — confirm intent.
accuracy.rmse( user_predictions, verbose = True )

RMSE: 1.8350


1.8350220682328273

In [24]:
# Estimate the rating user_001000 would give to the artist 'John Maus'.
algo.predict(uid='user_001000', iid='John Maus')

Prediction(uid='user_001000', iid='John Maus', r_ui=None, est=0.24389435333394394, details={'actual_k': 8, 'was_impossible': False})

In [25]:
# All ratings recorded for user_001000 ...
ratings_100 = ratings.loc[ratings['userid'] == 'user_001000']
# ... and, among them, the row for the artist 'Spoon'.
ratings_100.loc[ratings_100['artist-name'] == 'Spoon']

Unnamed: 0,userid,artist-name,rating
612667,user_001000,Spoon,1.1
