In [1]:
import os
import numpy as np
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy
import random
import pickle

#Para garantizar reproducibilidad en resultados
seed = 10
random.seed(seed)
np.random.seed(seed)

In [2]:
file_path = './data/preprocessed_user_item_rating.csv'

In [3]:
if not (os.path.exists(file_path)):
  raise ValueError('El archivo preprocessed_user_item_rating.csv no fue encontrado en el path')
else:
  print("El archivo ha sido cargado")

El archivo ha sido cargado


In [4]:
ratings=pd.read_csv(file_path, sep = ',', header=0, names = [ 'userid', 'artist-name', 'rating' ] )
ratings = ratings.loc[:,['userid', 'artist-name','rating']]
ratings

Unnamed: 0,userid,artist-name,rating
0,user_001000,Wilco,5.0
1,user_001000,Radiohead,4.9
2,user_001000,Animal Collective,4.7
3,user_001000,Girl Talk,4.6
4,user_001000,Aesop Rock,4.2
...,...,...,...
896880,user_000001,Jamie Lidell,0.2
896881,user_000001,Nick Holmes,0.2
896882,user_000001,Nuyorican Soul,0.2
896883,user_000001,The Birthday,0.2


In [5]:
ratings.rating.describe()

count    167494.000000
mean          0.635761
std           0.737520
min           0.200000
25%           0.200000
50%           0.400000
75%           0.700000
max           5.000000
Name: rating, dtype: float64

# Creacion sistema de recomendacion

In [8]:
reader = Reader( rating_scale = ( 0, 5 ) )
#Se crea el dataset a partir del dataframe
surprise_dataset = Dataset.load_from_df( ratings[ [ 'userid', 'artist-name', 'rating' ] ], reader )

In [9]:
trainset, testset=  train_test_split(surprise_dataset, test_size=.2)

### Modelo basado en distancias coseno

In [100]:
sim_options = {'name': 'cosine',
               'user_based': True  # calcule similitud item-item
               }
algo = KNNBasic(k=50, min_k=10, sim_options=sim_options)

In [101]:
predictions = algo.fit(trainset).test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [102]:
pickle.dump( predictions, open( "./Data/predictions_uu_cosine.p", "wb" ) )

### Modelo basado en índice de Jaccard

In [18]:
ratings_jaccard=ratings.groupby('userid')['artist-name'].apply(lambda x: ','.join(x)).tolist()

In [19]:
ListaL=[]
for x in ratings_jaccard:
    lista=[]
    for y in ratings_jaccard:
        if x is not y:
            z = len(set(x).intersection(y)) / len(set(x).union(y))
            lista.append(z)
    ListaL.append(lista)
    
jaccard_df = pd.DataFrame(ListaL)
jaccard_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,981,982,983,984,985,986,987,988,989,990
0,0.522523,0.592233,0.500000,0.578431,0.407895,0.513274,0.163265,0.491071,0.564356,0.570093,...,0.464000,0.490385,0.504505,0.568627,0.540000,0.495050,0.397959,0.560748,0.462810,0.559633
1,0.522523,0.756410,0.771084,0.763158,0.460938,0.714286,0.225352,0.627907,0.746667,0.762500,...,0.659574,0.542169,0.772152,0.773333,0.763889,0.698630,0.549296,0.818182,0.630435,0.765432
2,0.592233,0.756410,0.670588,0.816901,0.491803,0.716049,0.242424,0.646341,0.852941,0.789474,...,0.606383,0.640000,0.776316,0.882353,0.794118,0.776119,0.590909,0.800000,0.666667,0.769231
3,0.500000,0.771084,0.670588,0.716049,0.443609,0.674157,0.210526,0.593407,0.638554,0.697674,...,0.594059,0.528736,0.705882,0.703704,0.714286,0.675325,0.513158,0.726190,0.614583,0.701149
4,0.578431,0.763158,0.816901,0.716049,0.479339,0.743590,0.253968,0.650000,0.782609,0.727273,...,0.591398,0.666667,0.692308,0.838235,0.803030,0.757576,0.619048,0.760000,0.632184,0.753247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,0.495050,0.698630,0.776119,0.675325,0.757576,0.432203,0.657895,0.277778,0.626667,0.712121,...,0.571429,0.586207,0.641791,0.671233,0.796875,0.816667,0.703704,0.718310,0.609756,0.689189
988,0.397959,0.549296,0.590909,0.513158,0.619048,0.336207,0.534247,0.309524,0.542857,0.622951,...,0.433333,0.458824,0.627119,0.565217,0.629032,0.696429,0.703704,0.565217,0.475000,0.541667
989,0.560748,0.818182,0.800000,0.726190,0.760000,0.480000,0.651163,0.231884,0.623529,0.791667,...,0.673684,0.621053,0.615385,0.840000,0.723684,0.736111,0.718310,0.565217,0.608696,0.762500
990,0.462810,0.630435,0.666667,0.614583,0.632184,0.392857,0.617021,0.202532,0.557895,0.616279,...,0.564815,0.547170,0.545455,0.608696,0.678571,0.607143,0.609756,0.475000,0.608696,0.589474


### Modelo basado en correlación de Pearson

In [90]:
sim_options = {'name': 'pearson_baseline',
               'user_based': True,
               'shrinkage': 0  # no shrinkage
               }
algo = KNNBasic(sim_options=sim_options)

In [91]:
predictions = algo.fit(trainset).test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [92]:
pickle.dump( predictions, open( "./Data/predictions_uu_pearson.p", "wb" ) )

### Recomendaciones

In [93]:
pickle_predictions = pickle.load( open( "./Data/predictions_uu_pearson.p", "rb" ) )

In [94]:
#Predicciones para usuario user_001000
user_predictions=list(filter(lambda x: x[0]=='user_001000' and x[4]['was_impossible']==False,pickle_predictions))

In [95]:
#Ordenamos de mayor a menor estimación de relevancia
user_predictions.sort(key=lambda x : x.est, reverse=True)

In [96]:
#tomamos las 10 primeras predicciones
user_predictions=user_predictions[0:10]

In [97]:
#Se convierte a dataframe
labels = ['artist', 'estimation']
df_predictions = pd.DataFrame.from_records(list(map(lambda x: (x.iid, x.est) , user_predictions)), columns=labels)

### Recomendaciones usuario 001000

In [98]:
df_predictions

Unnamed: 0,artist,estimation
0,Crystal Castles,1.290774
1,Belle And Sebastian,1.191414
2,Cut Copy,1.119432
3,Bloc Party,1.053604
4,Of Montreal,0.965159
5,Massive Attack,0.935226
6,The Decemberists,0.927302
7,Ben Folds Five,0.916262
8,Morcheeba,0.86992
9,Final Fantasy,0.866152


In [99]:
accuracy.rmse( user_predictions, verbose = True )

RMSE: 0.4596


0.45959910359602174

In [31]:
algo.predict('user_001000','Kenny Burrell')

Prediction(uid='user_001000', iid='Kenny Burrell', r_ui=None, est=1.6393711339908208, details={'actual_k': 5, 'was_impossible': False})

In [27]:
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x102b81e20>

In [41]:
user_inner_id = algo.trainset.to_inner_uid('user_001000')

In [49]:
# Retrieve inner ids of the nearest neighbors of Item.
user_neighbors = algo.get_neighbors(user_inner_id, k=10)
user_neighbors

[106, 15, 16, 28, 57, 90, 97, 151, 172, 223]

In [51]:
neighbors = (algo.trainset.to_raw_uid(rid)
                       for rid in user_neighbors)
for neighbor in neighbors:
    print(neighbor)

user_000947
user_000859
user_000586
user_000153
user_000651
user_000097
user_000425
user_000379
user_000468
user_000718
