In [1]:
import os
import numpy as np
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy
import random
import pickle

#Para garantizar reproducibilidad en resultados
seed = 10
random.seed(seed)
np.random.seed(seed)

In [2]:
file_path = './Data/preprocessed_user_item_rating.csv'

In [3]:
if not (os.path.exists(file_path)):
  raise ValueError('El archivo preprocessed_user_item_rating.csv no fue encontrado en el path')
else:
  print("El archivo ha sido cargado")

El archivo ha sido cargado


In [4]:
ratings=pd.read_csv(file_path, sep = ',', header=0, names = [ 'userid', 'artist-name', 'rating' ] )
ratings = ratings.loc[:,['userid', 'artist-name','rating']]
ratings

Unnamed: 0,userid,artist-name,rating
0,user_001000,Wilco,5.0
1,user_001000,Radiohead,4.9
2,user_001000,Animal Collective,4.7
3,user_001000,Girl Talk,4.6
4,user_001000,Aesop Rock,4.2
...,...,...,...
896880,user_000001,Jamie Lidell,0.2
896881,user_000001,Nick Holmes,0.2
896882,user_000001,Nuyorican Soul,0.2
896883,user_000001,The Birthday,0.2


In [5]:
ratings.rating.describe()

count    167494.000000
mean          0.635761
std           0.737520
min           0.200000
25%           0.200000
50%           0.400000
75%           0.700000
max           5.000000
Name: rating, dtype: float64

# Creacion sistema de recomendacion

In [6]:
reader = Reader( rating_scale = ( 0, 5 ) )
#Se crea el dataset a partir del dataframe
surprise_dataset = Dataset.load_from_df( ratings[ [ 'userid', 'artist-name', 'rating' ] ], reader )

In [7]:
trainset, testset=  train_test_split(surprise_dataset, test_size=.2)

### Modelo basado en distancias coseno

In [8]:
sim_options = {'name': 'cosine',
               'user_based': True  # calcule similitud item-item
               }
algo = KNNBasic(k=50, min_k=10, sim_options=sim_options)

In [9]:
predictions = algo.fit(trainset).test(testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [10]:
pickle.dump( predictions, open( "./Data/predictions_uu_cosine.p", "wb" ) )

### Modelo basado en índice de Jaccard

In [8]:
ratings_jaccard=ratings.groupby('userid')['artist-name'].apply(lambda x: ','.join(x)).tolist()

In [9]:
Lista=[]
for x in ratings_jaccard:
    lista=[]
    cont=0
    for y in ratings_jaccard:
        if x is not y:
            z = len(set(x).intersection(y)) / len(set(x).union(y))
            lista.append(str(z)+str(': ')+str(cont))
            cont+=1
    Lista.append(lista)
    
jaccard_df = pd.DataFrame(Lista)

In [10]:
pickle.dump( Lista, open( "./Data/predictions_uu_jaccard.p", "wb" ) )
pickle_predictions_jaccard = pickle.load( open( "./Data/predictions_uu_jaccard.p", "rb" ) )

In [11]:
all_five_first_prediction_uu_jaccard=[]
for x in pickle_predictions_jaccard:
    a=np.sort(np.array(x))
    all_five_first_prediction_uu_jaccard.append(a[::-1][0:5].tolist())

pd.DataFrame(all_five_first_prediction_uu_jaccard)
all_five_first_prediction_uu_jaccard

[['0.6: 962', '0.6: 849', '0.6: 441', '0.6: 354', '0.6: 117'],
 ['0.9054054054054054: 433', '0.8: 726', '0.8: 486', '0.8: 407', '0.8: 239'],
 ['0.9: 282',
  '0.9142857142857143: 510',
  '0.9027777777777778: 720',
  '0.8: 988',
  '0.8: 98'],
 ['0.7: 957', '0.7: 91', '0.7: 842', '0.7: 79', '0.7: 638'],
 ['0.9117647058823529: 261',
  '0.9090909090909091: 242',
  '0.8: 977',
  '0.8: 900',
  '0.8: 88'],
 ['0.7441860465116279: 50',
  '0.7388059701492538: 587',
  '0.7238805970149254: 690',
  '0.7226890756302521: 581',
  '0.7213114754098361: 257'],
 ['0.8: 671', '0.7: 977', '0.7: 970', '0.7: 335', '0.7: 314'],
 ['0.41935483870967744: 558',
  '0.4074074074074074: 886',
  '0.3: 956',
  '0.3: 879',
  '0.3: 621'],
 ['0.7: 323',
  '0.7: 122',
  '0.725: 772',
  '0.7051282051282052: 568',
  '0.704225352112676: 610'],
 ['0.8: 951', '0.8: 827', '0.8: 820', '0.8: 668', '0.8: 485'],
 ['0.8: 67',
  '0.8533333333333334: 781',
  '0.84: 286',
  '0.8472222222222222: 549',
  '0.8421052631578947: 19'],
 ['0.8: 

Predicciones para el primer usuario:

In [12]:
s=[]
s2=[]
s3=[]
for x in all_five_first_prediction_uu_jaccard[0]:
  aux='user_'
  iTemp=int(x.split(':')[1])
  while iTemp>1:
    iTemp = iTemp/10;
    aux+='0';
  s.append(str(aux+str(x.split(':')[1].strip())))
  s2.append(x.split(':')[1].strip())

print(ratings_jaccard[0].split(','))

for y in s2:
  for z in ratings_jaccard[int(y)].split(','):
    if z not in ratings_jaccard[0].split(','):
     s3.append(z)
s3

['坂本龍一', 'Underworld', 'Cornelius', 'Gilles Peterson', 'Björk', 'Plaid', 'The Cinematic Orchestra', 'Röyksopp', 'Clark', 'Minilogue', 'Suzukiski', 'Towa Tei', 'Radiohead', 'Flying Lotus', 'Masomenos', 'Scsi-9', 'Kuniyuki Takahashi', 'Loco Dice', 'Boards Of Canada', 'London Elektricity', 'Herbert', 'Onra & Quetzal', '矢野顕子', 'Jimpster', 'Zazen Boys', '原田郁子', 'Squarepusher', '高木正勝', 'The Black Dog', '4Hero', "Ian O'Brien", 'Steve Reich', 'Enya', 'Nitin Sawhney', 'Reel People', 'Richie Hawtin', 'Agustin Pereyra Lucena', 'Rasmus Faber', 'Milosh', 'Calm', 'Jazzanova', 'Janet Jackson', 'Third World Love', 'Bugge Wesseltoft', 'Woodblue', 'Zeep', 'Madlib', 'Recloose', 'Fleet Foxes', 'Afta-1', 'The Matthew Herbert Big Band', 'Jazzanova Feat. Paul Randolph', 'Pat Metheny Group', 'Hudson Mohawke', 'Ben Westbeech', 'I Am Robot And Proud', 'Plaid & Bob Jaroc', '大樹', 'Part Time Heroes', 'Designed People', 'Burial', 'A Hundred Birds', 'The Ananda Project', 'Lism', 'Shugo Tokumaru', 'Rasmus Faber Feat.

['The Beatles',
 'Alkaline Trio',
 'Johnny Cash',
 'Queen',
 'U2',
 'Ac/Dc',
 'Nofx',
 'Red Hot Chili Peppers',
 'Elvis Presley',
 'Massive Attack',
 'Bad Religion',
 'Bob Dylan',
 'R.E.M.',
 'A Perfect Circle',
 'Inxs',
 'Muse',
 'Ben Harper',
 'Dave Dobbyn',
 'Arctic Monkeys',
 'Green Day',
 'The Smashing Pumpkins',
 'Midnight Oil',
 'Counting Crows',
 'Foo Fighters',
 'Modest Mouse',
 'Powderfinger',
 'Oasis',
 'The White Stripes',
 'Pink Floyd',
 'Deftones',
 'Nirvana',
 'Lenny Kravitz',
 'Motörhead',
 'Alex Niedt',
 'Djpretzel',
 'Butthole Surfers',
 'Supergrass',
 'System Of A Down',
 'Elton John',
 'Metallica',
 'Morrissey',
 'Sting',
 'Norah Jones',
 'They Might Be Giants',
 'The Alan Parsons Project',
 'The Rolling Stones',
 'The Chemical Brothers',
 'Michael Jackson',
 'Eric Clapton',
 'Ryan Adams',
 'The All Star Band',
 'The Smiths',
 'Billy Joel',
 'David Bowie',
 'Death Cab For Cutie',
 'The All-American Rejects',
 'Belle And Sebastian',
 'My Chemical Romance',
 'Fleetwoo

In [13]:
recom=pd.DataFrame(s3, columns=['artist-name']).merge(ratings[['artist-name','rating']], how='left', left_on='artist-name', right_on='artist-name').drop_duplicates()
recom=recom.groupby('artist-name')['rating'].agg({'mean'}).sort_values(by='mean', ascending=False)
recom.head(15)

Unnamed: 0_level_0,mean
artist-name,Unnamed: 1_level_1
Nine Inch Nails,2.433333
The Beatles,2.418182
The Cure,2.257895
Pink Floyd,2.252632
Interpol,2.228571
David Bowie,2.223684
Elliott Smith,2.215
Death Cab For Cutie,2.189744
Air,2.189189
Oasis,2.174286


### Modelo basado en correlación de Pearson

In [0]:
sim_options = {'name': 'pearson_baseline',
               'user_based': True,
               'shrinkage': 0  # no shrinkage
               }
algo = KNNBasic(sim_options=sim_options)

In [61]:
predictions = algo.fit(trainset).test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [62]:
pickle.dump( predictions, open( "./Data/predictions_uu_pearson.p", "wb" ) )

FileNotFoundError: ignored

### Recomendaciones

In [0]:
pickle_predictions = pickle.load( open( "./Data/predictions_uu_pearson.p", "rb" ) )

In [0]:
#Predicciones para usuario user_001000
user_predictions=list(filter(lambda x: x[0]=='user_001000' and x[4]['was_impossible']==False,pickle_predictions))

In [0]:
#Ordenamos de mayor a menor estimación de relevancia
user_predictions.sort(key=lambda x : x.est, reverse=True)

In [0]:
#tomamos las 10 primeras predicciones
user_predictions=user_predictions[0:10]

In [0]:
#Se convierte a dataframe
labels = ['artist', 'estimation']
df_predictions = pd.DataFrame.from_records(list(map(lambda x: (x.iid, x.est) , user_predictions)), columns=labels)

### Recomendaciones usuario 001000

In [0]:
df_predictions

Unnamed: 0,artist,estimation
0,Crystal Castles,1.290774
1,Belle And Sebastian,1.191414
2,Cut Copy,1.119432
3,Bloc Party,1.053604
4,Of Montreal,0.965159
5,Massive Attack,0.935226
6,The Decemberists,0.927302
7,Ben Folds Five,0.916262
8,Morcheeba,0.86992
9,Final Fantasy,0.866152


In [0]:
accuracy.rmse( user_predictions, verbose = True )

RMSE: 0.4596


0.45959910359602174

In [0]:
algo.predict('user_001000','Kenny Burrell')

Prediction(uid='user_001000', iid='Kenny Burrell', r_ui=None, est=1.6393711339908208, details={'actual_k': 5, 'was_impossible': False})

In [0]:
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x102b81e20>

In [0]:
user_inner_id = algo.trainset.to_inner_uid('user_001000')

In [0]:
# Retrieve inner ids of the nearest neighbors of Item.
user_neighbors = algo.get_neighbors(user_inner_id, k=10)
user_neighbors

[106, 15, 16, 28, 57, 90, 97, 151, 172, 223]

In [0]:
neighbors = (algo.trainset.to_raw_uid(rid)
                       for rid in user_neighbors)
for neighbor in neighbors:
    print(neighbor)

user_000947
user_000859
user_000586
user_000153
user_000651
user_000097
user_000425
user_000379
user_000468
user_000718
