In [1]:
import warnings
warnings.filterwarnings("ignore")

In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from knn_recommender import Recommender

In [141]:
song_info = pd.read_csv('dataset/triplets.txt',sep='\t',header=None)
song_info.columns = ['user_id', 'song_id', 'listen_count']

In [142]:
song_info.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1.0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2.0
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1.0
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1.0


In [143]:
song_actual =  pd.read_csv('dataset/song_data.csv')
song_actual.drop_duplicates(['song_id'], inplace=True)

In [144]:
song_actual.head()

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [145]:
df_songs = pd.merge(song_info, song_actual, on="song_id", how="left")

In [146]:
df_songs.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1.0,The Cove,Thicker Than Water,Jack Johnson,0.0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976.0
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007.0
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1.0,Constellations,In Between Dreams,Jack Johnson,2005.0
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1.0,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999.0


In [147]:
print(f"There are {df_songs.shape[0]} observations in the dataset")

There are 20797 observations in the dataset


In [148]:
df_songs.isnull().sum()

user_id         0
song_id         1
listen_count    1
title           1
release         1
artist_name     1
year            1
dtype: int64

In [149]:
df_songs = df_songs.dropna(subset=['title', 'release'])

In [150]:
df_songs.isnull().sum()

user_id         0
song_id         0
listen_count    0
title           0
release         0
artist_name     0
year            0
dtype: int64

In [151]:
df_songs.dtypes

user_id          object
song_id          object
listen_count    float64
title            object
release          object
artist_name      object
year            float64
dtype: object

In [152]:
unique_songs = df_songs['title'].unique().shape[0]
print(f"There are {unique_songs} unique songs in the dataset")

There are 7117 unique songs in the dataset


In [153]:
unique_artists = df_songs['artist_name'].unique().shape[0]
print(f"There are {unique_artists} unique artists in the dataset")

There are 2676 unique artists in the dataset


In [154]:
unique_users = df_songs['user_id'].unique().shape[0]
print(f"There are {unique_users} unique users in the dataset")

There are 794 unique users in the dataset


In [155]:
ten_top_songs = df_songs.groupby('title')['listen_count'].count().reset_index().sort_values(['listen_count', 'title'], ascending = [0,1])
ten_top_songs['percentage']  = round(ten_top_songs['listen_count'].div(ten_top_songs['listen_count'].sum())*100, 2)

In [156]:
ten_top_songs = ten_top_songs[:10]
ten_top_songs

Unnamed: 0,title,listen_count,percentage
5093,Sehr kosmisch,83,0.4
1474,Dog Days Are Over (Radio Edit),72,0.35
5085,Secrets,67,0.32
6488,Undo,67,0.32
7062,You're The One,67,0.32
4836,Revelry,66,0.32
2567,Horn Concerto No. 4 in E flat K495: II. Romanc...,54,0.26
1931,Fireflies,52,0.25
6088,The Scientist,51,0.25
2489,Hey_ Soul Sister,48,0.23


In [157]:
ten_top_artists  = df_songs.groupby(['artist_name'])['listen_count'].count().reset_index().sort_values(['listen_count','artist_name'], ascending = [0,1])

In [158]:
ten_top_artists = ten_top_artists[:10]
ten_top_artists

Unnamed: 0,artist_name,listen_count
503,Coldplay,329
2265,The Black Keys,226
879,Florence + The Machine,205
1319,Kings Of Leon,195
1894,Radiohead,187
2338,The Killers,181
575,Daft Punk,172
767,Eminem,171
1236,Justin Bieber,155
1089,Jack Johnson,137


In [159]:
listen_counts = pd.DataFrame(df_songs.groupby('listen_count').size(), columns=['count'])
print(f"The maximum time the same user listened to the same songs was: {listen_counts.reset_index(drop=False)['listen_count'].iloc[-1]}")

The maximum time the same user listened to the same songs was: 247.0


In [160]:
print(f"On average, a user listen to the same song {df_songs['listen_count'].mean()} times")

On average, a user listen to the same song 2.9222927486055013 times


In [161]:
song_user = df_songs.groupby('user_id')['song_id'].count()

In [162]:
print(f"A user listens to an average of {np.mean(song_user)} songs, with minimum {np.min(song_user)} and maximum {np.max(song_user)} songs")

A user listens to an average of 26.191435768261965 songs, with minimum 1 and maximum 401 songs


In [163]:
values_matrix = unique_users * unique_songs
zero_values_matrix = values_matrix - df_songs.shape[0]
print(f"The matrix of users x songs has {zero_values_matrix} values that are zero")

The matrix of users x songs has 5630102 values that are zero


In [164]:
song_ten_id = song_user[song_user > 26].index.to_list()
df_song_id_more_ten = df_songs[df_songs['user_id'].isin(song_ten_id)].reset_index(drop=True)

In [165]:
df_songs_features = df_song_id_more_ten.pivot(index='song_id', columns='user_id', values='listen_count').fillna(0)
mat_songs_features = csr_matrix(df_songs_features.values)

In [166]:
df_songs_features.head()

user_id,0039bd8483d578997718cdc0bf6c7c88b679f488,01655ae6bc52e29c9cd100a7dde4e9eeae5e4031,019d0d1c7a01f8736ba59a124160e5fc70666db7,063ef0cd02c950d0fe5799257907d85c0d341470,07caa920795cd4f20bfeeb0e192a5ddd9566ecdd,08d31ac4452516e702815fef13b2059aa8210034,0a00498b9d607844a8826184ae7278097d1c008a,0b03286244d1cb0662fefddade241f56a1bae573,0b9a6a7975b818259222ae7434a1095834d25b92,0dd93f61fe69f292ac336715ef607214efb3dbaa,...,f608c215606e6421a429ea28ad08243241d5347d,f694565a4451ed33e6741ede8cd3a7eaad66bb84,f8ae59c070d0b3fadc84c4d9f18475a61d7d37eb,f904f6ed9ac7c35f4771c69a9ef38e797e4e4baf,f927dddfe26b0c74ada3eaadca41bfb462c37ee4,fc604c906ec7bb98c494ef7659b0d3dc75503812,fd13b9d49c54e00ff413fe3c095ba581c7fc611e,fe76c9d535c5834e4a9b91c13e29be6460cb79c4,fed37c4c49c9f217b3371c2f2c0e7541656e55cf,ff18ea9a13583f7f7aaa83719e0b22ce5618e9cf
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SOAAAGQ12A8C1420C8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAAEJI12AB0188AB5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAAFAC12A67ADF7EB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAAFYH12A8C13717A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAAKPM12A58A77210,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
df_unique_songs = df_songs.drop_duplicates(subset=['song_id']).reset_index(drop=True)[['song_id', 'title']]
decode_id_song = {
    song: i for i, song in 
    enumerate(list(df_unique_songs.set_index('song_id').loc[df_songs_features.index].title))
}

In [168]:
model = Recommender(metric='cosine', algorithm='brute', k=20, data=mat_songs_features, decode_id_song=decode_id_song)

In [169]:
df_songs[df_songs['artist_name'] == 'Eminem']

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
80,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SOKOXWU12AF72AD1BC,4.0,The Real Slim Shady,Curtain Call,Eminem,2000.0
83,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SOVEUVC12A6310EAF1,5.0,Just Lose It,Curtain Call,Eminem,2004.0
85,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SOWGXOP12A6701E93A,8.0,Without Me,Without Me,Eminem,2002.0
153,b64cdd1a0bd907e5e00b39e345194768e330d652,SOKOXWU12AF72AD1BC,1.0,The Real Slim Shady,Curtain Call,Eminem,2000.0
157,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLXDDC12A6701FBFD,1.0,I'm Back,The Marshall Mathers LP,Eminem,2000.0
...,...,...,...,...,...,...,...
20742,dac821fa75a40a9c1dd845864b120cda076f3813,SOTULEI12A58A7CB72,1.0,Buffalo Bill,Relapse: Refill,Eminem,2009.0
20745,dac821fa75a40a9c1dd845864b120cda076f3813,SOVIMJC12A6D4F92EC,15.0,No Apologies,Eminem Presents The Re-Up,Eminem,2006.0
20747,dac821fa75a40a9c1dd845864b120cda076f3813,SOVXUCJ12A6701FBC2,4.0,Still Don't Give A Fuck,The Slim Shady LP,Eminem,1999.0
20748,dac821fa75a40a9c1dd845864b120cda076f3813,SOWBFFR12AF72A2AA1,1.0,Rock Bottom,The Slim Shady LP,Eminem,1999.0


In [170]:
song = 'The Real Slim Shady'

In [171]:
new_recommendations = model.make_recommendation(new_song=song, n_recommendations=10)
print(f"The recommendations for {song} are:")
print(f"{new_recommendations}")

Starting the recommendation process for The Real Slim Shady ...
... Done
The recommendations for The Real Slim Shady are:
['Not Big', 'Rap Game', 'The Islander', 'R.A.K.I.M.', 'Universe & U', "Ain't No Sunshine", 'Pass Out (Instrumental)', 'Hey Baby', 'Black Horse And The Cherry Tree (Radio Version)', 'Te Amo']
