In [1]:
import pandas as pd
import numpy as np
# Render floats with three decimal places in pandas display output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
plt.style.use('fivethirtyeight')

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Play counts per user and artist. Read with header=None, so the column
# names are supplied here; the MusicBrainz id column is named only so that
# usecols can leave it out.
df_data = pd.read_table(
    'lastfm-dataset-360K//usersha1-artmbid-artname-plays.tsv',
    header=None,
    nrows=20000000,  # upper bound on rows read; an int, as pandas documents for nrows
    names=['users', 'musicbrainz-artist-id', 'artist-name', 'plays'],
    usecols=['users', 'artist-name', 'plays'],
)

# User profiles; only the user id and country are kept.
df_profile = pd.read_table(
    'lastfm-dataset-360K//usersha1-profile.tsv',
    header=None,
    names=['users', 'gender', 'age', 'country', 'signup'],
    usecols=['users', 'country'],
)


In [5]:
# (rows, columns) of the loaded play-count frame.
df_data.shape

(17535655, 3)

In [6]:
# (rows, columns) of the loaded profile frame.
df_profile.shape

(359347, 2)

In [9]:
# Discard play rows that contain any missing value.
df_data = df_data.dropna()

# Keep United States profiles only, then attach each user's play rows.
# NOTE(review): this is a left join, so US users without any row in df_data
# stay in us_data with NaN artist/plays; presumably they drop out again at
# the inner merge further down -- confirm.
us_profile = df_profile.loc[df_profile['country'].eq('United States')]
us_data = us_profile.merge(df_data, how='left', on='users')

# Total plays per artist over the US subset, then keep artists with more
# than 10000 plays.
artist_plays = (
    us_data
    .groupby('artist-name')['plays']
    .sum()
    .reset_index()
    .rename(columns={'plays': 'total_artist_plays'})
)
top_artist_plays = artist_plays.loc[artist_plays['total_artist_plays'].gt(10000)]

# Keep only play rows for those artists and attach each artist's total.
user_data_with_top_total_plays = us_data.merge(top_artist_plays, how='inner', on='artist-name')

In [10]:
# Summary statistics of total plays for the artists above the play threshold.
top_artist_plays.describe()

Unnamed: 0,total_artist_plays
count,7352.0
mean,104709.808
std,296115.449
min,10004.0
25%,15818.0
50%,29311.0
75%,75917.0
max,10705203.0


In [11]:
# Number of distinct artist names among the artists above the play threshold.
top_artist_plays['artist-name'].unique().size

7352

In [12]:
# Drop the references to the loaded input frames so their memory can be reclaimed.
del df_data, df_profile

In [13]:
# Collapse to one row per (user, artist): sum that user's plays for the
# artist and carry along the artist's overall total. The total was merged in
# per artist, so it is the same on every row of a group and 'first' simply
# picks it up.
# The string alias 'sum' is used instead of np.sum: recent pandas warns when
# NumPy reducers are passed to agg, and the alias gives the same result.
top_plays = (
    user_data_with_top_total_plays
    .groupby(['users', 'artist-name'])
    .agg({'plays': 'sum', 'total_artist_plays': 'first'})
    .reset_index()
)
top_plays.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00007a47085b9aab8af55f52ec8846ac479ac4fe,aesop rock,72.0,707850.0
1,00007a47085b9aab8af55f52ec8846ac479ac4fe,air,178.0,1196750.0
2,00007a47085b9aab8af55f52ec8846ac479ac4fe,amon tobin,106.0,357637.0
3,00007a47085b9aab8af55f52ec8846ac479ac4fe,animal collective,203.0,1808594.0
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,annie,75.0,68799.0


In [14]:
# Artist-by-user table of play counts; (artist, user) pairs that do not
# occur in top_plays are filled with 0.
wide_user_artists = (
    top_plays
    .pivot(index='artist-name', columns='users', values='plays')
    .fillna(0)
)

# Compressed-sparse-row copy of the same values, in the same row order.
wide_ua_sparse = csr_matrix(wide_user_artists.values)

wide_user_artists.head()

users,00007a47085b9aab8af55f52ec8846ac479ac4fe,0001a57568309b287363e72dc682e9a170ba6dc2,00024b5b85c40f990c28644d53257819980bf6bb,0002dd2154072434d26e5409faa591bfb260a01e,00032c7933e0eb05f2258f1147ef81a90f2d4d6c,00041cbfdd019b5431f926133266cc4ba38219bb,000429493d9716b66b02180d208d09b5b89fbe64,000701c3c006b091990162635b36b008c504c6a7,000752c87a61bc4247f5219b4769c347c0062c8a,0008538a0f505f72fdd66af3c4c71aef8d3bdea4,...,fff58a5c95280b7af63f9c552f9159b58ae5efa3,fff694cf332ed701dccbf17f1d9595ba8ad69f22,fff69e7cb53568c732909648527a778c31befec8,fff820efe22db6c868515436de82af39e013b910,fff89b6b5332f0f38996f11c88f908a3924926fe,fff9dc65e7f2763a7e8bce8d99cc1491c2ae4c6f,fffa9294e858a7c863b5ad363c748c2330d9bd45,fffa9d62caff0f038c7a35db70f109b1bba04a1d,fffaf6f9a1a3ad8bd0dff7b48b2eb9eef030fdee,fffe8c7f952d9b960a56ed4dcb40a415d924b224
artist-name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(hed) planet earth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*nsync,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+/-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+44,0.0,0.0,394.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Drop the references to the intermediate frames so their memory can be reclaimed.
del top_artist_plays, top_plays, us_data, us_profile, user_data_with_top_total_plays

In [16]:
# Fit a cosine-metric nearest-neighbour model on the sparse matrix, whose
# rows correspond to the artists in wide_user_artists.
nn = NearestNeighbors(metric = 'cosine')
nn.fit(wide_ua_sparse)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [70]:
#'predicting' recommendations

NEIGHBORS = 8
QUERY_ARTIST = 'the temptations'

# Select the query row by label using a list key, so the result stays a
# one-row frame. An artist that is not in the index raises KeyError here,
# rather than a boolean mask silently producing an empty query for kneighbors.
query_row = wide_user_artists.loc[[QUERY_ARTIST]]

# distance / indices each hold one row per query with NEIGHBORS entries.
distance, indices = nn.kneighbors(query_row, n_neighbors = NEIGHBORS)

In [71]:
#setting up reco dataframe

# Translate neighbour positions into artist names. The lookup goes through
# the index's underlying NumPy array because `indices` is two-dimensional
# (one row per query) and recent pandas no longer supports multi-dimensional
# indexing of an Index object.
neighbor_names = wide_user_artists.index.values[indices]

# The first entry of each row becomes the row label; the remaining
# NEIGHBORS-1 entries are the recommendations, ranked 1..NEIGHBORS-1.
# NOTE(review): this assumes the query artist comes back as its own nearest
# neighbour (position 0) -- confirm.
reco_index = [row[0] for row in neighbor_names]
reco_columns = list(range(1, NEIGHBORS))

reco_data = [row[1:] for row in neighbor_names]
# Matching distances, with the first (label) entry of each row dropped too.
reco_data2 = [row[1:] for row in distance]

pd.DataFrame(reco_data, index = reco_index, columns = reco_columns)

Unnamed: 0,1,2,3,4,5,6,7
the temptations,the four tops,ten years after,derek and the dominos,pete townshend,the who,the derek trucks band,the isley brothers


In [72]:
# Distances returned by kneighbors for the recommended artists, laid out with
# the same row label and rank columns as the name table.
pd.DataFrame(reco_data2, index = reco_index, columns = reco_columns)

Unnamed: 0,1,2,3,4,5,6,7
the temptations,0.533,0.588,0.654,0.682,0.688,0.695,0.706
