In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import NMF
from sklearn.utils.extmath import randomized_svd
from sklearn.model_selection import train_test_split

import faiss
import implicit
import threadpoolctl
from scipy.sparse import csr_matrix

In [2]:
threadpoolctl.threadpool_limits(1, "blas");

### Prepare data

In [3]:
df = pd.read_csv("../data/spotify_playlists.csv")

In [4]:
df.head()

Unnamed: 0,playlist_id,track_id,track_uri,playlist_position,artist_name,track_name
0,9615,146723,spotify:track:6w2VaBGWcColW05nZROWvS,0,Tory Lanez,To D.R.E.A.M.
1,9615,158343,spotify:track:7hMqz88kcK2Tx6wBka9INF,1,Tory Lanez,All The Girls
2,9615,123985,spotify:track:5rqfg8Hz5D1aCcn2XUxzNA,2,Tory Lanez,Traphouse
3,9615,117440,spotify:track:5Z2S8VemQCfM02SLq8o5cR,3,Tory Lanez,One Day
4,9615,66611,spotify:track:39Y7nKwnEoMSfzbSVcAL0J,4,Tory Lanez,B.L.O.W.


In [5]:
n_track = df['track_id'].unique().shape[0]
n_playlist = df['playlist_id'].unique().shape[0]
print('tracks: {}, playlists: {}'.format(n_track, n_playlist))

tracks: 164576, playlists: 14750


In [6]:
N_TOP = 10

df = df.merge(
        df.groupby('playlist_id')['track_id'].agg(neighbours=list),
        left_on='playlist_id',
        right_index=True
    )

df['neighbours'] = df.apply(lambda r: [n for n in r['neighbours'] if n != r['track_id']], axis=1)
df['n_neighbours'] = df['neighbours'].apply(len)
df = df[df['n_neighbours'] >= N_TOP].drop('n_neighbours', axis=1)

In [7]:
train_idx, test_idx = train_test_split(range(df['playlist_id'].unique().shape[0]), test_size=0.2)
train_idx = set(train_idx)
test_idx = set(test_idx)

df['train'] = df['playlist_id'].apply(lambda x: x in train_idx)
train_df = df[df['train'] == True].drop('train', axis=1)
test_df = df[df['train'] == False].drop('train', axis=1)


print(train_df.shape, test_df.shape)

(525712, 7) (130308, 7)


In [8]:
rows, cols, data = [], [], []
for i, r in train_df.iterrows():
    rows.append(r['playlist_id'])
    cols.append(r['track_id'])
    data.append(1)

In [9]:
interactions = csr_matrix((data, (rows, cols)))

In [10]:
interactions.shape

(14749, 164575)

### ALS

In [11]:
N_LATENT = 300

In [12]:
%%time

model = implicit.als.AlternatingLeastSquares(factors=N_LATENT)
model.fit(interactions)
als_vecs = model.item_factors

  0%|          | 0/15 [00:00<?, ?it/s]

CPU times: user 1min 48s, sys: 103 ms, total: 1min 48s
Wall time: 1min 48s


In [13]:
index_als = faiss.IndexFlat(N_LATENT)   
index_als.add(als_vecs)          
print(index_als.ntotal)

164575


In [14]:
np.save("../data/als_vecs.npy", als_vecs)

### SVD

In [15]:
%%time

u, s, vt = randomized_svd(interactions, n_components=N_LATENT)
svd_vecs = vt.T

CPU times: user 12.3 s, sys: 1.46 s, total: 13.7 s
Wall time: 13.9 s


In [16]:
index_svd = faiss.IndexFlat(N_LATENT)   
index_svd.add(svd_vecs)          
print(index_svd.ntotal)

164575


In [17]:
np.save("../data/svd_vecs.npy", svd_vecs)

### NMF Sklearn

In [18]:
%%time

nmf = NMF(n_components=N_LATENT, max_iter=10)
W = nmf.fit_transform(interactions)
H = nmf.components_
nmf_vecs = H.T



CPU times: user 3min 25s, sys: 2.16 s, total: 3min 27s
Wall time: 3min 28s


In [19]:
index_nmf = faiss.IndexFlat(N_LATENT)   
index_nmf.add(nmf_vecs)          
print(index_nmf.ntotal)

164575


In [20]:
np.save("../data/nmf_vecs.npy", nmf_vecs)

### Metrics

In [21]:
def search_top_by_vec(v, k, index):
    dist, idx = index.search(np.array([v]), k)
    return idx[0]


def search_top_by_id(id, k, index, vecs):
    res = search_top_by_vec(vecs[id], k, index)
    return [i for i in res if i != id]

In [22]:
N_TEST_MAX = 1000

test_df = test_df.sample(N_TEST_MAX)

In [23]:
test_df['neighbours_pred_als'] = test_df['track_id'].apply(lambda x: search_top_by_id(x, N_TOP, 
                                                                                      index_als, als_vecs))
test_df['neighbours_pred_svd'] = test_df['track_id'].apply(lambda x: search_top_by_id(x, N_TOP, 
                                                                                      index_svd, svd_vecs))
test_df['neighbours_pred_nmf'] = test_df['track_id'].apply(lambda x: search_top_by_id(x, N_TOP, 
                                                                                      index_nmf, nmf_vecs))

In [24]:
def precision_at_k(true, pred):
    return len(set(pred) & set(true)) / len(pred)


def recall_at_k(true, pred):
    return len(set(pred) & set(true)) / len(true)


def total_precision(df, model):
    assert(model == 'als' or model == 'svd' or model == 'nmf')
    pred_col = 'neighbours_pred_{}'.format(model)
    return df.apply(lambda r: precision_at_k(r['neighbours'], r[pred_col]), axis=1).mean()


def total_recall(df, model):
    assert(model == 'als' or model == 'svd' or model == 'nmf')
    pred_col = 'neighbours_pred_{}'.format(model)
    return df.apply(lambda r: recall_at_k(r['neighbours'], r[pred_col]), axis=1).mean()


als_precision = total_precision(test_df, 'als')
svd_precision = total_precision(test_df, 'svd')
nmf_precision = total_precision(test_df, 'nmf')

als_recall = total_recall(test_df, 'als')
svd_recall = total_recall(test_df, 'svd')
nmf_recall = total_recall(test_df, 'nmf')

print("ALS precision: {:.6f}, ALS recall: {:.6f}".format(als_precision, als_recall))
print("SVD precision: {:.6f}, SVD recall: {:.6f}".format(svd_precision, svd_recall))
print("NMF precision: {:.6f}, NMF recall: {:.6f}".format(nmf_precision, nmf_recall))

ALS precision: 0.054111, ALS recall: 0.008353
SVD precision: 0.048444, SVD recall: 0.007171
NMF precision: 0.030222, NMF recall: 0.004595


### Validation

In [25]:
id2name = {}
for i, t in df[['track_id', 'artist_name', 'track_name']].drop_duplicates().iterrows():
    id2name[t['track_id']] = [t['artist_name'], t['track_name']]

In [26]:
def search_track_id(track, artist):
    return df[df['artist_name'].str.contains(track) & 
              df['track_name'].str.contains(artist)].iloc[0]['track_id']


def similar_tracks(track, artist, topn=10):
    target_id = search_track_id(track, artist)
    return [id2name[id] for id in 
            search_top_by_id(target_id, topn, index_als, als_vecs)]

In [27]:
similar_tracks("The xx", "Intro")

[['Tycho', 'Awake'],
 ['The xx', 'Crystalised'],
 ['Sylvan Esso', 'Coffee'],
 ['Tycho', 'A Walk'],
 ['Pretty Lights', 'Finally Moving'],
 ['Tycho', 'Coastal Brake'],
 ['Phantogram', "Don't Move"],
 ['The xx', 'Angels'],
 ['RJD2', 'Ghostwriter']]

In [28]:
similar_tracks("Daft Punk", "One More Time")

[['Daft Punk', 'Harder Better Faster Stronger'],
 ['Daft Punk', 'Digital Love'],
 ['Daft Punk', 'Robot Rock'],
 ['Daft Punk', 'Around The World'],
 ['Daft Punk', 'Something About Us'],
 ['Daft Punk', 'Instant Crush'],
 ['Daft Punk', 'Human After All'],
 ['Daft Punk', 'Alive'],
 ['Daft Punk', "Rollin' & Scratchin'"]]

In [29]:
similar_tracks("Nirvana", "Come As You Are")

[['Nirvana', 'Lithium'],
 ['Nirvana', 'Heart-Shaped Box'],
 ['Nirvana', 'About A Girl'],
 ['Nirvana', 'In Bloom - Nevermind Version'],
 ['Nirvana', 'Rape Me'],
 ['The Smashing Pumpkins', 'Bullet With Butterfly Wings'],
 ['Red Hot Chili Peppers', 'Give It Away'],
 ['Pearl Jam', 'Alive'],
 ['Pearl Jam', 'Even Flow']]

In [30]:
similar_tracks("Beatles", "Yesterday")

[['The Beatles', "Can't Buy Me Love - Remastered"],
 ['The Beatles', 'Eleanor Rigby - Remastered'],
 ['The Beatles', 'Something - Remastered'],
 ['The Beatles', 'Yellow Submarine - Remastered'],
 ['The Beatles', 'All You Need Is Love - Remastered 2009'],
 ['The Beatles', 'Help! - Remastered'],
 ['The Beatles', 'While My Guitar Gently Weeps - Remastered'],
 ['The Beatles', 'Lucy In The Sky With Diamonds - Remastered'],
 ['The Beatles', 'Hello, Goodbye - Remastered']]

In [31]:
similar_tracks("Olive", "You're Not Alone")

[['Daft Punk', 'One More Time'],
 ['Scooter', 'Maria (I Like It Loud)'],
 ['KLM Music', 'I Need A Miracle - Original Extended Mix'],
 ['Scooter', '4 AM - Remastered'],
 ['Scooter', 'The Logical Song'],
 ['DJ Jean', 'The Launch - Radio Edit'],
 ['Bodyrox', 'Yeah Yeah - D Ramirez Radio Edit'],
 ['Scooter', 'The Night'],
 ['Faithless', 'Insomnia - Monster Mix']]