In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import NMF
from sklearn.utils.extmath import randomized_svd
from sklearn.model_selection import train_test_split

import faiss
import implicit
from scipy.sparse import csr_matrix

### Prepare data

In [2]:
df = pd.read_csv("../data/rec_test_assignment_playlist2track.csv")

In [3]:
df.head()

Unnamed: 0,playlist_id,track_id,track_uri
0,0,0,spotify:track:1r0faljjM2b876iNoaDUh5
1,1,1,spotify:track:1JO1xLtVc8mWhIoE3YaCL0
2,2,2,spotify:track:3SuzCeGoNOWmbWOoDMou0B
3,3,3,spotify:track:4GJAd1nBylAEbw1dZDVzEQ
4,4,4,spotify:track:5cbpoIu3YjoOwbBDGUEp3P


In [4]:
n_track = df['track_id'].unique().shape[0]
n_playlist = df['playlist_id'].unique().shape[0]
print('tracks: {}, playlists: {}'.format(n_track, n_playlist))

tracks: 169548, playlists: 714818


In [5]:
N_TOP = 5

df = df.merge(
        df.groupby('playlist_id')['track_id'].agg(neighbours=list),
        left_on='playlist_id',
        right_index=True
    )

df['n_neighbours'] = df['neighbours'].apply(len)
df = df[df['n_neighbours'] >= N_TOP].drop('n_neighbours', axis=1)

In [6]:
train_idx, test_idx = train_test_split(range(df['playlist_id'].unique().shape[0]), test_size=0.2)
train_idx = set(train_idx)
test_idx = set(test_idx)

df['train'] = df['playlist_id'].apply(lambda x: x in train_idx)
train_df = df[df['train'] == True].drop('train', axis=1)
test_df = df[df['train'] == False].drop('train', axis=1)


print(train_df.shape, test_df.shape)

(256064, 4) (560574, 4)


In [7]:
rows, cols, data = [], [], []
for i, r in train_df.iterrows():
    rows.append(r['playlist_id'])
    cols.append(r['track_id'])
    data.append(1)

In [8]:
interactions = csr_matrix((data, (rows, cols)))

In [9]:
interactions.shape

(125095, 169542)

### ALS

In [10]:
N_LATENT = 128

In [11]:
%%time

model = implicit.als.AlternatingLeastSquares(factors=N_LATENT)
model.fit(interactions)
als_vecs = model.item_factors

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

CPU times: user 3min 17s, sys: 2min 50s, total: 6min 8s
Wall time: 42.5 s


In [12]:
index_als = faiss.IndexFlat(N_LATENT)   
index_als.add(als_vecs)          
print(index_als.ntotal)

169542


In [13]:
np.save("../data/als_vecs.npy", als_vecs)

### SVD

In [14]:
%%time

u, s, vt = randomized_svd(interactions, n_components=N_LATENT)
svd_vecs = vt.T

CPU times: user 1min 6s, sys: 25.8 s, total: 1min 32s
Wall time: 10.4 s


In [15]:
index_svd = faiss.IndexFlat(N_LATENT)   
index_svd.add(svd_vecs)          
print(index_svd.ntotal)

169542


In [16]:
np.save("../data/svd_vecs.npy", svd_vecs)

### NMF Sklearn

In [17]:
%%time

nmf = NMF(n_components=N_LATENT, max_iter=10)
W = nmf.fit_transform(interactions)
H = nmf.components_
nmf_vecs = H.T



CPU times: user 2min 28s, sys: 44.2 s, total: 3min 12s
Wall time: 1min 9s


In [18]:
index_nmf = faiss.IndexFlat(N_LATENT)   
index_nmf.add(nmf_vecs)          
print(index_nmf.ntotal)

169542


In [19]:
np.save("../data/nmf_vecs.npy", nmf_vecs)

### Metrics

In [20]:
def search_top_by_vec(v, k, index):
    dist, idx = index.search(np.array([v]), k)
    return idx[0]


def search_top_by_id(id, k, index, vecs):
    return search_top_by_vec(vecs[id], k, index)

In [21]:
N_TEST_MAX = 1000

if N_TEST_MAX:
    test_df = test_df.iloc[:N_TEST_MAX]

In [22]:
test_df['neighbours_pred_als'] = test_df['track_id'].apply(lambda x: search_top_by_id(x, N_TOP, 
                                                                                      index_als, als_vecs))
test_df['neighbours_pred_svd'] = test_df['track_id'].apply(lambda x: search_top_by_id(x, N_TOP, 
                                                                                      index_svd, svd_vecs))
test_df['neighbours_pred_nmf'] = test_df['track_id'].apply(lambda x: search_top_by_id(x, N_TOP, 
                                                                                      index_nmf, nmf_vecs))

In [23]:
def precision_at_k(true, pred):
    return len(set(pred) & set(true)) / len(pred)


def recall_at_k(true, pred):
    return len(set(pred) & set(true)) / len(true)


def total_precision(df, model):
    assert(model == 'als' or model == 'svd' or model == 'nmf')
    pred_col = 'neighbours_pred_{}'.format(model)
    return df.apply(lambda r: precision_at_k(r['neighbours'], r[pred_col]), axis=1).mean()


def total_recall(df, model):
    assert(model == 'als' or model == 'svd' or model == 'nmf')
    pred_col = 'neighbours_pred_{}'.format(model)
    return df.apply(lambda r: recall_at_k(r['neighbours'], r[pred_col]), axis=1).mean()


als_precision = total_precision(test_df, 'als')
svd_precision = total_precision(test_df, 'svd')
nmf_precision = total_precision(test_df, 'nmf')

als_recall = total_recall(test_df, 'als')
svd_recall = total_recall(test_df, 'svd')
nmf_recall = total_recall(test_df, 'nmf')

print("ALS precision: {:.4f}, ALS recall: {:.4f}".format(als_precision, als_recall))
print("SVD precision: {:.4f}, SVD recall: {:.4f}".format(svd_precision, svd_recall))
print("NMF precision: {:.4f}, NMF recall: {:.4f}".format(nmf_precision, nmf_recall))

ALS precision: 0.1732, ALS recall: 0.1222
SVD precision: 0.1726, SVD recall: 0.1217
NMF precision: 0.1718, NMF recall: 0.1212
