In [1]:
import numpy as np
import pandas as pd

from gensim.models import Word2Vec



In [2]:
# Turn off pandas' chained-assignment check for the whole notebook.
pd.set_option("mode.chained_assignment", None)

### Prepare data

In [3]:
# Load the playlist/track table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/spotify_playlists.csv")

In [4]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,playlist_id,track_id,track_uri,playlist_position,artist_name,track_name
0,9615,146723,spotify:track:6w2VaBGWcColW05nZROWvS,0,Tory Lanez,To D.R.E.A.M.
1,9615,158343,spotify:track:7hMqz88kcK2Tx6wBka9INF,1,Tory Lanez,All The Girls
2,9615,123985,spotify:track:5rqfg8Hz5D1aCcn2XUxzNA,2,Tory Lanez,Traphouse
3,9615,117440,spotify:track:5Z2S8VemQCfM02SLq8o5cR,3,Tory Lanez,One Day
4,9615,66611,spotify:track:39Y7nKwnEoMSfzbSVcAL0J,4,Tory Lanez,B.L.O.W.


In [5]:
# Dataset size: number of distinct track ids and distinct playlist ids.
# dropna=False counts a missing id as one value of its own.
n_track = df['track_id'].nunique(dropna=False)
n_playlist = df['playlist_id'].nunique(dropna=False)
print('tracks: {}, playlists: {}'.format(n_track, n_playlist))

tracks: 164576, playlists: 14750


In [6]:
# Minimum number of tracks a playlist must contain to be kept.
N_TOP = 10

# One row per playlist_id with its track ids collected into a list (column
# 'neighbours'); playlists holding fewer than N_TOP tracks are filtered out.
playlists_df = (
    df.groupby('playlist_id')['track_id']
      .agg(neighbours=list)
      .loc[lambda grouped: grouped['neighbours'].map(len) >= N_TOP]
)

# Plain list of track-id lists, one per remaining playlist.
playlists = playlists_df['neighbours'].tolist()

In [7]:
# Number of playlists that passed the N_TOP length filter.
len(playlists)

14750

In [8]:
%%time

# Attach each playlist's full track list to every one of its rows (inner join,
# so rows of playlists missing from playlists_df are dropped).
# A leftover 'neighbours' column is dropped first so the cell can be re-run:
# otherwise a second run would merge into 'neighbours_x' / 'neighbours_y'
# and the filtering step below would fail with a KeyError.
df = df.drop(columns='neighbours', errors='ignore').merge(
        playlists_df,
        left_on='playlist_id',
        right_index=True
    )

# For every row keep only the playlist's *other* tracks: entries equal to the
# row's own track_id are removed. Zipping the two columns yields the same lists
# as DataFrame.apply(axis=1) without building a Series object per row.
df['neighbours'] = [
    [n for n in playlist_tracks if n != own_id]
    for own_id, playlist_tracks in zip(df['track_id'], df['neighbours'])
]

CPU times: user 48.6 s, sys: 195 ms, total: 48.8 s
Wall time: 49.1 s


### Train and metrics

In [9]:
# Run configuration.
DIM = 600       # embedding size, passed to Word2Vec as vector_size
EPOCHS = 20     # number of training epochs
N_TEST = 1000   # number of rows sampled for the evaluation further down

# Each playlist is treated as one "sentence" whose tokens are track ids.
# min_count=1 keeps every track in the vocabulary, including ones seen once.
model = Word2Vec(
    sentences=playlists,
    vector_size=DIM,
    epochs=EPOCHS,
    min_count=1,
)

In [10]:
%%time

def search_top(id, topn=N_TOP):
    """Return the keys of the `topn` entries most similar to `id`.

    Queries ``model.wv.most_similar`` and keeps only the first element of
    every returned item, in the order the model returned them.
    """
    nearest = model.wv.most_similar(id, topn=topn)
    keys = []
    for hit in nearest:
        keys.append(hit[0])
    return keys


# Fixed seed for the evaluation sample: without it every run scores a
# different random subset of rows and the reported metrics cannot be compared.
TEST_SEED = 42

# Draw N_TEST rows and store the model's N_TOP nearest tracks for each of them.
test_df = df.sample(N_TEST, random_state=TEST_SEED)
test_df['neighbours_pred'] = test_df['track_id'].apply(lambda x: search_top(x, N_TOP))

CPU times: user 1min 29s, sys: 3.42 s, total: 1min 33s
Wall time: 11.7 s


In [11]:
def precision_at_k(true, pred):
    """Fraction of the predicted items that are relevant.

    Parameters
    ----------
    true : iterable of hashable
        Relevant items.
    pred : sized iterable of hashable
        Predicted items.

    Returns
    -------
    float
        Number of distinct items present in both `pred` and `true`, divided
        by ``len(pred)``. Returns 0.0 when `pred` is empty instead of raising
        ZeroDivisionError.
    """
    if len(pred) == 0:
        return 0.0
    return len(set(pred) & set(true)) / len(pred)


def recall_at_k(true, pred):
    """Fraction of the relevant items that were predicted.

    Parameters
    ----------
    true : iterable of hashable
        Relevant items; repeated entries are counted once.
    pred : iterable of hashable
        Predicted items.

    Returns
    -------
    float
        Number of distinct items present in both `pred` and `true`, divided
        by the number of distinct items in `true`. Returns 0.0 when `true` is
        empty instead of raising ZeroDivisionError.
    """
    # The hits are counted on sets, so the denominator must be the set size as
    # well; dividing by len(true) would under-report recall whenever `true`
    # contains the same item more than once.
    relevant = set(true)
    if not relevant:
        return 0.0
    return len(relevant & set(pred)) / len(relevant)


def _mean_over_test_rows(metric):
    """Apply `metric(true, pred)` to every row of test_df and average the results."""
    per_row = test_df.apply(
        lambda row: metric(row['neighbours'], row['neighbours_pred']),
        axis=1,
    )
    return per_row.mean()


# Mean precision and recall of the predicted neighbours over the sampled rows.
precision = _mean_over_test_rows(precision_at_k)
recall = _mean_over_test_rows(recall_at_k)

In [12]:
# Report both averaged metrics with six decimal places, one per line.
precision_line = "Word2Vec precision: {:.6f}".format(precision)
recall_line = "Word2Vec recall: {:.6f}".format(recall)
print(precision_line, recall_line, sep="\n")

Word2Vec precision: 0.070800
Word2Vec recall: 0.013337


### Validation

In [13]:
# Lookup table track_id -> [artist_name, track_name].
# Built from the distinct (track_id, artist_name, track_name) triples; when a
# track_id occurs with several name variants the last one encountered wins.
id2name = {
    track_id: [artist_name, track_name]
    for track_id, artist_name, track_name in (
        df[['track_id', 'artist_name', 'track_name']]
        .drop_duplicates()
        .itertuples(index=False, name=None)
    )
}

In [14]:
def search_track_id(track, artist):
    """Return the track_id of the first row of `df` matching both substrings.

    NOTE: the parameter names are historically swapped relative to the columns
    they are matched against. They are kept as-is so existing calls continue
    to work:

    Parameters
    ----------
    track : str
        Substring looked for in the ``artist_name`` column.
    artist : str
        Substring looked for in the ``track_name`` column.

    Returns
    -------
    The ``track_id`` value of the first matching row.

    Raises
    ------
    IndexError
        If no row matches both substrings.
    """
    # regex=False: names are matched literally, so titles containing regex
    # metacharacters (e.g. "Blue [Da Ba Dee]", "B.L.O.W.") behave as expected.
    # na=False: rows with a missing name simply do not match.
    artist_hit = df['artist_name'].str.contains(track, regex=False, na=False)
    title_hit = df['track_name'].str.contains(artist, regex=False, na=False)

    matches = df[artist_hit & title_hit]
    if matches.empty:
        raise IndexError(
            "no track found with artist_name containing {!r} and "
            "track_name containing {!r}".format(track, artist)
        )
    return matches.iloc[0]['track_id']


def similar_tracks(track, artist, topn=10):
    """List the [artist_name, track_name] pairs of the tracks closest to a query track.

    `track` and `artist` are forwarded unchanged, in that order, to
    ``search_track_id``; the id it returns is passed to
    ``model.wv.most_similar`` and each returned key is translated through
    ``id2name``. The result keeps the order returned by the model and has at
    most `topn` entries.
    """
    query_id = search_track_id(track, artist)
    nearest = model.wv.most_similar(query_id, topn=topn)

    names = []
    for hit in nearest:
        neighbour_id = hit[0]
        names.append(id2name[neighbour_id])
    return names

In [15]:
# Spot check: "The xx" is matched against artist_name, "Intro" against track_name.
similar_tracks("The xx", "Intro")

[['The Naked And Famous', 'Young Blood'],
 ['MisterWives', 'Our Own House'],
 ['Foster The People', 'Call It What You Want'],
 ['Bastille', 'Oblivion'],
 ['The Neighbourhood', 'Sweater Weather'],
 ['Imagine Dragons', 'I Bet My Life'],
 ['Bastille', 'Flaws'],
 ['George Ezra', 'Barcelona'],
 ['Two Door Cinema Club', 'Changing Of The Seasons'],
 ['Lorde', 'Team']]

In [16]:
# Spot check: "Daft Punk" is matched against artist_name, "One More Time" against track_name.
similar_tracks("Daft Punk", "One More Time")

[['Skee-Lo', 'I Wish'],
 ['Brian McKnight', 'Back At One'],
 ['C & C Music Factory', 'Gonna Make You Sweat (Everybody Dance Now)'],
 ['Aqua', 'Barbie Girl'],
 ['Eiffel 65', 'Blue [Da Ba Dee]'],
 ['La Roux', 'Bulletproof'],
 ['Karmin', 'Brokenhearted'],
 ['Technotronic', 'Pump Up The Jam - Edit'],
 ['Jennifer Lopez', 'Waiting for Tonight'],
 ['Aloe Blacc', 'I Need a Dollar']]

In [17]:
# Spot check: "Nirvana" is matched against artist_name, "Come As You Are" against track_name.
similar_tracks("Nirvana", "Come As You Are")

[['Beastie Boys', 'Fight For Your Right'],
 ['Santana', 'Smooth'],
 ['Led Zeppelin', 'Whole Lotta Love'],
 ['Jet', 'Are You Gonna Be My Girl'],
 ['Depeche Mode', 'Enjoy The Silence - Single Mix'],
 ['Mr. Big', 'To Be With You - 2010 Remastered Version'],
 ['Metallica', 'Master Of Puppets'],
 ['Pink Floyd', 'Time'],
 ['America', 'A Horse with No Name'],
 ['Eric Clapton', 'Wonderful Tonight']]

In [18]:
# Spot check: "Beatles" is a substring match on artist_name, "Yesterday" on track_name.
similar_tracks("Beatles", "Yesterday")

[['The Beatles', 'Eleanor Rigby - Remastered'],
 ['The Beatles', 'Blackbird - Remastered'],
 ['The Beatles', 'Something - Remastered'],
 ['George Harrison', 'My Sweet Lord'],
 ['The Beatles', 'Help! - Remastered'],
 ['Buddy Holly', 'Everyday - Single Version'],
 ['The Smiths', 'This Charming Man - 2011 Remastered Version'],
 ['Neil Young', 'Heart of Gold'],
 ['Kool & The Gang', 'Jungle Boogie'],
 ['Dion', 'The Wanderer']]

In [19]:
# Spot check: "Olive" is a substring match on artist_name, "You're Not Alone" on track_name.
similar_tracks("Olive", "You're Not Alone")

[['Cœur De Pirate', 'Comme des enfants'],
 ['Kavinsky', 'Nightcall'],
 ['Sufjan Stevens', 'No Shade in the Shadow of The Cross'],
 ['Télépopmusik', 'Breathe'],
 ['Janelle Monáe', 'Tightrope - feat. Big Boi'],
 ['Johann Sebastian Bach',
  'Unaccompanied Cello Suite No. 1 in G Major, BWV 1007: Prélude'],
 ['Avicii', 'Levels - Skrillex Remix'],
 ['Massive Attack', 'Teardrop'],
 ['Gym Class Heroes', 'The Fighter - feat. Ryan Tedder'],
 ['Father John Misty', 'Chateau Lobby #4 (in C for Two Virgins)']]