In [1]:
import copy
import csv
import json
import os
import pickle
from collections import defaultdict
from datetime import datetime

import msgpack
import numpy as np
from gensim.models import Word2Vec
from gensim.similarities.index import AnnoyIndexer
from tqdm import tqdm_notebook

import empty_playlist

In [16]:
def getWord2Vec(data,embedding_dim,min_count=5,path='',num_trees=100,precomputed=False):
    """Train (or load) a Word2Vec model together with its Annoy index.

    Parameters
    ----------
    data : corpus handed to ``Word2Vec`` unchanged; ignored when
        ``precomputed`` is True.
    embedding_dim : int
        Vector size; also embedded in the model file name.
    min_count : int
        Forwarded to ``Word2Vec``.
    path : str
        Directory holding ``annoy`` and ``word2vec-<dim>.txt``. An empty
        string means the current working directory.
    num_trees : int
        Number of trees used when the Annoy index is built.
    precomputed : bool
        When True both artefacts are read from ``path`` instead of rebuilt.

    Returns
    -------
    tuple
        ``(model, annoy_index)``.
    """
    directory = path
    # os.path.join keeps the files relative when `directory` is '' (the
    # default); plain concatenation produced '/annoy' at the filesystem root.
    annoy_fname = os.path.join(directory, 'annoy')
    # NOTE(review): the '.txt' suffix is kept so existing caches still load.
    w2v_fname = os.path.join(directory, 'word2vec-' + str(embedding_dim) + '.txt')
    if(not precomputed):
        # Only create the directory when something is written into it, and
        # never call makedirs('') which raises FileNotFoundError.
        if directory:
            os.makedirs(directory, exist_ok=True)
        print('Creating Word2Vec Model...')
        model = Word2Vec(data, size=embedding_dim, window=5, min_count=min_count, workers=6, iter=10,negative=10)
        model.init_sims()
        print('Initialized Sims')
        model.save(w2v_fname)
        print('Creating Annoy Indexes...')
        annoy_index = AnnoyIndexer(model, num_trees)
        # Persist index to disk
        annoy_index.save(annoy_fname)
    else:
        print('Loading Word2Vec Model...')
        model = Word2Vec.load(w2v_fname)
        print('Loading Annoy Indexing...')
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_fname)
        # The loaded index has to be re-attached to its model by hand.
        annoy_index.model = model

    return model,annoy_index

def averagePlaylistVec(data):
    """Average the Word2Vec vectors of every playlist's songs.

    Reads the notebook-level ``w2v_model``.

    Parameters
    ----------
    data : sequence of playlists, each an iterable of song ids. Ids that are
        missing from ``w2v_model.wv.vocab`` are ignored.

    Returns
    -------
    tuple
        ``(playlist_vectors, pid_order, empty_playlists)`` where
        ``playlist_vectors[i]`` is the mean vector of the playlist at
        position ``pid_order[i]`` of ``data``, and ``empty_playlists`` lists
        the positions that had no in-vocabulary song at all.
    """
    wv = w2v_model.wv
    playlist_vectors = []
    empty_playlists = []
    pid_order = []
    for index,playlist in enumerate(tqdm_notebook(data)):
        w2vecs = [wv[songid] for songid in playlist if songid in wv.vocab]
        if not w2vecs:
            # Nothing to average; remember the position and move on.
            empty_playlists.append(index)
            continue
        pid_order.append(index)
        playlist_vectors.append(np.mean(np.asarray(w2vecs),axis=0))
    return playlist_vectors,pid_order,empty_playlists

def averageNewPlaylistVec(playlist):
    """Return the mean Word2Vec vector of a playlist given as track URIs.

    Reads the notebook-level ``w2v_model`` and ``track_uri2id``.

    Parameters
    ----------
    playlist : iterable of track URIs. URIs that are unknown to
        ``track_uri2id`` or whose id is not in the model vocabulary are
        skipped instead of raising ``KeyError``.

    Returns
    -------
    numpy.ndarray
        Mean of the matched vectors, or a zero vector of the model's
        dimensionality when no track could be matched (previously NaN plus a
        RuntimeWarning).
    """
    wv = w2v_model.wv
    w2vecs = []
    for songuri in playlist:
        if songuri not in track_uri2id:
            continue
        songid = str(track_uri2id[songuri])
        if(songid in wv.vocab):
            w2vecs.append(wv[songid])
    if not w2vecs:
        return np.zeros_like(wv.vectors[0])
    return np.mean(np.asarray(w2vecs),axis=0)

def testPlaylistAvg(data):
    all_avg_vectors = [list() for i in range(len(data))]
    names_01 = names[0:1000] + names[9000:]  
    print("Length Bitch = ",len(names_01))    
    similar_names = empty_playlist.get_topN_playlists(names_01, 10, 8)
    print(similar_names)
    avg_vec = np.zeros_like(w2v_model.wv.vectors[0])
    for key,value in tqdm_notebook(similar_names.items()):
        w2vecs = []
        for v in value:
            if(v in new_playlist_vectors):
                w2vecs.append(new_playlist_vectors[v])
        avg_vec = np.mean(np.asarray(w2vecs),axis=0)
        if(key<1000):
            all_avg_vectors[key] = avg_vec
        else:
            songid = data[8000+key][0]
            if(songid in w2v_model.wv.vocab):
                vec = w2v_model.wv[songid]
                all_avg_vectors[8000+key] = 0.5*avg_vec + 0.5*vec
            else:
                all_avg_vectors[8000+key] = avg_vec   
    for index,playlist in enumerate(tqdm_notebook(data[1000:9000])):
        w2vecs = []
        for songid in playlist:
            if(songid in w2v_model.wv.vocab):
                vec = w2v_model.wv[songid]
                w2vecs.append(vec)
        if(len(w2vecs)==0):
            pl_name = names[1000+index]
            if(pl_name!=''):
                similar_names = empty_playlist.get_topN_playlists([pl_name], 10, 1)
                for v in similar_names[0]:
                    if(v in new_playlist_vectors):
                        w2vecs.append(new_playlist_vectors[v])
            else:
                print('No name at index',1000+index)
        if(len(w2vecs)==1):
            new_w2vec = []
            pl_name = names[1000+index]
            if(pl_name!=''):
                similar_names = empty_playlist.get_topN_playlists([pl_name], 10, 1)
                for v in similar_names[0]:
                    if(v in new_playlist_vectors):
                        new_w2vec.append(new_playlist_vectors[v])
                w2vecs.append(np.mean(np.asarray(new_w2vec),axis=0))
            else:
                print('No name at index',1000+index)
                print('Getting artist match...')
                w2vecs.append(artist_vecs[track2artist[int(playlist[0])]])
        avg_vec = np.mean(np.asarray(w2vecs),axis=0)
        all_avg_vectors[1000+index] = avg_vec
    return all_avg_vectors

In [3]:
dataset = '1000k'
# The training playlists are not loaded in this cell; the loader is kept
# disabled here.
#print('Loading data file...')
#with open('data-'+dataset+'.msgpack','rb') as fp:
#    data = msgpack.load(fp,encoding='utf-8')

def _readMsgpack(prefix, **options):
    """Load '<prefix>-<dataset>.msgpack' from the working directory."""
    # NOTE(review): `encoding=` is presumably rejected by newer msgpack
    # releases - confirm the installed version before upgrading.
    with open(prefix + '-' + dataset + '.msgpack', 'rb') as handle:
        return msgpack.load(handle, **options)

print('Loading tracks...')
track_uri2id, track_id2name = _readMsgpack('tracks', encoding='utf-8')
print('Loading artists...')
artist_uri2id, artist_id2name = _readMsgpack('artists', encoding='utf-8')
print('Loading track to artist...')
track2artist = _readMsgpack('track2artist')
# Reverse lookup: track id -> track URI.
track_id2uri = {track_id: uri for uri, track_id in track_uri2id.items()}

Loading tracks...
Loading artists...
Loading track to artist...


In [4]:
# Hyper-parameters of the cached Word2Vec model / Annoy index.
embedding_dim = 256
num_trees = 100
min_count = 10
# The cache directory name encodes the three values above.
path = 'annoy-dim-{}-tree-{}-mincount-{}'.format(embedding_dim, num_trees, min_count)

In [5]:
# Load the cached model and Annoy index from `path`. The empty corpus is never
# read because precomputed=True; num_trees is not forwarded since it is only
# used when the index is built.
w2v_model,annoy_index = getWord2Vec([],embedding_dim,min_count,path=path,precomputed=True)

Loading Word2Vec Model...
Loading Annoy Indexing...


In [None]:
# Build the cached training-playlist embeddings once. The following cell reads
# them back from disk, so nothing needs to be recomputed when the cache exists.
embeddings_fname = path+'/playlist_embeddings.pkl'
if os.path.exists(embeddings_fname):
    print('Found cached playlist embeddings at',embeddings_fname)
else:
    # The corpus is loaded here, under its own name, so the cell works on a
    # fresh kernel and does not collide with the challenge-set `data` below.
    # NOTE(review): assumes 'data-<dataset>.msgpack' holds a list of playlists
    # of song-id strings, as the disabled loader in the lookup cell suggests.
    print('Loading data file...')
    with open('data-'+dataset+'.msgpack','rb') as fp:
        train_playlists = msgpack.load(fp,encoding='utf-8')
    playlist_vectors,pid_order,empty_playlists = averagePlaylistVec(train_playlists)
    print('Keep =',len(pid_order))
    print('Delete =',len(empty_playlists))
    with open(embeddings_fname,'wb') as fp:
        pickle.dump([playlist_vectors,pid_order],fp)
    # Free the corpus; only the embeddings are needed from here on.
    del train_playlists

In [6]:
# Read the cached [playlist_vectors, pid_order] pair back from disk.
# pickle executes code on load: only open caches produced by this notebook.
with open(path+'/playlist_embeddings.pkl','rb') as fp:
    playlist_vectors,pid_order = pickle.load(fp)

In [7]:
# Map each kept playlist position (pid_order entry) to its mean vector.
new_playlist_vectors = {
    pid: playlist_vectors[position]
    for position, pid in enumerate(pid_order)
}

In [8]:
# Positions (in challenge-set order) grouped by how many usable tracks remain.
no_track_index = []
one_track_index = []
rest_index = []
# Per playlist: list of song-id strings, the challenge pid and the name ('' if absent).
challenge_data = []
challenge_pids = []
names = []
filename = '../../challenge_set.json'

with open(filename, "r") as fp:
    data = json.load(fp)["playlists"]

# Wrap the list (not the enumerate generator) so the progress bar knows its total.
for idx,play_list in enumerate(tqdm_notebook(data)):
    song_ids = []
    pid = play_list['pid']
    challenge_pids.append(pid)
    names.append(play_list.get('name',''))
    if(play_list["num_samples"] == 0):
        no_track_index.append(idx)
    elif(play_list["num_samples"] == 1):
        # Single seed track: kept whenever the URI is known, even if the id
        # is not in the Word2Vec vocabulary.
        song = play_list['tracks'][0]
        if song['track_uri'] in track_uri2id:
            song_ids.append(str(track_uri2id[song['track_uri']]))
            one_track_index.append(idx)
        else:
            no_track_index.append(idx)
    else:
        for song in play_list["tracks"]:
            uri = song['track_uri']
            # Unknown URIs are skipped here as well instead of raising KeyError.
            if uri not in track_uri2id:
                continue
            songid = str(track_uri2id[uri])
            if songid in w2v_model.wv.vocab:
                song_ids.append(songid)
        if(len(song_ids) == 1):
            print("Len 1",pid)
            one_track_index.append(idx)
        elif(len(song_ids) == 0):
            print("Len 0",pid)
            no_track_index.append(idx)
        else:
            rest_index.append(idx)
    challenge_data.append(song_ids)

with open('../../challenge_data.pkl','wb') as fp:
    pickle.dump(challenge_data,fp)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Len 1 1000631
Len 1 1006488
Len 0 1011975



In [9]:
# Mean Word2Vec vector per artist, over that artist's in-vocabulary tracks.
vectors_by_artist = defaultdict(list)
for k,v in track2artist.items():
    if(str(k) in w2v_model.wv.vocab):
        vectors_by_artist[v].append(w2v_model.wv[str(k)])
# Freeze into a plain dict: looking up an artist without any vector now raises
# KeyError instead of silently creating and returning an empty list.
artist_vecs = {
    artist: np.mean(np.asarray(vecs),axis=0)
    for artist,vecs in vectors_by_artist.items()
}

In [31]:
# One embedding per challenge playlist, in challenge-set order.
test_vectors = testPlaylistAvg(challenge_data)

Length Bitch =  2000
Length of playlists =  2000




{0: [313, 738, 967, 1203, 1535, 2038, 2049, 3540, 4809, 5089], 1: [3068, 3599, 3705, 6661, 7894, 8999, 18615, 31736, 36435, 40130], 2: [67076, 79708, 82737, 111527, 112882, 116884, 146372, 146802, 150885, 203711], 3: [55926, 69855, 81093, 89804, 164697, 242525, 269043, 342493, 406365, 481176], 4: [16473, 28502, 43009, 47467, 101571, 106838, 143692, 251108, 269181, 272214], 5: [19542, 20214, 28770, 39183, 77606, 105803, 110547, 128856, 139066, 144622], 6: [59, 362, 1556, 4467, 6127, 6421, 6480, 7583, 8348, 8654], 7: [9399, 11304, 13178, 14010, 17055, 18762, 19420, 19594, 19889, 20367], 8: [4236, 24185, 64020, 67449, 137390, 145078, 156611, 160748, 169320, 177239], 9: [22092, 27031, 32125, 33469, 36977, 44255, 60032, 66245, 70411, 129676], 10: [137398, 171342, 180350, 185510, 278505, 291417, 297373, 329569, 336229, 401344], 11: [9584, 14237, 17837, 22237, 39095, 41881, 53814, 62682, 73648, 82180], 12: [683, 2347, 5045, 9031, 12740, 24761, 25249, 26076, 26229, 29187], 13: [229679, 258378,

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8000), HTML(value='')))

Length of playlists =  1
No name at index 2809
Getting artist match...
No name at index 4344


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)





In [None]:
# Inspect the pid at position 4344, the index reported as "No name at index
# 4344" in the recorded output of testPlaylistAvg.
challenge_pids[4344]

In [35]:
# Manual patch: replace the vector at position 4344 with the mean vector of
# artist 50644.
# NOTE(review): both ids are hard-coded; presumably 50644 is the artist of that
# playlist's track - confirm before re-running on different data.
test_vectors[4344] = artist_vecs[50644]

In [34]:
index = 4344
# Current vector of the playlist, followed by the artist vector of each of its
# tracks; the playlist vector is then replaced by the mean of all of them.
w2vec = [test_vectors[index]] + [
    artist_vecs[track2artist[int(track)]] for track in challenge_data[index]
]
avg_vec = np.mean(np.asarray(w2vec),axis=0)
test_vectors[index] = avg_vec

In [36]:
# Persist the challenge-playlist embeddings next to the model cache.
with open(path+'/test_embeddings.pkl','wb') as fp:
    pickle.dump(test_vectors,fp)

In [None]:
# Reload the embeddings written by the previous cell (skips recomputation).
with open(path+'/test_embeddings.pkl','rb') as fp:
    test_vectors = pickle.load(fp)

In [37]:
# Load embeddings from '../ncae/output_embeddings.pkl', a file this notebook
# does not write.
# NOTE(review): `output_vectors` is not referenced by any later cell shown
# here; the recommendation loop reads `test_vectors` - confirm which is intended.
with open('../ncae/output_embeddings.pkl','rb') as fp:
    output_vectors = pickle.load(fp)

In [49]:
def get_topN(vector,inputs,n=500,readable=False):
    """Return up to ``n`` approximate nearest-neighbour tracks for ``vector``.

    Reads the notebook-level ``w2v_model``, ``annoy_index`` and the track /
    artist lookup tables.

    Parameters
    ----------
    vector : query embedding passed to ``most_similar``.
    inputs : sized collection of song ids already in the playlist; these are
        excluded from the result. ``n + len(inputs)`` neighbours are requested
        so that ``n`` remain after filtering.
    n : int
        Maximum number of recommendations.
    readable : bool
        False (default) returns track URIs. True returns
        ``'<track name> - <artist name>'`` strings instead, replacing the
        variant that used to be toggled by commenting lines in and out.

    Returns
    -------
    list
        At most ``n`` entries, in the order returned by ``most_similar``.
    """
    # Set membership keeps the exclusion test O(1) per neighbour.
    already_present = set(inputs)
    approximate_neighbors = w2v_model.wv.most_similar([vector], topn=n+len(inputs), indexer=annoy_index)
    approx_list = []
    for songid,_similarity in approximate_neighbors:
        if(songid in already_present):
            continue
        track_id = int(songid)
        if readable:
            approx_list.append(' '.join([track_id2name[track_id],'-',artist_id2name[track2artist[track_id]]]))
        else:
            approx_list.append(track_id2uri[track_id])
        if(len(approx_list)==n):
            break
    return approx_list[:n]

In [None]:
# First row is the team-info header; every following row is [pid, rec_1 ... rec_n].
recommendations = []
recommendations.append(['team_info','main','neural-panda','v18saboo@g.ucla.edu'])
# Wrap the list itself so the progress bar can show a total.
for idx,playlist in enumerate(tqdm_notebook(test_vectors)):
    pid = challenge_pids[idx]
    # The playlist's own tracks are passed along so they are not recommended back.
    rec = get_topN(playlist,challenge_data[idx])
    recommendations.append([pid] + rec)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [None]:
# Sanity check: compare the embeddings of the first two challenge playlists.
print(test_vectors[0]==test_vectors[1])

In [47]:
# Every row after the header must hold a pid plus 500 tracks; print the
# position of any row that does not.
for i, row in enumerate(recommendations[1:], start=1):
    if len(row) != 501:
        print(i)

In [42]:
# Preview a few rows. recommendations[0] is the team-info header, so row i
# belongs to the playlist whose name is names[i-1].
for i in [1,2,3,10000]:
    print('Playlist Name = ',names[i-1])
    # First ten entries of the row: the pid followed by nine recommendations.
    print(recommendations[i][:10])
    print('---------------------------------------------------')

Playlist Name =  spanish playlist
[1000002, 'Mas Que Tu Amigo - Matt Hunter', 'Habanero - Gente De Zona', 'Yo Quiero Chupar - Grupo Branly', 'Ayer la Vi Bailando por Ahí - Marcos Medos', 'Me Voy Enamorando - Chino & Nacho', 'Llévame Despacio - Paulina Goto', 'Besito Con Lengua - El Chacal', 'Mi Niña Bonita - En Vivo Desde El Anfiteatro El Hatillo, Caracas-Venezuela/2014 - Chino & Nacho', 'Sin Tu Amor (feat. Elijah King) - Mario Bautista']
---------------------------------------------------
Playlist Name =  Groovin
[1000003, 'Move On Up - Extended Version - Curtis Mayfield', 'Always - Classixx Remix - Panama', 'Got To Give It Up (Part 1) - Marvin Gaye', 'Use Me - Bill Withers', 'Something Good Can Work (RAC Mix) - Two Door Cinema Club', 'Cocaine Blues - Escort', "Baby I'm Yours (feat. Irfane) - Breakbot", 'Got To Give It Up - Marvin Gaye', 'Get Down On It - Single Version - Kool & The Gang']
---------------------------------------------------
Playlist Name =  uplift
[1000004, 'Fix My Ey

In [48]:
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it some platforms write an extra blank line after each row.
with open("submission.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows(recommendations)

In [None]:
# Ad-hoc check of the name-based lookup for a handful of playlist names.
similar_names = empty_playlist.get_topN_playlists(['country hits', 'pop', 'songs', 'rock', 'trap music'], 10, 1)
print(similar_names)

In [19]:
# Ten track URIs used as an ad-hoc query playlist.
sample_playlist = [
    "spotify:track:5CQ30WqJwcep0pYcV4AMNc",
    "spotify:track:0hCB0YR03f6AmQaHbwWDe8",
    "spotify:track:2WfaOiMkCvy7F5fcp2zZ8L",
    "spotify:track:6bgS89vk8nMI7sdBdNxDGh",
    "spotify:track:0jHkgTtTaqg5LNCiYDQPUB",
    "spotify:track:4BGJSbB5rAcg4pNzD4gfxU",
    "spotify:track:0MKqeOVdZcUFGJvWpGCKbG",
    "spotify:track:6mFkJmJqdDVQ1REhVfGgd1",
    "spotify:track:3zByVQLvdXUaDTubfWkpCk",
    "spotify:track:0CsM8VGDi38kusMv3pxyj1",
]
sample_vec = averageNewPlaylistVec(sample_playlist)
# Written as a one-element list.
with open(path+'/test_embeddings2.pkl','wb') as fp:
    pickle.dump([sample_vec],fp)

In [29]:
with open('../ncae/output_embeddings2.pkl','rb') as fp:
    output_vector = pickle.load(fp)
# get_topN expects the song ids already in the playlist as `inputs`; passing
# the embedding vector (as before) excluded nothing and inflated the number of
# neighbours requested by the vector's length.
sample_ids = [str(track_uri2id[uri]) for uri in sample_playlist if uri in track_uri2id]
rec = get_topN(output_vector[0],sample_ids,20)

  


In [43]:
# Print "<track id> - <artist id>" for every track of the sample playlist.
for i in sample_playlist:
    print(track_uri2id[i],'-',track2artist[track_uri2id[i]])

64 - 44
7597 - 44
3288 - 1443
505429 - 30350
3375 - 1104
7208 - 1881
448 - 305
7599 - 2855
308 - 200
4435 - 1878


In [None]:
# Show the most recently computed recommendation list (`rec` comes from an
# earlier cell).
print(rec)

In [32]:
# Neighbours of the averaged sample vector itself; the empty `inputs` list
# means no track is excluded from the result.
rec = get_topN(sample_vec,[],20)
print(rec)

['Stairway To Heaven - Led Zeppelin', 'Tiny Dancer - Elton John', 'Benny and the Jets - The Red One Rocketman', 'Sweet Caroline - Neil Diamond', 'American Pie - Don McLean', 'Take On Me - a-ha', 'Wish You Were Here - Pink Floyd', 'Hey Jude - Remastered 2015 - The Beatles', 'Bennie And The Jets - Remastered 2014 - Elton John', 'Stand By Me - Ben E. King', 'Every Breath You Take - Remastered 2003 - The Police', 'Let It Be - Remastered - The Beatles', 'Come Together - Remastered - The Beatles', 'Drift Away - Dobie Gray', 'Imagine - 2010 - Remaster - John Lennon', 'Roxanne - Remastered 2003 - The Police', 'Hotel California - Remastered - Eagles', 'Come On Eileen - Dexys Midnight Runners', 'The Joker - Steve Miller Band', 'Escape (The Pina Colada Song) - Rupert Holmes']
