In [1]:
import json
import os
import msgpack
import pickle
import copy
import numpy as np
import math
import empty_playlist
from annoy import AnnoyIndex
from tqdm import tqdm_notebook
from datetime import datetime
from gensim.models import Word2Vec

In [2]:
def getWord2Vec(data,embedding_dim,min_count=5,path='',precomputed=False):
    """Train a Word2Vec model on ``data`` or load a previously saved one.

    Parameters
    ----------
    data : iterable of token sequences
        Training corpus handed to ``Word2Vec``; ignored when ``precomputed``.
    embedding_dim : int
        Vector size; also part of the model file name.
    min_count : int
        NOTE(review): accepted but not forwarded -- training hard-codes
        ``min_count=1``. Other helpers in this notebook index ``wv[songid]``
        for every song without a vocabulary check, so forwarding it would make
        rare songs raise KeyError. Confirm before changing.
    path : str
        Directory holding ``word2vec-<embedding_dim>.txt``. An empty string
        means the current working directory.
    precomputed : bool
        When True the model is loaded from disk instead of being trained.

    Returns
    -------
    The trained or loaded ``Word2Vec`` model.
    """
    directory = path
    # os.path.join keeps an empty `path` relative to the working directory;
    # plain concatenation would have produced '/word2vec-<dim>.txt'.
    w2v_fname = os.path.join(directory, 'word2vec-' + str(embedding_dim) + '.txt')
    if(not precomputed):
        # Only create the directory when something is about to be written.
        if directory:
            os.makedirs(directory, exist_ok=True)
        print('Creating Word2Vec Model...')
        model = Word2Vec(data,size=embedding_dim,window=5,min_count=1,workers=6,iter=10,negative=10)
        model.init_sims(replace=True)
        print('Initialized Sims')
        model.save(w2v_fname)
        print('Saved Word2Vec model')
    else:
        print('Loading Word2Vec Model...')
        model = Word2Vec.load(w2v_fname)
        print('Loaded Word2Vec Model...')
    return model

def createAnnoyIndex(data,num_trees,path='.'):
    """Build an angular Annoy index from ``data`` and save it under ``path``.

    Parameters
    ----------
    data : dict
        Maps integer item ids to vectors; every vector is added under its key.
    num_trees : int
        Number of trees to build; also part of the saved file name.
    path : str
        Directory for ``annoy-<num_trees>.ann``; created if it does not exist.

    Returns
    -------
    The built ``AnnoyIndex``.

    Raises
    ------
    ValueError
        If ``data`` is empty, since the vector length cannot be determined.
    """
    if not data:
        raise ValueError('createAnnoyIndex needs at least one vector')
    # Read the dimension from any entry rather than assuming key 0 exists.
    dim = next(iter(data.values())).shape[0]
    if path:
        os.makedirs(path, exist_ok=True)
    annoy_fname = os.path.join(path, 'annoy-'+str(num_trees)+'.ann')
    annoy_indexer = AnnoyIndex(dim,metric='angular')  # Length of item vector that will be indexed
    for key,value in tqdm_notebook(data.items()):
        annoy_indexer.add_item(key, value)
    print('Building Annoy Index...')
    annoy_indexer.build(num_trees)
    print('Saving Annoy Index...')
    annoy_indexer.save(annoy_fname)
    print('Done!')
    return annoy_indexer
    
def loadAnnoyIndex(dim,num_trees,path=''):
    """Load the angular Annoy index saved as ``annoy-<num_trees>.ann``.

    Parameters
    ----------
    dim : int
        Length of the vectors stored in the index.
    num_trees : int
        Tree count used in the file name.
    path : str
        Directory containing the index file. An empty string means the
        current working directory.

    Returns
    -------
    The loaded ``AnnoyIndex``.
    """
    # os.path.join keeps the default '' relative to the working directory;
    # plain concatenation would have pointed at '/annoy-<n>.ann'.
    annoy_fname = os.path.join(path, 'annoy-'+str(num_trees)+'.ann')
    print(annoy_fname)
    annoy_indexer = AnnoyIndex(dim,metric='angular')
    annoy_indexer.load(annoy_fname) # super fast, will just mmap the file
    return annoy_indexer

def averagePlaylistVec(data):
    """Return a unit-length mean song embedding for every playlist in ``data``.

    Reads the global ``w2v_model``.

    Parameters
    ----------
    data : sequence of playlists
        Each playlist is an iterable of song-id tokens.

    Returns
    -------
    dict
        Maps the playlist's position in ``data`` to its normalised mean vector.
        A playlist with no usable song gets a random vector (also normalised).
    """
    wv = w2v_model.wv
    dim = wv.vectors[0].shape[0]
    playlist_vectors = {}
    for index,playlist in enumerate(tqdm_notebook(data)):
        # Skip songs missing from the vocabulary instead of raising KeyError,
        # matching averageNewPlaylistVec.
        w2vecs = [wv[songid] for songid in playlist if songid in wv.vocab]
        if(len(w2vecs)==0):
            w2vecs.append(np.random.rand(dim))
        avg_vec = np.mean(np.asarray(w2vecs),axis=0)
        norm = np.linalg.norm(avg_vec)
        # A zero mean cannot be normalised; keep it as-is rather than emit NaNs.
        playlist_vectors[index] = avg_vec / norm if norm > 0 else avg_vec
    return playlist_vectors

def averageNewPlaylistVec(playlist):
    """Return the mean embedding of the known songs in ``playlist``.

    Reads the globals ``w2v_model`` and ``track_uri2id``.

    Parameters
    ----------
    playlist : iterable of track URIs

    Returns
    -------
    numpy.ndarray
        Un-normalised mean of the embeddings of every URI that maps to a song
        id present in the model vocabulary. If no URI qualifies, a zero vector
        of the embedding size is returned.
    """
    wv = w2v_model.wv
    w2vecs = []
    for songuri in playlist:
        # Unknown URIs are skipped instead of raising KeyError.
        track_id = track_uri2id.get(songuri)
        if track_id is None:
            continue
        songid = str(track_id)
        if(songid not in wv.vocab):
            continue
        w2vecs.append(wv[songid])
    if not w2vecs:
        # np.mean of an empty list would yield NaN with a RuntimeWarning.
        return np.zeros_like(wv.vectors[0])
    return np.mean(np.asarray(w2vecs),axis=0)

def testPlaylistAvg(data):
    """Compute one query vector per challenge playlist in ``data``.

    Reads the globals ``names``, ``playlist_vectors``, ``w2v_model``,
    ``artist_vecs``, ``track2artist`` and the ``empty_playlist`` module, plus
    the file ``similar_names_test.pkl``.

    The positions are hard-coded: 0-999 and 9000+ are filled from the pickled
    name matches, 1000-8999 from the playlists' own song embeddings.

    Parameters
    ----------
    data : list of playlists
        Each playlist is a list of song-id tokens (possibly empty).

    Returns
    -------
    list
        One averaged vector per playlist, in the same order as ``data``.
    """
    all_avg_vectors = [list() for i in range(len(data))]
    # The pickle was produced with (kept for provenance):
    #   names_01 = names[0:1000] + names[9000:]
    #   similar_names = empty_playlist.get_topN_playlists(names_01, 10, 8)
    with open('similar_names_test.pkl','rb') as fp:
        similar_names = pickle.load(fp)
    for key,value in tqdm_notebook(similar_names.items()):
        w2vecs = []
        for v in value:
            if(v in playlist_vectors):
                w2vecs.append(playlist_vectors[v])
        # NOTE(review): if none of the matches is in playlist_vectors this
        # mean is NaN -- behaviour left unchanged here.
        avg_vec = np.mean(np.asarray(w2vecs),axis=0)
        if(key<1000):
            all_avg_vectors[key] = avg_vec
        else:
            # Keys >= 1000 address the tail of `data` (offset by 8000).
            if(len(data[8000+key]) != 0):
                songid = data[8000+key][0]
                vec_s = w2v_model.wv[songid]
                vec_a = artist_vecs[track2artist[int(songid)]]
                # Blend of name matches, first song and that song's artist.
                all_avg_vectors[8000+key] = 0.4*avg_vec + 0.4*vec_s + 0.2*vec_a
            else:
                all_avg_vectors[8000+key] = avg_vec
    for index,playlist in enumerate(tqdm_notebook(data[1000:9000])):
        w2vecs = []
        for songid in playlist:
            vec = w2v_model.wv[songid]
            w2vecs.append(vec)
        if(len(w2vecs)==0):
            # No songs: fall back to playlists with a similar name.
            pl_name = names[1000+index]
            if(pl_name!=''):
                matches = empty_playlist.get_topN_playlists([pl_name], 10, 1)
                for v in matches[0]:
                    w2vecs.append(playlist_vectors[v])
            else:
                print('No name at index',1000+index)
        elif(len(w2vecs) == 1):
            # A single song: add the name-match mean, or the song's artist
            # vector when there is no name. `elif` stops a name lookup that
            # returned exactly one match from being repeated.
            new_w2vec = []
            pl_name = names[1000+index]
            if(pl_name!=''):
                matches = empty_playlist.get_topN_playlists([pl_name], 10, 1)
                for v in matches[0]:
                    new_w2vec.append(playlist_vectors[v])
                w2vecs.append(np.mean(np.asarray(new_w2vec),axis=0))
            else:
                print('No name at index',1000+index)
                print('Getting artist match...')
                w2vecs.append(artist_vecs[track2artist[int(playlist[0])]])
        if(len(w2vecs) == 0):
            # Last resort for playlists with neither songs nor usable name
            # matches. This check used to sit right after an append, where it
            # could never fire, so such playlists ended up with a NaN mean.
            w2vecs.append(np.random.rand(w2v_model.wv.vectors[0].shape[0]))
        avg_vec = np.mean(np.asarray(w2vecs),axis=0)
        all_avg_vectors[1000+index] = avg_vec
    return all_avg_vectors

In [3]:
# Run configuration; `path` is the directory name derived from these settings.
embedding_dim, num_trees, min_count = 256, 100, 10
dataset = '1000k'
path = '-'.join(map(str, ('annoy', 'dim', embedding_dim, 'tree', num_trees, 'mincount', min_count)))

In [4]:
print('Loading data file...')
with open('data-'+dataset+'.msgpack','rb') as fp:
    # raw=False decodes msgpack strings to UTF-8 `str`. It replaces the
    # `encoding='utf-8'` argument, which msgpack 1.0 no longer accepts.
    data = msgpack.load(fp,raw=False)

Loading data file...


In [5]:
# Load the pickled lookup tables for the selected `dataset`.
print('Loading tracks...')
with open('tracks-'+dataset+'.pkl','rb') as fp:
    # The pickle holds a pair: URI -> id and id -> name.
    track_uri2id,track_id2name = pickle.load(fp,encoding='utf-8')
print('Loading artists...')
with open('artists-'+dataset+'.pkl','rb') as fp:
    artist_uri2id,artist_id2name = pickle.load(fp,encoding='utf-8')
print('Loading track to artist...')
with open('track2artist-'+dataset+'.pkl','rb') as fp:
    track2artist = pickle.load(fp)
# Reverse mapping, used to turn recommended song ids back into URIs.
track_id2uri = {v:k for k,v in track_uri2id.items()}


Loading tracks...
Loading artists...
Loading track to artist...


In [6]:
# Load the saved model from `path`; the empty corpus is unused because precomputed=True.
w2v_model = getWord2Vec([],embedding_dim,min_count,path=path,precomputed=True)

Loading Word2Vec Model...
Loaded Word2Vec Model...


In [7]:
from collections import defaultdict

# Group the embedding of every in-vocabulary track under its artist id.
_vecs_by_artist = defaultdict(list)
for k,v in track2artist.items():
    if(str(k) in w2v_model.wv.vocab):
        _vecs_by_artist[v].append(w2v_model.wv[str(k)])
# Publish a plain dict of per-artist mean vectors. Leaving this as a
# defaultdict(list) made a lookup of an unknown artist silently insert and
# return [] instead of raising KeyError.
artist_vecs = {
    artist: np.mean(np.asarray(vecs),axis=0)
    for artist, vecs in _vecs_by_artist.items()
}

In [None]:
# Compute the per-playlist mean embeddings for the training data and cache
# them next to the model so later runs can load them instead of recomputing.
playlist_vectors = averagePlaylistVec(data)
#playlist_vectors = np.asarray(playlist_vectors)
#playlist_vectors = list(np.delete(playlist_vectors,empty_playlists))
with open(path+'/playlist_embeddings.pkl','wb') as fp:
    pickle.dump(playlist_vectors,fp)

In [8]:
# Load the cached playlist embeddings written by the previous cell.
with open(path+'/playlist_embeddings.pkl','rb') as fp:
    playlist_vectors = pickle.load(fp)

In [None]:
# Sanity check: number of playlist embeddings available.
print(len(playlist_vectors))

In [8]:
# Annoy index over the playlist embeddings. Un-comment the createAnnoyIndex
# line to rebuild it from `playlist_vectors`; otherwise the saved index is loaded.
annoy_path = './annoy'
num_annoy_trees = 250
#annoy_indexer = createAnnoyIndex(playlist_vectors,num_annoy_trees,annoy_path)
annoy_indexer = loadAnnoyIndex(embedding_dim,num_annoy_trees,annoy_path)

./annoy/annoy-250.ann


In [9]:
# Drop the in-memory playlist embeddings.
# NOTE(review): testPlaylistAvg reads the global `playlist_vectors`, so the cell
# that calls it fails with NameError on a top-to-bottom run unless the
# embeddings are reloaded after this point.
del playlist_vectors

In [9]:
# Parse the challenge set into parallel lists indexed by playlist position:
#   challenge_pids  - playlist ids
#   names           - playlist names ('' when absent)
#   challenge_data  - known song ids (as strings) per playlist
# and record which positions have no, one, or several usable seed tracks.
no_track_index = []
one_track_index = []
rest_index = []
challenge_data = []
challenge_pids = []
names = []
filename = '../../challenge_set.json'

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding.
with open(filename, "r", encoding="utf-8") as fp:
    contents = json.load(fp)["playlists"]

# Wrap the list (not the enumerate object) so the progress bar knows the total.
for idx,play_list in enumerate(tqdm_notebook(contents)):
    song_ids = []
    challenge_pids.append(play_list['pid'])
    names.append(play_list.get('name', ''))
    if(play_list["num_samples"] == 0):
        no_track_index.append(idx)
    elif(play_list["num_samples"] == 1):
        song = play_list['tracks'][0]
        if(song['track_uri'] in track_uri2id):
            song_ids.append(str(track_uri2id[song['track_uri']]))
            one_track_index.append(idx)
        else:
            no_track_index.append(idx)
    else:
        for song in play_list["tracks"]:
            if(song['track_uri'] in track_uri2id):
                song_ids.append(str(track_uri2id[song['track_uri']]))
        # Each position belongs to exactly one bucket: a multi-track playlist
        # with no known track used to be recorded in both lists.
        if(len(song_ids)==0):
            no_track_index.append(idx)
        else:
            rest_index.append(idx)
    challenge_data.append(song_ids)
with open('../../challenge_data.pkl','wb') as fp:
    pickle.dump(challenge_data,fp)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [None]:
# Build one query vector per challenge playlist.
test_vectors = testPlaylistAvg(challenge_data)

In [None]:
# Manual override: use the mean vector of artist 50644 for challenge position 4344.
# NOTE(review): hard-coded indices; presumably patches a vector that could not
# be computed automatically -- confirm and document the reason.
test_vectors[4344] = artist_vecs[50644]

In [None]:
# Cache the challenge query vectors next to the model.
with open(path+'/test_embeddings.pkl','wb') as fp:
    pickle.dump(test_vectors,fp)

In [10]:
# Load the cached challenge query vectors.
with open(path+'/test_embeddings.pkl','rb') as fp:
    test_vectors = pickle.load(fp)

In [None]:
# Scratch check of the index for item 74848.
# NOTE(review): get_topN indexes the include_distances=True result as
# (ids, distances), so len() here would be 2 rather than the neighbour count -- confirm intent.
print(len(annoy_indexer.get_nns_by_item(74848, 5,include_distances=True)))

In [11]:
def get_topN(vector,inputs,top=500):
    """Recommend up to ``top`` track URIs for a playlist query vector.

    Reads the globals ``annoy_indexer``, ``data`` and ``track_id2uri``.

    The nearest playlists to ``vector`` are fetched from the Annoy index. Every
    song in them accumulates a rank-decayed playlist score, and songs are
    ranked by their mean score per containing playlist.

    Parameters
    ----------
    vector : sequence of float
        Query embedding.
    inputs : list
        Song ids already in the playlist; they are never recommended.
    top : int
        Maximum number of URIs to return.

    Returns
    -------
    list of str
        At most ``top`` track URIs, best first. Fewer are returned when the
        neighbouring playlists do not contain enough distinct songs.
    """
    already_present = set(inputs)  # O(1) membership instead of scanning the list
    num_required = top + len(inputs)
    num_similar_playlists = 600
    neighbour_ids, neighbour_dists = annoy_indexer.get_nns_by_vector(
        vector, num_similar_playlists, include_distances=True)
    song_count = dict()  # songid -> (summed playlist score, number of playlists)
    # Iterate over what was actually returned: the index may hold fewer than
    # `num_similar_playlists` items, which used to raise IndexError.
    for rank, (playlist_id, distance) in enumerate(zip(neighbour_ids, neighbour_dists)):
        # NOTE(review): the score grows with distance, so at equal rank a
        # farther playlist weighs more -- confirm this is intended.
        playlist_score = distance*5000*math.exp(-0.22*rank)
        for songid in data[playlist_id]:
            total, hits = song_count.get(songid, (0, 0))
            song_count[songid] = (total + playlist_score, hits + 1)
        # Stop once there are comfortably more candidates than needed.
        if(len(song_count)>=1.5*num_required):
            break
    topN = sorted(song_count.items(), key=lambda x: x[1][0]/x[1][1], reverse=True)[:num_required]
    approx_list = [track_id2uri[int(songid)] for songid, _ in topN
                   if songid not in already_present]
    return approx_list[:top]

In [None]:
# Load embeddings written by the separate `ncae` step (used by the word2vec-based scratch cell below).
with open('../ncae/output_embeddings.pkl','rb') as fp:
    output_vectors = pickle.load(fp)

In [12]:
# Build the submission rows: a team-info header, then one row per challenge
# playlist made of its pid followed by the recommended track URIs.
recommendations = []
recommendations.append(['team_info','main','neural-panda','v18saboo@g.ucla.edu'])
# Wrap the list (not the enumerate object) so the progress bar knows the total.
for idx,playlist in enumerate(tqdm_notebook(test_vectors)):
    pid = challenge_pids[idx]
    rec = get_topN(playlist,challenge_data[idx])
    recommendations.append([pid] + rec)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [13]:
# Every row after the header must hold a pid plus 500 tracks.
for i in range(1,len(recommendations)):
    num_tracks = len(recommendations[i]) - 1  # exclude the leading pid
    if num_tracks < 500:
        # Report the track count itself; the raw row length is one too high.
        print('Length Error at',i,', Expected 500, Got',num_tracks)

In [14]:
import csv
# newline='' lets the csv module control line endings; without it, platforms
# that translate '\n' write an extra blank line after every row.
with open("submission.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(recommendations)

In [None]:
def get_topN(vector,inputs,n=500):
    """Return up to 10 'title - artist' strings for the songs nearest to ``vector``.

    Word2vec-based variant for eyeballing results. It asks the model for
    ``n + len(inputs)`` neighbours, drops songs already in ``inputs`` and keeps
    the first ten that remain.

    NOTE(review): running this cell replaces the Annoy-based ``get_topN``
    defined earlier in the notebook, which returns track URIs.
    """
    wanted = n + len(inputs)
    neighbours = w2v_model.wv.most_similar([vector], topn=wanted)
    readable = [
        ' '.join([track_id2name[int(hit[0])],'-',artist_id2name[track2artist[int(hit[0])]]])
        for hit in neighbours
        if hit[0] not in inputs
    ]
    return readable[:10]

# Try the word2vec-based variant on the first challenge playlist.
rec = get_topN(output_vectors[0],challenge_data[0])
print(rec)

In [None]:
# Playlist names stored alongside the model; `with` closes the file handle,
# which the bare open() call left dangling.
with open('annoy-dim-256-tree-100-mincount-10/playlist_names.pkl', 'rb') as fp:
    choices = pickle.load(fp)

In [None]:
# Print the playlist name for each index in `s`.
# NOTE(review): `s` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel.
for i in s:
    print(choices[i])

In [None]:
# Recompute recommendations for the last challenge playlist (compared against `recommendations[-1]` below).
b = get_topN(test_vectors[-1],challenge_data[-1],500)

In [None]:
# Number of items stored in the Annoy index.
annoy_indexer.get_n_items()

In [None]:
# Spot-check the entry stored at index 257168.
choices[257168]

In [None]:
# First 20 fields of the last submission row (pid followed by track URIs).
recommendations[-1][:20]

In [None]:
# Overlap between the stored recommendations for the last playlist and the recomputed list `b`.
len(set(recommendations[-1][1:]).intersection(set(b)))

In [15]:
# Length of every training playlist, in dataset order.
l = [len(i) for i in data]

In [24]:
# Summary statistics of the playlist lengths collected in `l`.
print('Mean =',np.mean(l))
print('Median =',np.median(l))
# bincount gives the frequency of each length; its argmax is the mode.
counts = np.bincount(l)
print('Mode =',np.argmax(counts))
print('Maximum =',np.max(l))
print('3rd Quartile =',np.percentile(l,75))

Mean = 60.308631
Median = 44.0
Mode = 20
Maximum = 250
3rd Quartile = 83.0
