In [1]:
import msgpack
import numpy as np
import pickle
from tqdm import tqdm_notebook
from datetime import datetime
from gensim.models import Word2Vec
from gensim.similarities.index import AnnoyIndexer
import os

In [32]:
def getWord2Vec(data,embedding_dim,min_count=5,path='',num_trees=100,precomputed=False):
    directory = path
    annoy_fname = directory + '/annoy'
    w2v_fname = directory + '/word2vec-' + str(embedding_dim) + '.txt'
    if not os.path.exists(directory):
        os.makedirs(directory)
    if(not precomputed):
        print('Creating Word2Vec Model...')
        model = Word2Vec(data, size=embedding_dim, window=5, min_count=min_count, workers=6, iter=10,negative=10)
        model.init_sims()
        print('Initialized Sims')
        model.save(w2v_fname)
        print('Creating Annoy Indexes...')
        annoy_index = AnnoyIndexer(model, num_trees)
        # Persist index to disk
        annoy_index.save(annoy_fname)
    else:
        print('Loading Word2Vec Model...')
        model = Word2Vec.load(w2v_fname)
        print('Loading Annoy Indexing...')
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_fname)
        annoy_index.model = model
        print('Done Loading')

    return model,annoy_index#,embedding_matrix

def averagePlaylistVec(data):
    avg_vec = np.zeros_like(w2v_model.wv.vectors[0])
    playlist_vectors = []
    empty_playlists = []
    pid_order = []
    for index,playlist in enumerate(tqdm_notebook(data)):
        w2vecs = []
        for songid in playlist:
            if(songid in w2v_model.wv.vocab):
                vec = w2v_model.wv[songid]
                w2vecs.append(vec)
        if(len(w2vecs)==0):
            empty_playlists.append(index)
            continue
        pid_order.append(index)
        avg_vec = np.mean(np.asarray(w2vecs),axis=0)
        playlist_vectors.append(avg_vec)
    return playlist_vectors,pid_order,empty_playlists

def get_topN(vector,inputs,n=40):
    num_already_in_playlist = len(inputs)
    approximate_neighbors = w2v_model.wv.most_similar([vector], topn=n+num_already_in_playlist, indexer=annoy_index)
    approx_list = []
    approx_artist = set()
    for pair in approximate_neighbors:
        songid = pair[0]
        if(songid in inputs):
            continue
        approx_list.append(songid)
    return approx_list[:n]


def precision_at_k(expected, predicted):
    """
    Compute precision@k metric. Also known as hit-rate.
    
    Params
    ------
    expected : list of list
        Ground truth recommendations for each playlist.
    predicted : list of list
        Predicted recommendations for each playlist.
    """
    precisions = []
    for i in range(len(expected)):
        #predicted[i] = predicted[i][:len(expected[i])]
        precision = float(len(set(predicted[i]) & set(expected[i]))) / float(len(predicted[i]))
        precisions.append(precision)
    return np.mean(precisions) 
    
    
def compute_dcg(expected,predicted):
    """
    Compute DCG score for each user.
    
    Params
    ------
    expected : list
        Ground truth recommendations for single playlist.
    predicted : list
        Predicted recommendations for single playlist.
    """
    score = [float(el in expected) for el in predicted]
    dcg = np.sum(score / np.log2(1 + np.arange(1, len(score) + 1)))
    return dcg

def dcg_at_k(expected,predicted):
    """
    Compute dcg@k metric. (Discounted Continuous Gain)
    
    Params
    ------
    expected : list of list
        Ground truth recommendations for each playlist.
    predictions : list of list
        Predicted recommendations for each playlist.
    """
    dcg_scores = []
    
    for i in range(len(expected)):
        #predicted[i] = predicted[i][:len(expected[i])]
        dcg = compute_dcg(expected[i],predicted[i])
        dcg_scores.append(dcg)
    return np.mean(dcg_scores)

def ndcg_at_k(expected,predicted):
    """
    Compute ndcg@k metric. (Normalized Discounted Continous Gain)
    
    Params
    ------
    expected : list of list
        Ground truth recommendations for each playlist.
    predicted : list of list
        Predicted recommendations for each playlist.
    """
    ndcg_scores = []
    for i in range(len(expected)):
        #predicted[i] = predicted[i][:len(expected[i])]
        labels = expected[i]
        idcg = compute_dcg(labels,labels)
        true_dcg = compute_dcg(labels,predicted[i])
        ndcg_scores.append(true_dcg/idcg)
    return np.mean(ndcg_scores)


In [36]:
thresh = 80
leave_out = 20
dataset = 1000
pid_list = []
test_set = []
train_set = []
print('Loading data file...')
with open('data-'+str(dataset)+'k.msgpack','rb') as fp:
    data = msgpack.load(fp,encoding='utf-8')
print('Done Loading!')
counter = 0    
for i,d in enumerate(data):
    if len(d) > thresh:
        pid_list.append(i)
        test_set.append(d[-1*leave_out:])
        train_set.append(d[:len(d)-leave_out])
        counter += 1
        if counter > 9999:
            break
        
with open('my_chlng_train.msgpack', 'wb') as fp:
    msgpack.dump(train_set, fp)
with open('my_chlng_test.msgpack', 'wb') as fp:
    msgpack.dump(test_set, fp)
with open('my_chlng_pid_list.msgpack', 'wb') as fp:
    msgpack.dump(pid_list, fp)

Loading data file...
Done Loading!


In [None]:
embedding_dim = 256
num_trees = 100
min_count = 10
path = '-'.join(['annoy','dim',str(embedding_dim),'tree',str(num_trees),'mincount',str(min_count)])
w2v_model,annoy_index = getWord2Vec([],embedding_dim,min_count,path=path,precomputed=True)
playlist_vectors, _, empty_playlists = averagePlaylistVec(train_set)
train_set = np.asarray(train_set)
train_set = list(np.delete(train_set,empty_playlists))
test_set = np.asarray(test_set)
test_set = list(np.delete(test_set,empty_playlists))
pid_list = np.asarray(pid_list)
pid_list = list(np.delete(pid_list,empty_playlists))
with open('my_chlng_train.msgpack', 'wb') as fp:
    msgpack.dump(train_set, fp)
with open('my_chlng_test.msgpack', 'wb') as fp:
    msgpack.dump(test_set, fp)
with open('my_chlng_pid_list.pkl', 'wb') as fp:
    pickle.dump(pid_list, fp)
with open('my_chlng_plist_embed.pkl', 'wb') as fp:
    pickle.dump(playlist_vectors, fp)    

# TEST ANN 

In [38]:
with open('../ncae/challenge_output_embeddings.pkl','rb') as fp:
    output_vectors = pickle.load(fp)
with open('my_chlng_train.msgpack','rb') as fp:
    train_set = msgpack.load(fp, encoding='utf-8')

In [55]:
output_preds = []
for output,inputs in tqdm_notebook(zip(output_vectors, train_set)):
    output_preds.append(get_topN(output, inputs, 200))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [44]:
with open('my_chlng_test.msgpack','rb') as fp:
    test_set = msgpack.load(fp, encoding='utf-8')
hr = []
ndcg = []
dcg = []
for op,ip in tqdm_notebook(zip(output_preds, test_set)):
    hr.append(precision_at_k(ip, op[:20]))
    ndcg.append(ndcg_at_k(ip, op[:20]))
    dcg.append(dcg_at_k(ip, op[:20]))
print('Hit rate:', np.mean(np.asarray(hr)))
print('NDCG:', np.mean(np.asarray(ndcg)))
print('DCG:',np.mean(np.asarray(dcg)))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Hit rate: 0.33224325444463504
NDCG: 0.42361688820673576
DCG: 1.222424066823299


In [56]:
with open('my_chlng_test.msgpack','rb') as fp:
    test_set = msgpack.load(fp, encoding='utf-8')
hr = []
ndcg = []
dcg = []
for op,ip in tqdm_notebook(zip(output_preds, test_set)):
    ip = list(map(int, ip))
    op = list(map(int, op))
    hr.append(precision_at_k([ip], [op]))
    ndcg.append(ndcg_at_k([ip], [op]))
    dcg.append(dcg_at_k([ip], [op]))
print('Hit rate:', np.mean(np.asarray(hr)))
print('NDCG:', np.mean(np.asarray(ndcg)))
print('DCG:',np.mean(np.asarray(dcg)))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Hit rate: 0.0019262705082032816
NDCG: 0.01031328324882872
DCG: 0.07260828197055023
