In [1]:
import pickle
import time

import numpy as np
import pandas as pd

import gensim

from matplotlib import pyplot as plt

from evaluation_metrics import r_precision

In [2]:
# Path to project data
data_path = '/recsys/data/spotify/s2v_data/'
# Path to trian and test file
pickle_file = 'sentences_and_maps.pickle'

In [3]:
with open(data_path + pickle_file, 'rb') as handle:
    train, test, track_data_map, reverse_track_lookup = pickle.load(handle)

In [4]:
train = [val.split() for val in train]
test = [val.split() for val in test]

## Model training

In [5]:
model = gensim.models.Word2Vec(sentences=train, vector_size=64, window=7, min_count=10, workers=4)

In [6]:
len(track_data_map)

2262292

In [7]:
len(model.wv)

322701

In [8]:
model.wv.save(data_path + 'song_vectors.kv')

## Hyper parameters tunning
The parameters we change:
- vector size: size of the embeddings
- window: size of window to look at
- min_counts: Minimum examples of a song in order to create embeddings for it

In [9]:
def predict_using_similarity(s_sample, k, wv):
    song_in_dict = False
    for s in s_sample:
        if s in wv.key_to_index:
            song_in_dict = True
            break
    
    if song_in_dict:
        return wv.most_similar_cosmul(s, topn=k)
    else:
        return None

In [10]:
#The parameters we change:
# vector size: size of the embeddings
vector_sizes = [64, 128]
windows = [3, 5]
min_counts = [10 ,20]

In [11]:
params_df = pd.DataFrame(columns=['time', 'size', 'window', 'min_count', 'R-Precision', 'Coverage'])

for vector_size in vector_sizes:
    for window in windows:
        for min_count in min_counts:
            start = time.time()
            
            wv = gensim.models.Word2Vec(sentences=train, vector_size=64, window=7, min_count=10, workers=4).wv
            
            prediction = [predict_using_similarity(sample[:3], 100, wv) for sample in test[50000:52000]]
            test_found = [t for t, tester in zip(test[50000:52000], prediction) if tester]
            predictions_found = [p for p in prediction if p]
            
            params_df = params_df.append({'time': (time.time() - start) / 60,
                                            'size': vector_size,
                                            'window': window,
                                            'min_count': min_count,
                                            'Coverage': len(predictions_found) / len(prediction),
                                            'R-Precision': r_precision(test_found, predictions_found)},
                                           ignore_index=True)
            
            print(f'''Finished for size: {vector_size}, window: {window}, min_count: {min_count}, took: {(time.time() - start) / 60} minutes.''')

Finished for size: 64, window: 3, min_count: 10, took: 5.925591452916463 minutes.
Finished for size: 64, window: 3, min_count: 20, took: 3.943149673938751 minutes.
Finished for size: 64, window: 5, min_count: 10, took: 5.144257326920827 minutes.
Finished for size: 64, window: 5, min_count: 20, took: 4.182573374112447 minutes.
Finished for size: 128, window: 3, min_count: 10, took: 4.571328202883403 minutes.
Finished for size: 128, window: 3, min_count: 20, took: 3.002096164226532 minutes.
Finished for size: 128, window: 5, min_count: 10, took: 2.999345620473226 minutes.
Finished for size: 128, window: 5, min_count: 20, took: 2.973606562614441 minutes.


In [12]:
params_df

Unnamed: 0,time,size,window,min_count,R-Precision,Coverage
0,5.921462,64.0,3.0,10.0,0.107057,0.9865
1,3.93791,64.0,3.0,20.0,0.107878,0.9865
2,5.140761,64.0,5.0,10.0,0.107462,0.9865
3,4.178725,64.0,5.0,20.0,0.108071,0.9865
4,4.567009,128.0,3.0,10.0,0.107142,0.9865
5,2.998999,128.0,3.0,20.0,0.107323,0.9865
6,2.996299,128.0,5.0,10.0,0.10748,0.9865
7,2.970244,128.0,5.0,20.0,0.107437,0.9865
