In [2]:
import gc
import os
import time

import numpy as np
import pandas as pd
import polars as pl
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.word2vec import LineSentence
from gensim.test.utils import common_texts
from tqdm.notebook import tqdm

# Read the train and test event tables from their parquet files (train first, then test).
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('input/footprint/train.parquet', 'input/footprint/test.parquet')
)

In [3]:
def _drop_consecutive_repeats(events):
    """Return `events` without rows that repeat the previous row's aid in the same session.

    A row is kept when its `aid` or its `session` differs from the row directly
    above it; the first row is always kept because the shifted value is NaN.

    Parameters
    ----------
    events : pd.DataFrame
        Must contain `session` and `aid` columns.
        NOTE(review): presumably rows are already ordered by session and then by
        time, so "previous row" means "previous event of the session" - confirm.

    Returns
    -------
    pd.DataFrame
        The filtered rows with a fresh 0..n-1 index.
    """
    aid_changed = events['aid'] != events['aid'].shift()
    session_changed = events['session'] != events['session'].shift()
    # drop=True discards the old index instead of inserting it as an `index`
    # column; without it every re-run of this cell adds another stray column
    # (`index`, then `level_0`) and the third run raises ValueError.
    return events[aid_changed | session_changed].reset_index(drop=True)


train = _drop_consecutive_repeats(train)
test = _drop_consecutive_repeats(test)

In [4]:
# Build the Word2Vec corpus: one "sentence" per session, made of that session's
# aids in row order, with train and test pooled together.
pooled_events = pd.concat([train, test], ignore_index=True)
sentences = pooled_events.groupby('session')['aid'].apply(list).to_list()

# The pooled frame is no longer needed once the plain Python lists exist.
del pooled_events
gc.collect()

0

In [6]:
class callback(CallbackAny2Vec):
    """Print training diagnostics at the end of every epoch.

    After each epoch one line is printed with the loss reported by the model,
    the sum of all losses reported so far, the wall-clock duration of the
    epoch, and the min / mean / max L2 norm of the rows of `model.wv.vectors`.

    Attributes
    ----------
    epoch : int
        1-based number of the epoch that will be reported next.
    losses : list[float]
        Loss reported at the end of each finished epoch, in order.
    cumu_loss : float
        Sum of `losses`.
    previous_epoch_time : float
        `time.time()` at construction or at the end of the previous epoch.
        The first epoch is therefore timed from construction of the callback,
        so it includes whatever happens between construction and the start of
        training.
    """

    def __init__(self):
        self.epoch = 1
        self.losses = []
        self.cumu_loss = 0.0
        self.previous_epoch_time = time.time()

    def on_epoch_end(self, model):
        """Report loss, timing and vector-norm statistics for the finished epoch.

        Parameters
        ----------
        model
            The model being trained; must provide `get_latest_training_loss()`,
            `wv.vectors` and a writable `running_training_loss` attribute.
        """
        loss = float(model.get_latest_training_loss())

        # One vectorised call over the whole embedding matrix instead of a
        # Python-level np.linalg.norm call per vocabulary entry.
        norms = np.linalg.norm(model.wv.vectors, axis=1)
        norm_min = float(norms.min())
        # Accumulate the mean in float64 so it does not depend on the dtype of
        # the stored vectors.
        norm_avg = float(norms.mean(dtype=np.float64))
        norm_max = float(norms.max())

        now = time.time()
        epoch_seconds = now - self.previous_epoch_time
        self.previous_epoch_time = now

        self.cumu_loss += loss
        print(f"Loss after epoch {self.epoch}: {loss} (cumulative loss so far: {self.cumu_loss}) "+\
              f"-> epoch took {round(epoch_seconds, 2)} s - vector norms min/avg/max: "+\
              f"{round(norm_min, 2)}, {round(norm_avg, 2)}, {round(norm_max, 2)}")
        self.epoch += 1
        self.losses.append(loss)

        # Zero the model's running total so the value read at the end of the
        # next epoch covers that epoch only.
        model.running_training_loss = 0.0

In [68]:
%%time

w2vec = Word2Vec(
    sentences=sentences,
    epochs=100,
    vector_size= 64,
    alpha=0.05,
    min_alpha=0.001,
    window = 5, 
    negative = 12,
    ns_exponent = 0.1, 
    sg = 1, 
    min_count=1, 
    workers=48,
    compute_loss=True, 
    callbacks=[callback()],
)

index_to_aid = w2vec.wv.index_to_key
aid_to_index_dict = {aid:index for index,aid in enumerate(index_to_aid)}
aid_to_index = [aid_to_index_dict[i] for i in range(len(aid_to_index_dict))]
embs = w2vec.wv.vectors[aid_to_index]
print('Word2Vec embeddings have shape',embs.shape)
with open('matrices/w2v_100.npy', 'wb') as f:
    np.save(f, embs)

Loss after epoch 1: 17247084.0 (cumulative loss so far: 17247084.0) -> epoch took 590.66 s - vector norms min/avg/max: 0.06, 2.86, 7.39
Loss after epoch 2: 12454163.0 (cumulative loss so far: 29701247.0) -> epoch took 402.62 s - vector norms min/avg/max: 0.06, 3.43, 7.12
Loss after epoch 3: 11677254.0 (cumulative loss so far: 41378501.0) -> epoch took 399.07 s - vector norms min/avg/max: 0.06, 3.77, 7.61
Loss after epoch 4: 11385046.0 (cumulative loss so far: 52763547.0) -> epoch took 393.5 s - vector norms min/avg/max: 0.06, 4.02, 8.03
Loss after epoch 5: 11186762.0 (cumulative loss so far: 63950309.0) -> epoch took 393.53 s - vector norms min/avg/max: 0.06, 4.21, 8.42
Loss after epoch 6: 11401256.0 (cumulative loss so far: 75351565.0) -> epoch took 387.86 s - vector norms min/avg/max: 0.06, 4.37, 8.62
Loss after epoch 7: 11039624.0 (cumulative loss so far: 86391189.0) -> epoch took 390.87 s - vector norms min/avg/max: 0.06, 4.51, 8.91
Loss after epoch 8: 11187965.0 (cumulative loss s

Loss after epoch 61: 10765931.0 (cumulative loss so far: 666978056.0) -> epoch took 397.37 s - vector norms min/avg/max: 0.06, 6.29, 12.63
Loss after epoch 62: 10648389.0 (cumulative loss so far: 677626445.0) -> epoch took 392.82 s - vector norms min/avg/max: 0.06, 6.3, 12.64
Loss after epoch 63: 10737173.0 (cumulative loss so far: 688363618.0) -> epoch took 388.7 s - vector norms min/avg/max: 0.06, 6.31, 12.69
Loss after epoch 64: 10470603.0 (cumulative loss so far: 698834221.0) -> epoch took 388.98 s - vector norms min/avg/max: 0.06, 6.32, 12.7
Loss after epoch 65: 10499223.0 (cumulative loss so far: 709333444.0) -> epoch took 391.41 s - vector norms min/avg/max: 0.06, 6.33, 12.74
Loss after epoch 66: 10685030.0 (cumulative loss so far: 720018474.0) -> epoch took 390.73 s - vector norms min/avg/max: 0.06, 6.34, 12.78
Loss after epoch 67: 10652273.0 (cumulative loss so far: 730670747.0) -> epoch took 387.79 s - vector norms min/avg/max: 0.06, 6.35, 12.81
Loss after epoch 68: 10532841.