In [1]:
%load_ext autoreload
import sys
import os
import joblib
from word2vec_util import TextCorpusProcess


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
data_path = os.getcwd() + '/data/images/' + 'img_prepared'


In [None]:
data_path = os.getcwd() + '/drive/MyDrive/fb_marketplace/data/images/' + 'img_prepared'

In [2]:
train_pklname = data_path + '_train.pkl'
train_data = joblib.load(train_pklname)
text_data = train_data['desc']

In [None]:
import nltk
nltk.download('stopwords')

In [3]:
%load_ext autoreload
punct_dict = {
    '.': '<PERIOD>',
    ',': '<COMMA>',
    '"': '<QUOTATION_MARK>',
    ';': '<SEMICOLON>',
    '!': '<EXCLAMATION_MARK>',
    '?': '<QUESTION_MARK>',
    '(': '<QUESTION_MARK>',
    ')': '<RIGHT_PAREN>',
    '--': '<HYPHENS>',
    '-': '<HYPHEN>',
    ':': '<COLON>',
    '|': '<PIPE>'
}

textCorpus = TextCorpusProcess(punct_dict=punct_dict, prod_descs=text_data)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
len(textCorpus.word_to_idx.values())

25184

In [4]:
import torch
import numpy as np
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.tensorboard import SummaryWriter
import copy
from skip_gram_neg import SkipGramNeg, NegativeSamplingLoss
from word2vec_util import Word2VecDataloader

In [None]:
%load_ext tensorboard
log_dir_path = '/content/drive/MyDrive/fb_marketplace/w2vruns'
save_dir_path = '/content/drive/MyDrive/fb_marketplace/w2vsave/'
try:
    os.mkdir(log_dir_path)
except:
    pass
try:
    os.mkdir(save_dir_path)
except:
    pass
%tensorboard --logdir '{log_dir_path}'

In [5]:
%load_ext tensorboard
current_path = os.getcwd()
log_dir_path = current_path + '/runs/'
save_dir_path = current_path + '/save/'
try:
    os.mkdir(log_dir_path)
except:
    pass
try:
    os.mkdir(save_dir_path)
except:
    pass
%tensorboard --logdir '{log_dir_path}'

Reusing TensorBoard on port 6006 (pid 6465), started 0:08:17 ago. (Use '!kill 6465' to kill it.)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = SummaryWriter(log_dir=log_dir_path)
# get the noise distribution for negative sampling
word_freqs = np.array(sorted(textCorpus.freq_dict.values(), reverse=True))
unigram_dist = word_freqs/word_freqs.sum()
noise_dist = torch.from_numpy(
    unigram_dist**(0.75)/np.sum(unigram_dist**(0.75)))

# hyperparams
EMBED_DIM = 128
BATCH_SIZE = 16
NEG_SAMPLE_SIZE = 5
WINDOW_SIZE = 3
EPOCHS = 20
LR = 0.001

# instantiating the model
model = SkipGramNeg(
    vocab_size=textCorpus.vocab_size,
    embed_dim=EMBED_DIM,
    noise_dist=noise_dist,
    neg_sample_size=NEG_SAMPLE_SIZE,
    batch_size=BATCH_SIZE,
)
model = model.to(device)

# instantiating the dataloader
dataloader = Word2VecDataloader(
    prod_word_idx_lists=textCorpus.prod_word_idx_lists,
    batch_size=BATCH_SIZE,
    window_size=WINDOW_SIZE,
)

criterion = NegativeSamplingLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [6]:
epochs = EPOCHS
# an arbitrary high value for best_epoch_loss
best_epoch_loss = 100
best_model_wts = copy.deepcopy(model.state_dict())
for e in range(epochs):
    batch_count = 0
    running_loss = 0.0
    # get input, target batches
    for input_words, target_words in dataloader.generate_batch():
        inputs = torch.LongTensor(input_words)
        targets = torch.LongTensor(target_words)
        inputs, targets = inputs.to(device), targets.to(device)

        # input, outpt, and noise vectors
        input_vectors = model.forward_input(inputs)
        output_vectors = model.forward_output(targets)
        noise_vectors = model.generate_neg_samples(
            # len(input_words), exclude_words=None, device=device)
            len(input_words), exclude_words=target_words, device=device)

        # negative sampling loss
        loss = criterion(input_vectors, output_vectors, noise_vectors)
        running_loss += loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_count += 1
    exp_lr_scheduler.step()
    epoch_loss = running_loss / batch_count
    print(f'Epoch {e} / {epochs}')
    print(f'Epoch training loss: {epoch_loss}')
    writer.add_scalar(f'Training loss', epoch_loss, e)
    writer.flush()

    # deep copy the model
    if epoch_loss < best_epoch_loss:
        best_epoch_loss = epoch_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        torch.save(best_model_wts, save_dir_path + 'best_word2vec_ebedding_model.pt')


Epoch 0 / 20
Epoch training loss: 23.238672256469727


KeyboardInterrupt: 

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:

model.load_state_dict(torch.load(
    save_dir_path + 'best_word2vec_ebedding_model.pt'))

In [None]:
# getting embeddings from the embedding layer of our model, by name
embeddings = model.in_embed.weight.to('cpu').data.numpy()

In [None]:
viz_words = 300
tsne = TSNE()
embed_tsne = tsne.fit_transform(embeddings[:viz_words, :])

In [None]:
fig, ax = plt.subplots(figsize=(16, 16))
for idx in range(viz_words):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(
        textCorpus.idx_to_word[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)