# Assignment 1.4: Negative sampling (15 points)

You may have noticed that word2vec is really slow to train. Especially with big (> 50 000) vocabularies. Negative sampling is the solution.

The task is to implement word2vec with negative sampling.

This is what was discussed in Stanford lecture. The main idea is in the formula:

$$ L = \log\sigma(u^T_o \cdot u_c) + \sum^k_{i=1} \mathbb{E}_{j \sim P(w)}[\log\sigma(-u^T_j \cdot u_c)]$$

Where $\sigma$ - sigmoid function, $u_c$ - central word vector, $u_o$ - context (outside of the window) word vector, $u_j$ - vector or word with index $j$.

The first term calculates the similarity between positive examples (word from one window)

The second term is responsible for negative samples. $k$ is a hyperparameter - the number of negatives to sample.
$\mathbb{E}_{j \sim P(w)}$
means that $j$ is distributed accordingly to unigram distribution.

Thus, it is only required to calculate the similarity between positive samples and some other negatives. Not across all the vocabulary.

Useful links:
1. [Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf)
1. [Distributed Representations of Words and Phrases and their Compositionality](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable
from torch import cuda
import random

from numpy.random import multinomial
from collections import Counter
from collections import OrderedDict
from tqdm import tqdm
import numpy as np
import pandas as pd
import pickle
import time

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
with open("/content/gdrive/My Drive/DeepPavlov/text8/text8","r") as f:
  corpus = f.read()
with open("/content/gdrive/My Drive/DeepPavlov/w2v_data.pkl","rb") as f:
  [X, y, word2index, index2word] = pickle.load(f) 

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
batch_size = 512
window_size = X.shape[1]/2

In [0]:
# https://programmer.group/pytorch-implements-word2vec.html
# https://rguigoures.github.io/word2vec_pytorch/

def sample_negative(corpus, sample_size=5):
    sample_probability = OrderedDict()
    word_counts = OrderedDict(Counter(corpus.split()).most_common(len(word2index)))
    normalizing_factor = sum([v**0.75 for v in word_counts.values()])
    for word in word_counts:
        sample_probability[word] = word_counts[word]**0.75 / normalizing_factor
    words = list(word_counts.keys())
    neg_samples = []
    for i in range(X.shape[0]):
        multi_df = multinomial(sample_size, list(sample_probability.values()))
        sampled_index = np.where(multi_df>0)[0]
        if len(sampled_index) < sample_size:
          sampled_index = np.hstack((sampled_index, random.sample(range(len(word2index)), \
                                                                     int(sample_size - len(sampled_index)))))
        assert len(sampled_index) == sample_size
          
        yield sampled_index

neg_sampler = sample_negative(corpus, window_size*2)

In [5]:
print(X.shape)

(17005197, 10)


In [0]:
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.1):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.
        
    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]
    
    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100*gain,2)))
        if gain < self.min_percent_gain:
            return True
        else:
            return False

class My_Dataset(Dataset):
    def __init__(self, x, y, neg_sampler):
        self.x = x
        self.y = y
        self.neg_sampler = neg_sampler

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # print(self.y[idx].shape, self.x[idx, :].shape, next(neg_sampler).shape)
        return (self.y[idx], self.x[idx, :], next(neg_sampler))


batcher_train = DataLoader(My_Dataset(X, y, neg_sampler), batch_size=batch_size, shuffle=False)

In [0]:
class Word2Vec(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super(Word2Vec, self).__init__()
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_example):
        emb_target = self.embeddings_target(target_word)
        emb_context = torch.sum(self.embeddings_context(context_word), axis=1)
        emb_product = torch.mul(emb_target, emb_context)
        emb_product = torch.sum(emb_product, dim=1)
        out = torch.sum(F.logsigmoid(emb_product))

        emb_negative = self.embeddings_context(negative_example)
        emb_product = torch.bmm(emb_negative, emb_target.unsqueeze(2))
        emb_product = torch.sum(emb_product, dim=1)
        out += torch.sum(F.logsigmoid(-emb_product))
        return -out

In [0]:
loss_function = nn.CrossEntropyLoss()
net = Word2Vec(embedding_size=200, vocab_size=len(word2index))
optimizer = optim.SGD(net.parameters(), lr=0.01)
early_stopping = EarlyStopping(patience=5, min_percent_gain=1)

if cuda.is_available():
  net = net.cuda()
iters=0
while True:
    losses = []
    for batch in batcher_train:
        net.zero_grad()
        target_tensor, context_tensor, negative_tensor = batch
        if cuda.is_available():
            target_tensor, context_tensor, negative_tensor = \
                  target_tensor.cuda(), context_tensor.cuda(), negative_tensor.cuda()

        loss = net(target_tensor, context_tensor, negative_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.data.detach().cpu().numpy())
        if iters%50==0:
          print("Loss: ", np.mean(losses))
        iters+=1

    print("Loss: ", np.mean(losses))
    early_stopping.update_loss(np.mean(losses))
    if early_stopping.stop_training():
        break

Loss:  18782.613
Loss:  14256.356
Loss:  12972.355
Loss:  12106.569
Loss:  11565.123
Loss:  11179.974
Loss:  10834.825
Loss:  10550.44
Loss:  10399.021
Loss:  10265.837
Loss:  10029.12
Loss:  9815.693
Loss:  9602.228
Loss:  9397.027
Loss:  9253.211
Loss:  9110.246
