# Word2Vec with NEG
---

papers:

* [Distributed Representations of Words and Phrases and their Compositionality](https://arxiv.org/abs/1310.4546)
* [word2vec Explained: deriving Mikolov et al.'s negative-sampling word-embedding method](https://arxiv.org/abs/1402.3722)

## Prepare Data

In [1]:
import os
import sys
import nltk
sys.path.append('/'.join(os.getcwd().split('/')[:-1]+['paper_code']))

nltk.download('punkt')
nltk.download('brown')

[nltk_data] Downloading package punkt to /home/simonjisu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /home/simonjisu/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

## Preprocessing

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torchdata

from nltk.corpus import brown
from collections import Counter
import random
import time
import numpy as np

from common.vocabulary import Vocab
from WORD2VEC.word2vec_dataloader import CustomDataset
flatten = lambda d: [tkn for sent in d for tkn in sent ]

remove " \`\` " and " '' " in the corpus

In [3]:
datas = [[tkn.lower() for tkn in sent if tkn not in ["``", "''"]] for sent in brown.sents()]
vocab_counter = Counter(flatten(datas))

In [4]:
len(vocab_counter), len(datas), len(brown.words())

(49813, 57340, 1161192)

prepare datasets

In [5]:
USE_CUDA = torch.cuda.is_available()
DEVICE = 'cuda' if USE_CUDA else None
BATCH = 1024
WINDOW_SIZE = 2
train_data = CustomDataset(datas, window=WINDOW_SIZE, device=DEVICE)
train_loader = torchdata.DataLoader(train_data, batch_size=BATCH, shuffle=True, collate_fn=train_data.collate_fn)

unigram distribution for negative sampling

$$(w, c) \sim p_{words}(w) \dfrac{p_{contexts} (c)^{3/4} }{Z}$$

In [6]:
Z = 10e-5
total_words = len(vocab_counter)
unigram_distribution = []
for w, c in vocab_counter.items():
    unigram_distribution.extend([w]*int(((c/total_words)**(3/4))/Z))
print(total_words, len(unigram_distribution))
unigram_distribution = train_data._numerical(unigram_distribution)

49813 799391


In [7]:
def negative_sampling(targets, unigram_distribution, k, use_cuda=False):
    """
    unigram_distribution: have to be numericalize words
    """
    neg_samples = []
    batch_size = targets.size(0)
    for t in targets.view(-1).cpu().tolist():
        samples = []
        while len(samples) < k:
            word = random.choice(unigram_distribution)
            assert isinstance(word, int), "have to be numericalize words"
            if word == t:
                continue
            samples.append(word)
        neg_samples.append(samples)
    if use_cuda:
        return torch.LongTensor(neg_samples).cuda()
    return torch.LongTensor(neg_samples)

## Object

![](../figs/word2vec.png)


$$\begin{aligned}
J(\theta) &= \dfrac{1}{T}\sum_{t=1}^{T} J_t(\theta)\\
J_t(\theta) &= \underbrace{\log \sigma(u_o^T v_c)}_{(1)} + \underbrace{\sum_{i=1}^{k} \mathbb{E}_{j \backsim P(w)} [\log \sigma(-u_j^T v_c)]}_{(2)}
\end{aligned}$$

* (1) : posivie log score
* (2) : negative log score

In [8]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(Word2Vec, self).__init__()
        self.embedding_w = nn.Embedding(vocab_size, embed_size)
        self.embedding_u = nn.Embedding(vocab_size, embed_size)
        
        init = (2.0 / (vocab_size + embed_size))**0.5 # Xavier init
        nn.init.uniform_(self.embedding_w.weight, -init, init)
        nn.init.uniform_(self.embedding_u.weight, -0.0, 0.0)        
    
    def forward(self, inputs, targets, neg_samples):
        embed = self.embedding_w(inputs)  # B, 1, embed_size
        context = self.embedding_u(targets)  # B, 1, embed_size
        negs = -self.embedding_u(neg_samples)  # B, k, embed_size
        
        pos = context.bmm(embed.transpose(1, 2)).squeeze(2)  # B, 1
        neg = negs.bmm(embed.transpose(1, 2)).sum(1)  # B, k, 1  > B, 1
        nll = F.logsigmoid(pos) + F.logsigmoid(neg)
        return -torch.mean(nll)
    
    def cosine_similarity(self, idx1, idx2):
        wv1 = self.embedding_w.weight[idx1]
        wv2 = self.embedding_w.weight[idx2]
        return F.cosine_similarity(wv1, wv2, dim=0)

In [9]:
V = len(train_data.vocab)
EMBED = 300
K = 10
STEP = 60
LR = 0.01

In [10]:
model = Word2Vec(V, EMBED)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = optim.lr_scheduler.MultiStepLR(gamma=0.1, milestones=[20, 40], optimizer=optimizer)

In [11]:
start_time = time.time()

losses = []
for step in range(STEP):
    scheduler.step()
    for batch in train_loader:
        inputs, targets = batch
        neg_samples = negative_sampling(targets, unigram_distribution, K, use_cuda=USE_CUDA)
        
        model.zero_grad()
        loss = model(inputs, targets, neg_samples)
        
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        
    if step % 5 == 0:
        msg = "[{}/{}] loss {:.4f}".format(step+1, STEP, np.mean(losses))
        print(msg)
        losses = []
        
end_time = time.time()
minute = int((end_time-start_time) // 60)
print('Training Excution time with validation: {:d} m {:.4f} s'.format(minute, (end_time-start_time)-minute*60))

[1/60] loss 1.0859
[6/60] loss 0.7516
[11/60] loss 0.6550
[16/60] loss 0.6301
[21/60] loss 0.5850
[26/60] loss 0.2950
[31/60] loss 0.2482
[36/60] loss 0.2357
[41/60] loss 0.2289
[46/60] loss 0.2192
[51/60] loss 0.2175
[56/60] loss 0.2173
Training Excution time with validation: 143 m 22.8557 s


In [12]:
torch.save(model.state_dict(), '../paper_code/word2vec/model/word2vec.model')

## Test

In [13]:
model.load_state_dict(torch.load('../paper_code/word2vec/model/word2vec.model'))

In [14]:
def most_similar(word, model, vocab, top_k=10):
    sims = []
    for i in range(len(vocab)):
        if vocab.itos[i] == word: 
            continue
        sim = model.cosine_similarity(vocab.stoi[word], i)
        sims.append((vocab.itos[i], sim.item()))
    return sorted(sims, key=lambda x: x[1], reverse=True)[:top_k]

In [15]:
most_similar(word='i', model=model, vocab=train_data.vocab, top_k=10)

[('he', 0.6608016490936279),
 ('it', 0.643374502658844),
 ('you', 0.6358298659324646),
 ('she', 0.6312659382820129),
 ('her', 0.6216399073600769),
 ('they', 0.6160386800765991),
 ('we', 0.6135802268981934),
 ('this', 0.6132360696792603),
 ('that', 0.6096834540367126),
 ('be', 0.6034596562385559)]

In [16]:
most_similar(word='man', model=model, vocab=train_data.vocab, top_k=10)

[('he', 0.5518719553947449),
 ('who', 0.5154294967651367),
 ('him', 0.5083991289138794),
 ('it', 0.5046495199203491),
 ('had', 0.5031024813652039),
 ('one', 0.5026065111160278),
 ('not', 0.5008120536804199),
 ('this', 0.49810266494750977),
 ('you', 0.49751439690589905),
 ('as', 0.4934142231941223)]