# Word2Vec: Skip-Gram-Softmax

paper: Efficient Estimation of Word Representations in Vector Space
https://arxiv.org/pdf/1301.3781.pdf

paper: Distributed Representations of Words and Phrases and their Compositionality

https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf

Word2Vec: 딥러닝을 활용하여 단어를 벡터로 표현하는 방법 중 하나이다.


![w2v.png](./figs/w2v.png)

Sentence: The quick brown fox jumps over the lazy dogs. 

![skipgram](./figs/skip-gram.png)

## Forward

* Input: one-hot Vector로 된 단어 size = (V,)
    * I $\rightarrow$ H : $W_{(V, N)}$
* Hidden: Projection 된 히든층 size = (N,)
    * H $\rightarrow$ O : $W'_{(N, V)}$
* Output: 전후로 나오는 단어의 확률들

$H = W^T \cdot X$

$O = (W')^T \cdot H$

## Objective

$$\begin{aligned} \min J(\theta) &= -\dfrac{1}{T} \sum_{t=1}^T \sum_{-m \leq j \leq m,\ j \neq 0} \log P(w_{t+j} | w_t) \\
P(o|c) &= \dfrac{\exp(u_o^T V_c)}{\sum_{w=1}^V \exp(u_w^T V_c)}
\end{aligned}$$


## Update?

$$f = \log \dfrac{\exp(u_o^T V_c)}{\sum_{w=1}^V \exp(u_w^T V_c)}$$

$$\begin{aligned} \dfrac{\partial f}{\partial V_c} 
&= \dfrac{\partial }{\partial V_c} \big(\log(\exp(u_o^T V_c)) - \log(\sum_{w=1}^V \exp(u_w^T V_c))\big) \\
&= u_o - \dfrac{1}{\sum_{w=1}^V \exp(u_w^T V_c)}(\sum_{x=1}^V \exp(u_x^T V_c) u_x ) \\
&= u_o - \sum_{x=1}^V \dfrac{\exp(u_x^T V_c)}{\sum_{w=1}^V \exp(u_w^T V_c)} u_x \\ 
&= u_o - \sum_{x=1}^V P(x | c) u_x
\end{aligned}$$

* $u_o$: vector of the observed (actual) output context word
* $P(x|c)$: probability of word $x$ given the center word $c$  
* $\sum_x P(x|c) u_x$: expected context vector, i.e. every word's output vector $u_x$ weighted by its predicted probability $P(x|c)$  

## Packages load

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as torchdata
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from collections import Counter, defaultdict
torch.manual_seed(1)

<torch._C.Generator at 0x1c0c65f8d0>

In [2]:
# Show the installed torch and nltk versions.
print(torch.__version__)
print(nltk.__version__)

0.3.0.post4
3.2.4


## Data Load

In [3]:
# List the file ids available in the NLTK Gutenberg corpus.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
# Take the first 100 tokenized sentences of 'melville-moby_dick.txt'.
corpus = nltk.corpus.gutenberg.sents('melville-moby_dick.txt')[:100]
# Normalize every token: replace double quotes with single quotes and lower-case.
corpus = [[word.replace('"', "'").lower() for word in sent] for sent in corpus]
# Disabled alternative: additionally re-tokenize each token with word_tokenize.
# corpus = [[w for word in sent for w in word_tokenize(word.replace('"', "'").lower())] for sent in corpus]

In [5]:
class corpus_process(object):
    """Build a word -> index vocabulary and skip-gram (center, context) pairs.

    Index 0 is reserved for the ``'<NULL>'`` token, which stands for padding,
    stopwords and any word not seen by ``fit``.

    Args:
        drop_rate: fraction of the distinct words that is dropped from EACH
            end of the frequency ranking (most frequent and least frequent).
        min_count: words occurring fewer than ``min_count`` times are
            recorded in ``stop_min_count``. They are only recorded; ``fit``
            does not remove them from the vocabulary.
    """

    _NULL = '<NULL>'

    def __init__(self, drop_rate=0.01, min_count=3):
        self.drop_rate = drop_rate
        self.min_count = min_count
        self.vocab2idx = {self._NULL: 0}
        self.idx2vocab = {0: self._NULL}
        self.vocab_counts = Counter()
        self.stopwords = None
        self.stop_min_count = []
        self.V = None
        self.vocab = None

    @staticmethod
    def flatten(nested):
        """Concatenate a list of lists into one flat list."""
        return [item for sub in nested for item in sub]

    def drop_words(self, corpus):
        """Update word counts and pick stopwords from both frequency tails.

        Adds the counts of ``corpus`` to ``vocab_counts``, then sets
        ``stopwords`` (the ``border`` most and least frequent words) and
        ``stop_min_count`` (words with count < ``min_count``).
        """
        self.vocab_counts.update(self.flatten(corpus))
        ranked = self.vocab_counts.most_common()
        border = int(len(ranked) * self.drop_rate)
        stops = ranked[:border] + ranked[::-1][:border]
        self.stopwords = [word for word, _ in stops]
        # Rebuilt from the current counts so repeated calls never leave
        # duplicate entries behind.
        self.stop_min_count = [word for word, count in self.vocab_counts.items()
                               if count < self.min_count]

    def fit(self, corpus):
        """Learn the vocabulary of ``corpus`` (a list of token lists).

        Sets ``vocab2idx``, ``idx2vocab``, ``vocab`` (kept words followed by
        ``'<NULL>'``) and ``V`` (``len(vocab)``). All state from a previous
        ``fit`` is discarded first.
        """
        # Start from a clean state so that refitting does not double-count
        # words or keep indices of words that are no longer in the vocabulary.
        self.vocab_counts = Counter()
        self.vocab2idx = {self._NULL: 0}
        self.drop_words(corpus)

        excluded = set(self.stopwords)
        excluded.add(self._NULL)  # index 0 must stay reserved for '<NULL>'
        # First-occurrence order (Counter keeps insertion order) makes the
        # index assignment reproducible, unlike iterating over a set.
        vocab = [word for word in self.vocab_counts if word not in excluded]
        for i, word in enumerate(vocab, 1):
            self.vocab2idx[word] = i
        self.idx2vocab = {v: k for k, v in self.vocab2idx.items()}
        vocab.append(self._NULL)
        self.vocab = vocab
        self.V = len(vocab)

    def transform_word(self, word):
        """Return the index of ``word``, or the ``'<NULL>'`` index if unknown."""
        return self.vocab2idx.get(word, self.vocab2idx[self._NULL])

    def transform_seq(self, seq):
        """Return a LongTensor with the index of every word in ``seq``."""
        return torch.LongTensor([self.transform_word(w) for w in seq])

    def transform2data(self, corpus, win_size=2):
        """Turn sentences into an (n_pairs, 2) array of (center, context) indices.

        Every sentence is padded with ``win_size`` ``'<NULL>'`` tokens on each
        side and scanned with a window of ``2 * win_size + 1`` tokens. The
        middle token is the center word; every other non-padding token in the
        window yields one pair. An empty corpus gives an array of shape (0, 2).
        """
        width = 2 * win_size + 1
        pad = [self._NULL] * win_size
        pairs = []
        for sent in corpus:
            padded = pad + list(sent) + pad
            for start in range(len(padded) - width + 1):
                window = padded[start:start + width]
                center = self.transform_word(window[win_size])
                for offset, word in enumerate(window):
                    if offset == win_size or word == self._NULL:
                        continue
                    pairs.append((center, self.transform_word(word)))
        # reshape keeps the result two-dimensional even when no pair exists,
        # so callers can always slice columns with data[:, 0] / data[:, 1].
        return np.array(pairs, dtype=np.int64).reshape(-1, 2)

In [6]:
# Fit the vocabulary on the corpus, then convert the corpus into
# (center word index, context word index) pairs with 2 context words per side.
CP = corpus_process(drop_rate=0.001, min_count=3)
CP.fit(corpus)
datas = CP.transform2data(corpus, win_size=2)

## Parameters & Data

In [7]:
V = CP.V  # VOCAB SIZE
N = 100  # EMBEDDING SIZE
BATCH_SIZE = 256
EPOCHS = 200
# Display the shape of the training-pair array: (number of pairs, 2).
datas.shape

(5258, 2)

In [8]:
class CustomDataset(torchdata.Dataset):
    """Dataset over a two-column index array.

    Column 0 of ``data`` is stored in ``self.x`` and column 1 in ``self.y``,
    each as a LongTensor of shape (n, 1). The ``transform`` argument is
    accepted but not used.
    """

    def __init__(self, data, transform=True):
        first_col, second_col = data[:, 0], data[:, 1]
        self.x = torch.LongTensor(first_col).contiguous().view(-1, 1)
        self.y = torch.LongTensor(second_col).contiguous().view(-1, 1)

    def __len__(self):
        # Number of samples in the dataset.
        return len(self.x)

    def __getitem__(self, index):
        # Return the (x, y) sample stored at `index`.
        sample = (self.x[index], self.y[index])
        return sample

In [9]:
# Wrap the index pairs in a dataset and iterate over them in fixed order
# (shuffle=False), in batches of BATCH_SIZE, keeping the last partial batch.
mydataset = CustomDataset(data=datas)
data_loader = torchdata.DataLoader(dataset=mydataset,
                                   batch_size=BATCH_SIZE, 
                                   shuffle=False, 
                                   drop_last=False)

## Model


In [10]:
class Word2Vec(nn.Module):
    """Skip-gram model trained with a full softmax over the vocabulary.

    Two embedding tables of shape (V, N) are kept: ``embed_v`` for center
    words and ``embed_u`` for output (context) words. Both are created with
    ``sparse=True``.

    Args:
        V: vocabulary size.
        N: embedding (projection) size.
    """

    def __init__(self, V, N):
        super(Word2Vec, self).__init__()

        self.V = V  # Vocab size
        self.N = N  # projection size

        self.embed_v = nn.Embedding(self.V, self.N, sparse=True)
        self.embed_u = nn.Embedding(self.V, self.N, sparse=True)
        self._weight_init()

    def _weight_init(self):
        """Initialise center vectors uniformly in +-0.5/N and output vectors to 0."""
        initrange = 0.5 / self.N
        self.embed_v.weight.data.uniform_(-initrange, initrange)
        self.embed_u.weight.data.zero_()

    def forward(self, c, t, o):
        """Return the mean negative log-likelihood of the targets.

        Args:
            c: center word indices, B x 1.
            t: target (observed context) word indices, B x 1.
            o: indices of all vocabulary words used for the softmax
                normalisation, B x V.

        Returns:
            Scalar loss ``-mean(log P(t | c))``.
        """
        c_embeds = self.embed_v(c)  # B x 1 x N
        t_embeds = self.embed_u(t)  # B x 1 x N
        o_embeds = self.embed_u(o)  # B x V x N

        c_column = c_embeds.transpose(1, 2)  # B x N x 1
        # Dot product of every center vector with its target / with all words.
        scores = t_embeds.bmm(c_column).squeeze(2)  # Bx1xN * BxNx1 => Bx1
        norm_scores = o_embeds.bmm(c_column).squeeze(2)  # BxVxN * BxNx1 => BxV

        # log-softmax via the log-sum-exp trick: subtracting the row maximum
        # before exponentiating keeps exp() from overflowing to inf (which
        # would turn the loss into nan). The shift is a constant, so it is
        # detached from the graph.
        row_max = norm_scores.max(1, keepdim=True)[0].detach()  # Bx1
        log_norm = row_max + torch.log(
            torch.sum(torch.exp(norm_scores - row_max), 1, keepdim=True))  # Bx1
        nll = -torch.mean(scores - log_norm)

        return nll  # negative log likelihood

    def predict(self, x):
        """Return the center-word embeddings (``embed_v``) for indices ``x``."""
        embeds = self.embed_v(x)

        return embeds

In [11]:
# Instantiate the model with vocabulary size V and embedding size N.
WV = Word2Vec(V, N)

## Optimizer

In [12]:
# SparseAdam is used because both embedding tables are created with sparse=True.
optimizer = optim.SparseAdam(WV.parameters())

In [None]:
# Indices of every vocabulary word (length V). This does not change between
# batches, so it is built once instead of once per batch.
vocab_idxs = CP.transform_seq(list(CP.vocab))

for epoch in range(EPOCHS):
    total_loss = []
    for batch_x, batch_y in data_loader:
        inputs, targets = Variable(batch_x), Variable(batch_y)  # Bx1
        # Same vocabulary row for every sample in the batch (a view, no copy).
        vocabs = Variable(vocab_idxs.expand(inputs.size(0), V))  # BxV
        WV.zero_grad()

        # Call the module itself rather than .forward() so that module hooks run.
        loss = WV(inputs, targets, vocabs)
        loss.backward()
        optimizer.step()

        # Newer torch versions return a 0-dim loss that must be read with
        # .item(); older versions (no .item()) need loss.data[0].
        total_loss.append(loss.item() if hasattr(loss, 'item') else loss.data[0])

    # Report the mean batch loss every 10 epochs.
    if epoch % 10 == 0:
        print('Epoch : {0}, mean_loss : {1:.2f}'.format(epoch, np.mean(total_loss)))

Epoch : 0, mean_loss : 6.38
Epoch : 10, mean_loss : 5.88
Epoch : 20, mean_loss : 5.52
Epoch : 30, mean_loss : 5.23
Epoch : 40, mean_loss : 5.00
Epoch : 50, mean_loss : 4.77
Epoch : 60, mean_loss : 4.51
Epoch : 70, mean_loss : 4.26
Epoch : 80, mean_loss : 4.03
Epoch : 90, mean_loss : 3.81


In [None]:
# Pick one random entry of the vocabulary list as the query word and display it.
test = np.random.choice(CP.vocab)
test

In [None]:
def word_similarity(word, topn=10):
    """Rank vocabulary words by cosine similarity to ``word``.

    The embedding of ``word`` (looked up through ``CP.transform_word`` and
    ``WV.predict``) is compared with the embedding of every other entry of
    ``CP.vocab``.

    Args:
        word: query word.
        topn: maximum number of results to return.

    Returns:
        A list of ``[vocab_word, cosine_similarity]`` entries, most similar
        first, with at most ``topn`` entries.
    """
    def embed(token):
        # Embedding of a single token as returned by the model's predict().
        token_idx = Variable(torch.LongTensor([CP.transform_word(token)]))
        return WV.predict(token_idx)

    query_vec = embed(word)
    candidates = (CP.vocab[i] for i in range(CP.V))
    scored = [
        [candidate, F.cosine_similarity(query_vec, embed(candidate)).data[0]]
        for candidate in candidates
        if not candidate == word  # the query word itself is skipped
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:topn]

In [None]:
# Show the 10 vocabulary words most similar to the randomly chosen query word.
word_similarity(test, topn=10)