# Skip-gram with negative sampling

Getting the embeddings by solving the skip-gram fake problem

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

In [2]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
if(USE_CUDA):
    torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [3]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch
        
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

## 1. Data load and Preprocessing 

### 1.1. Load corpus : Gutenberg corpus

Let's use melvill-moby_dick corpus from Gutenberg corpus

In [4]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:500]
corpus = [[word.lower() for word in sent] for sent in corpus]

### 1.2. Exclude sparse words 

Remove stopwords from the corpus dictionary

In [5]:
word_count = Counter(flatten(corpus))

MIN_COUNT = 3
exclude = []

for w, c in word_count.items():
    if c < MIN_COUNT:
        exclude.append(w)

### 1.3. Prepare train data 

In [6]:
vocab = list(set(flatten(corpus)) - set(exclude))

# Create mapping dictionary

word2index = {}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

In [7]:
# length of all corpus token, filtered corpus token length
print(len(set(flatten(corpus))), len(vocab))

2607 478


In [8]:
WINDOW_SIZE = 3

def create_data(ws=3):

    windows =  flatten([list(nltk.ngrams(['<DUMMY>'] * ws + c + ['<DUMMY>'] * ws, ws * 2 + 1)) for c in corpus])

    train_data = []

    for window in windows:
        for i in range(ws * 2 + 1):
            if window[i] in exclude or window[ws] in exclude: 
                continue # min_count
            if i == ws or window[i] == '<DUMMY>': 
                continue
            train_data.append((window[ws], window[i]))
            
    print(train_data[:ws * 2])
    return train_data

train_data = create_data()
train_data[0] # first document

[('(', 'supplied'), ('(', 'by'), ('(', 'a'), ('supplied', '('), ('supplied', 'by'), ('supplied', 'a')]


('(', 'supplied')

In [9]:
X_p = []
y_p = []

for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1, -1))
    y_p.append(prepare_word(tr[1], word2index).view(1, -1))

train_data = list(zip(X_p, y_p))
print(len(train_data),' training data pairs')

31522  training data pairs


### 1.4. Build Unigram Distribution**0.75 

$$P(w)=U(w)^{3/4}/Z$$

In [10]:
Z = 0.001

word_count = Counter(flatten(corpus))
num_total_words = sum([c for w, c in word_count.items() if w not in exclude])

In [11]:
unigram_table = []

for vo in vocab:
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))
    
print(len(vocab), len(unigram_table))

478 3500


### 1.5. Negative Sampling 

In [12]:
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].data.cpu().tolist()[0] if USE_CUDA else targets[i].data.tolist()[0]
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    
    return torch.cat(neg_samples)

## 2. Modeling 

### 2.1 Define model architecture

In [13]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                
        initrange = (2.0 / (vocab_size + projection_dim))**0.5 # Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange) # init
        self.embedding_u.weight.data.uniform_(-0.0, 0.0) # init
        
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        target_embeds = self.embedding_u(target_words) # B x 1 x D
        
        neg_embeds = -self.embedding_u(negative_words) # B x K x D
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # Bx1
        negative_score = torch.sum(neg_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2), 1).view(negs.size(0), -1) # BxK -> Bx1
        
        loss = self.logsigmoid(positive_score) + self.logsigmoid(negative_score)
        
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

### 2.2. Train 

We are interested in generating word embeddings of dimension 30

In [14]:
EMBEDDING_SIZE = 30 
BATCH_SIZE = 256
EPOCH = 100
NEG = 10 # Num of Negative Sampling

losses = []
model = SkipgramNegSampling(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [15]:
%%time

for epoch in range(EPOCH):
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        negs = negative_sampling(targets, unigram_table, NEG)
        model.zero_grad()

        loss = model(inputs, targets, negs)
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.item())
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 0.93
Epoch : 10, mean_loss : 0.70
Epoch : 20, mean_loss : 0.56
Epoch : 30, mean_loss : 0.52
Epoch : 40, mean_loss : 0.49
Epoch : 50, mean_loss : 0.47
Epoch : 60, mean_loss : 0.46
Epoch : 70, mean_loss : 0.45
Epoch : 80, mean_loss : 0.44
Epoch : 90, mean_loss : 0.44
CPU times: user 4min 16s, sys: 798 ms, total: 4min 17s
Wall time: 4min 20s


### 2.3. Test 

Lets check our created embeddings by finding the most similar words

In [16]:
def word_similarity(target, vocab):
    if USE_CUDA:
        target_V = model.prediction(prepare_word(target, word2index))
    else:
        target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: 
            continue
        
        if USE_CUDA:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        else:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [17]:
test = random.choice(list(vocab))
test

'huge'

In [18]:
word_similarity(test, vocab)

[['red', 0.6175423860549927],
 ['teeth', 0.6154489517211914],
 ['saw', 0.6089832782745361],
 ['board', 0.5986260175704956],
 ['deep', 0.5942798256874084],
 ['jaws', 0.592190682888031],
 ['ships', 0.5905594825744629],
 ['royal', 0.5871720314025879],
 ['hand', 0.573773205280304],
 ['aloft', 0.5672186613082886]]

## 3. Extracting Embeddings

Our main goal in this fake task is to extract the word embeddings

In [19]:
# working on gpu, need cpu()
list(model.parameters())[0].cpu().detach().numpy()

array([[-0.635266  , -3.6268964 ,  0.58066475, ..., -0.7340099 ,
         1.76451   ,  2.3171337 ],
       [-0.5256893 ,  0.25719434, -1.0269786 , ..., -1.965328  ,
        -2.1790724 ,  1.5324991 ],
       [-1.4854066 , -0.3895261 , -2.121851  , ..., -1.353419  ,
        -2.848737  , -1.5324385 ],
       ...,
       [ 0.78071016, -1.8877364 , -1.8727735 , ..., -1.3137099 ,
        -0.46396822, -0.01122358],
       [ 0.51682734, -1.7216606 ,  0.2473447 , ...,  0.10838561,
        -0.18622133, -0.20630579],
       [-0.49094895, -1.0697956 , -0.07435337, ...,  0.78838664,
        -1.7043719 , -1.4479324 ]], dtype=float32)

In [20]:
import pandas as pd

if USE_CUDA:
    embeds = list(model.parameters())[0].cpu().detach().numpy()
else:
    embeds = list(model.parameters())[0].detach().numpy()
    
vectors = pd.DataFrame(embeds,index=index2word.values())

## 4. Embedding Visualisation

We can visualise our word embeddings using a dimension reduction algorithm tsne

In [21]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
top_words_tsne = tsne.fit_transform(vectors)

In [22]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

p = figure(tools="pan,wheel_zoom,box_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE embedding representation")

source = ColumnDataSource(data=dict(x1=top_words_tsne[:,0],
                                    x2=top_words_tsne[:,1],
                                    names=list(vectors.index)))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)