# Skip-gram with naive softmax

Learning word embeddings by training a model on the skip-gram "fake" (auxiliary) task: predicting the context words that surround a given center word.

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter

def flatten(l):
    """Concatenate a list of lists into one flat list (one level deep).

    Args:
        l: iterable of iterables, e.g. a list of tokenised sentences.

    Returns:
        A new list holding every item of every sub-iterable, in order.
    """
    return [item for sublist in l for item in sublist]


# Seed every random number generator the notebook draws from:
# `random` drives batch shuffling and the test-word pick, `torch` drives the
# random initialisation of the embedding weights.
SEED = 1024
random.seed(SEED)
torch.manual_seed(SEED)

In [2]:
# Device configuration: use the first GPU when CUDA is available, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
gpus = [0]
if USE_CUDA:
    # Only select a CUDA device when one exists; asking for a CUDA device on a
    # machine without CUDA support fails.
    torch.cuda.set_device(gpus[0])

# Tensor constructors bound to the active device, used by the helper functions below.
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [3]:
# for batch looping during training
def getBatch(batch_size, train_data):
    """Shuffle ``train_data`` in place and yield it in consecutive mini-batches.

    Args:
        batch_size: maximum number of examples per batch; must be positive.
        train_data: list of training examples. It is shuffled IN PLACE, so the
            caller's list is reordered on every call.

    Yields:
        Slices of ``train_data`` of length ``batch_size``; the last slice holds
        the remainder and may be shorter. Nothing is yielded for empty input.

    Raises:
        ValueError: if ``batch_size`` is not positive (raised when iteration
            starts, because this is a generator).
    """
    if batch_size <= 0:
        # A non-positive step would never advance through the data.
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]
        
def prepare_sequence(seq, word2index):
    """Convert a sequence of words into a LongTensor of vocabulary indices.

    Words without an entry in ``word2index`` are mapped to the index of the
    ``"<UNK>"`` token.
    """
    indices = []
    for token in seq:
        token_id = word2index.get(token)
        if token_id is None:
            # out-of-vocabulary word: fall back to the unknown-word index
            token_id = word2index["<UNK>"]
        indices.append(token_id)
    return Variable(LongTensor(indices))

def prepare_word(word, word2index):
    """Convert a single word into a one-element LongTensor holding its index.

    A word without an entry in ``word2index`` is mapped to the index of the
    ``"<UNK>"`` token.
    """
    word_id = word2index.get(word)
    if word_id is None:
        # out-of-vocabulary word: fall back to the unknown-word index
        word_id = word2index["<UNK>"]
    return Variable(LongTensor([word_id]))

## 1. Data load and Preprocessing 

### 1.1. Load corpus : Gutenberg corpus

Let's use the `melville-moby_dick` text from NLTK's Gutenberg corpus.

In [4]:
# List the text files exposed by NLTK's Gutenberg corpus reader.
# NOTE(review): presumably needs the corpus installed locally (e.g. via nltk.download('gutenberg')) — confirm.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [5]:
# Load Moby Dick as tokenised sentences and lower-case every token, giving a
# list of sentences in which each sentence is a list of strings.
corpus = [
    [token.lower() for token in sentence]
    for sentence in nltk.corpus.gutenberg.sents('melville-moby_dick.txt')
]
corpus[0]

['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']']

In [6]:
# Let's keep only the first 500 sentences of the corpus
corpus = corpus[:500]

### 1.2. Extract Stopwords from unigram distribution's tails

`flatten` is a helper that concatenates a list of lists into a single flat list.

In [7]:
# Unigram frequencies over every token of every sentence in the corpus.
word_count = Counter(token for sentence in corpus for token in sentence)
# Number of words to cut from EACH end of the frequency ranking (1% of the distinct words).
border = int(0.01 * len(word_count))

In [8]:
# Stop words = the `border` most frequent plus the `border` least frequent
# (word, count) entries of the frequency ranking.
ranked = word_count.most_common()
stopwords = ranked[:border] + ranked[::-1][:border]

In [9]:
# `stopwords` holds (word, count) tuples the first time this cell runs; keep
# only the word. The isinstance guard keeps the cell idempotent: without it, a
# second run would replace every word with its first character.
stopwords = [entry[0] if isinstance(entry, tuple) else entry for entry in stopwords]
stopwords[:10]

[',', 'the', '.', 'of', 'a', 'and', 'in', 'to', '--', '"']

### 1.3. Build vocab

Remove the stopwords from the corpus vocabulary using a set difference, and add the special `<UNK>` token that stands in for unknown words.

In [10]:
# Vocabulary = distinct corpus tokens minus the stop words.
# Sorting pins the order: iteration order of a set of strings can change from
# one interpreter run to the next (string hash randomisation), which would
# otherwise give every run a different word -> index mapping.
vocab = sorted(set(flatten(corpus)) - set(stopwords))
vocab.append('<UNK>')

# word -> index mapping; index 0 is reserved for the unknown-word token
word2index = {'<UNK>': 0}

for vo in vocab:
    if vo not in word2index:
        word2index[vo] = len(word2index)

# index -> word mapping (inverse of word2index)
index2word = {v: k for k, v in word2index.items()}

In [11]:
# number of distinct tokens in the corpus, then the vocabulary size after
# stop-word removal (the latter counts the appended '<UNK>' entry)
print(len(set(flatten(corpus))), len(vocab))

2607 2556


### 1.4. Prepare train data 

Let's prepare the training pairs: each center word paired with every context word inside its window.

In [12]:
WINDOW_SIZE = 3  # number of context words taken on each side of the centre word


def create_data(ws=WINDOW_SIZE, sentences=None):
    """Build skip-gram (centre, context) training pairs.

    For every token of every sentence, pair it with each token that lies at
    most ``ws`` positions to its left or right within the same sentence.
    Pairs are emitted sentence by sentence, centre by centre, with context
    positions in left-to-right order.

    Args:
        ws: window size, i.e. how many neighbours to take on each side.
        sentences: list of tokenised sentences (lists of strings). Defaults to
            the notebook-level ``corpus``.

    Returns:
        List of ``(centre_word, context_word)`` tuples.
    """
    if sentences is None:
        sentences = corpus  # fall back to the notebook-level corpus

    train_data = []
    for sentence in sentences:
        n_tokens = len(sentence)
        for center_pos, center in enumerate(sentence):
            # Clip the window to the sentence boundaries instead of padding.
            first = max(0, center_pos - ws)
            last = min(n_tokens, center_pos + ws + 1)
            for context_pos in range(first, last):
                if context_pos != center_pos:
                    train_data.append((center, sentence[context_pos]))

    # preview of the first few pairs
    print(train_data[:ws * 2])
    return train_data

# create skip-gram (centre word, context word) pairs
train_data = create_data()
train_data[0] # first (centre, context) pair

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


('[', 'moby')

In [13]:
# Turn every (centre, context) word pair into a pair of 1 x 1 index tensors.
# NOTE: this rebinds `train_data` from word pairs to tensor pairs, so the cell
# must not be re-run without first re-running the cell that builds the word pairs.
X_p = [prepare_word(center, word2index).view(1, -1) for center, _ in train_data]
y_p = [prepare_word(context, word2index).view(1, -1) for _, context in train_data]

train_data = list(zip(X_p, y_p))

## 2. Modeling

### 2.1 Define model architecture

In [14]:
class Skipgram(nn.Module):
    """Skip-gram model trained with a full ("naive") softmax over the vocabulary.

    Holds two embedding tables of shape ``vocab_size x projection_dim``:
    ``embedding_v`` for centre words and ``embedding_u`` for context words.
    """

    def __init__(self, vocab_size, projection_dim):
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        # centre embeddings start uniform in [-1, 1); context embeddings start at zero
        nn.init.uniform_(self.embedding_v.weight, -1, 1)
        nn.init.constant_(self.embedding_u.weight, 0.0)

    def forward(self, center_words, target_words, outer_words):
        """Return the mean negative log-likelihood of the target words.

        Args:
            center_words: B x 1 indices of the centre words.
            target_words: B x 1 indices of the true context words.
            outer_words: B x V indices of all words the softmax normalises over.

        Returns:
            Scalar tensor: ``-mean(log softmax)`` of the target scores.
        """
        center_embeds = self.embedding_v(center_words)  # B x 1 x D
        target_embeds = self.embedding_u(target_words)  # B x 1 x D
        outer_embeds = self.embedding_u(outer_words)    # B x V x D

        center_t = center_embeds.transpose(1, 2)              # B x D x 1
        scores = target_embeds.bmm(center_t).squeeze(2)       # Bx1xD * BxDx1 => Bx1
        norm_scores = outer_embeds.bmm(center_t).squeeze(2)   # BxVxD * BxDx1 => BxV

        # log-softmax computed as score - logsumexp(all scores). This avoids
        # exponentiating raw scores, which overflows to inf (and then NaN) once
        # the scores grow large.
        log_probs = scores - torch.logsumexp(norm_scores, dim=1, keepdim=True)  # B x 1

        return -torch.mean(log_probs)  # negative log likelihood

    def prediction(self, inputs):
        """Look up the centre-word embeddings for the given word indices."""
        embeds = self.embedding_v(inputs)

        return embeds

### 2.2. Train  

We are interested in generating word embeddings of dimension 30

In [15]:
# Hyper-parameters
EMBEDDING_SIZE = 30            # dimensionality of each word vector
BATCH_SIZE = 256               # training pairs per optimisation step
EPOCH = 100                    # passes over the training pairs
VOCAB_SIZE = len(word2index)   # number of rows in each embedding table

# Build the model (weights are initialised in the constructor) and move it to
# the GPU when one is in use.
model = Skipgram(vocab_size=VOCAB_SIZE, projection_dim=EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()

# Adam optimiser over every model parameter
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [16]:
# Index tensor covering the whole vocabulary (the softmax normaliser).
# It is identical for every batch, so build it once rather than once per step.
vocab_indices = prepare_sequence(list(vocab), word2index)  # V

losses = []
for epoch in range(EPOCH):

    # loop through the shuffled mini-batches
    for batch in getBatch(BATCH_SIZE, train_data):

        inputs, targets = zip(*batch)

        inputs = torch.cat(inputs)    # B x 1
        targets = torch.cat(targets)  # B x 1
        vocabs = vocab_indices.expand(inputs.size(0), len(vocab))  # B x V

        optimizer.zero_grad()

        # the model returns the loss value directly rather than predictions
        loss = model(inputs, targets, vocabs)

        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    # every 10th epoch: report the mean batch loss accumulated since the last report
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 5.93
Epoch : 10, mean_loss : 4.35
Epoch : 20, mean_loss : 3.89
Epoch : 30, mean_loss : 3.82
Epoch : 40, mean_loss : 3.79
Epoch : 50, mean_loss : 3.78
Epoch : 60, mean_loss : 3.77
Epoch : 70, mean_loss : 3.76
Epoch : 80, mean_loss : 3.76
Epoch : 90, mean_loss : 3.75


### 2.3. Test

Let's check the learned embeddings by finding the words most similar to a given word.

In [17]:
# word similarity based on cosine similarity
def word_similarity(target, vocab):
    """Return the 10 vocabulary words most similar to ``target``.

    Similarity is the cosine similarity between the centre-word embeddings
    produced by the notebook-level ``model``.

    Args:
        target: the query word.
        vocab: sequence of candidate words; entries equal to ``target`` are skipped.

    Returns:
        Up to 10 ``[word, cosine_similarity]`` lists, most similar first.
    """
    candidates = [word for word in vocab if word != target]
    if not candidates:
        return []

    # Embed all candidates in one lookup instead of one model call per word.
    with torch.no_grad():
        target_V = model.prediction(prepare_word(target, word2index))             # 1 x D
        candidate_V = model.prediction(prepare_sequence(candidates, word2index))  # N x D
        sims = F.cosine_similarity(target_V.expand_as(candidate_V), candidate_V).tolist()

    similarities = [[word, sim] for word, sim in zip(candidates, sims)]
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]  # sort by similarity

In [18]:
# pick a random vocabulary word to query (vocab is already a list, so no copy is needed)
test = random.choice(vocab)
test

'pekee'

In [19]:
# ten nearest neighbours of the sampled word, ranked by cosine similarity
word_similarity(test, vocab)

[['erromangoan', 0.970512866973877],
 ['fegee', 0.9619144201278687],
 ['nuee', 0.9046104550361633],
 ['regulating', 0.7587066888809204],
 ['shuddered', 0.7463520765304565],
 ['montaigne', 0.7317308187484741],
 ['moses', 0.7304397225379944],
 ['unwieldy', 0.722458004951477],
 ['senate', 0.7208307981491089],
 ['lodgings', 0.7186830043792725]]

## 3. Extracting Embeddings

Our main goal in this fake task is to extract the word embeddings

In [20]:
# Centre-word embedding matrix (the model's first registered parameter),
# detached from the graph and copied to host memory so NumPy can read it.
model.embedding_v.weight.detach().cpu().numpy()

array([[ 0.31058446,  0.38394982, -0.27791247, ...,  0.27956843,
        -0.48745468,  0.3338015 ],
       [ 1.751076  ,  0.40787688, -1.0316286 , ..., -0.34953263,
        -0.5261852 ,  1.1141593 ],
       [ 0.36493415, -0.6966971 , -0.7607147 , ...,  0.42251995,
        -1.2338314 ,  0.04637641],
       ...,
       [ 0.70431477,  1.0412371 , -0.06481844, ...,  0.66100353,
        -1.6041203 ,  0.45222238],
       [ 0.78536123, -0.20106986,  0.4013988 , ...,  1.1091657 ,
        -0.03339152,  1.5519061 ],
       [-0.33312085,  1.0145313 ,  0.25008214, ...,  1.1328151 ,
        -1.0193429 ,  0.7683327 ]], dtype=float32)

In [21]:
import pandas as pd

# Centre-word embedding matrix as a NumPy array. `.cpu()` leaves a tensor that
# is already on the CPU untouched, so no separate CUDA branch is needed.
embeds = model.embedding_v.weight.detach().cpu().numpy()

# Row i of `embeds` belongs to word index i, so label the rows by walking the
# indices in order instead of relying on the dict's iteration order.
vectors = pd.DataFrame(embeds, index=[index2word[i] for i in range(len(index2word))])

## 4. Embedding Visualisation

We can visualise our word embeddings in two dimensions using the dimensionality-reduction algorithm <code>t-SNE</code>.

In [22]:
from sklearn.manifold import TSNE
# Reduce the embedding vectors to 2 components for plotting; random_state is pinned to 0.
tsne = TSNE(n_components=2, random_state=0)
top_words_tsne = tsne.fit_transform(vectors)

In [23]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

# send bokeh output to the notebook
output_notebook()

# One entry per word: its two t-SNE coordinates plus the word itself as label text.
source = ColumnDataSource(
    data=dict(
        x1=top_words_tsne[:, 0],
        x2=top_words_tsne[:, 1],
        names=list(vectors.index),
    )
)

p = figure(
    tools="pan,wheel_zoom,box_zoom,reset,save",
    toolbar_location="above",
    title="word2vec T-SNE embedding representation",
)
p.scatter(x="x1", y="x2", size=8, source=source)

# Text label for every point, centred horizontally and offset by 6 in y.
labels = LabelSet(
    x="x1",
    y="x2",
    text="names",
    y_offset=6,
    text_font_size="8pt",
    text_color="#555555",
    source=source,
    text_align='center',
)
p.add_layout(labels)

show(p)