## [Efficient Estimation of Word Representations in Vector Space \(Mikolov et al., 2013\)](https://arxiv.org/pdf/1301.3781.pdf)

In [1]:
# Imports
from collections import defaultdict
import math
import time
import random
import torch

### Why use uniform initialization instead of xavier uniform for word embeddings? Why do we pick the range [-0.25, 0.25]?

[Weight initialization in neural nets](https://kharshit.github.io/blog/2019/02/08/weight-initialization-in-neural-nets)

In [19]:
# Define the model
class WordEmbCbow(torch.nn.Module):
    def __init__(self, nwords, emb_size):
        super(WordEmbCbow, self).__init__()
        
        self.embedding = torch.nn.Embedding(nwords, emb_size)
        
        # Fills the input Tensor with values drawn 
        # from the uniform distribution U(a, b)
        torch.nn.init.uniform_(self.embedding.weight, -0.25, 0.25)
        
        # Projection layer for taking softmax over vocab words
        self.projection = torch.nn.Linear(emb_size, nwords)
        torch.nn.init.uniform_(self.projection.weight, -0.25, 0.25)
        
    def forward(self, words):
        emb = self.embedding(words)
        emb_sum = torch.sum(emb, dim=0) # size = emb_size
        emb_sum = emb_sum.view(1, -1) # size = 1 x emb_size 
        out = self.projection(emb_sum)
        return out 

In [20]:
N = 2  # length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2)
EMB_SIZE = 128  # The size of the embedding

embeddings_location = "embeddings.txt"  # the file to write the word embeddings to
labels_location = "labels.txt"  # the file to write the labels to

In [21]:
# Read in the data
w2i = defaultdict(lambda: len(w2i))
S = w2i["<s>"]
UNK = w2i["<unk>"]


def read_dataset(filename):
    with open(filename, "r") as f:
        for line in f:
            yield [w2i[x] for x in line.strip().split(" ")]


# Read in the data
train = list(read_dataset("../data/ptb/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))
i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)

In [22]:
import os 
if not os.path.exists(labels_location):
    with open(labels_location, 'w') as labels_file:
        for i in range(nwords):
            labels_file.write(i2w[i] + '\n')

In [23]:
# Initialize the model
model = WordEmbCbow(nwords, EMB_SIZE)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

data_type = torch.LongTensor
use_cuda = torch.cuda.is_available()

if use_cuda:
    data_type = torch.cuda.LongTensor
    model.cuda()

In [24]:
# Calculate loss for entire sentence
def calc_sent_loss(sent):
    # Add padding to sentence equal to the size of the window
    padding = [S]*N
    padded_sent = padding + sent + padding 
    
    losses = []
    for i in range(N, len(sent)+N):
        # c is the context vector (does not include actual token)
        before = padded_sent[i-N:i]
        after = padded_sent[i+1:i+N+1]
        c = torch.tensor(before+after).type(data_type)
        
        # t is the target vector
        t = torch.tensor([padded_sent[i]]).type(data_type)
        
        logits = model(c)
        loss = criterion(logits, t)  # loss for predicting target from context
        losses.append(loss)
    return torch.stack(losses).sum()

In [25]:
MAX_LEN = 100

for ITER in range(1):
    print("started iter %r" % ITER)
    
    # Start training
    random.shuffle(train)
    train_words, train_loss = 0, 0.0
    start = time.time()
    model.train()
    
    for sent_id, sent in enumerate(train):
        my_loss = calc_sent_loss(sent)
        train_loss += my_loss.item()
        train_words += len(sent)
        
        # Take step after calculating loss for all words in sent
        optimizer.zero_grad()  # Zero the gradients 
        my_loss.backward()
        optimizer.step()
        
        if (sent_id+1) % 5000 == 0:
            print("--finished {} sentences".format(sent_id+1))
    
    print("iter {}: train loss/word={}, ppl={}, time={}".format(
        ITER,
        train_loss/train_words,
        math.exp(train_loss/train_words),
        time.time()-start))
    
    # Evaluate on dev set 
    dev_words, dev_loss = 0, 0.0
    start = time.time()
    model.eval() 
    for sent_id, sent in enumerate(dev):
        my_loss = calc_sent_loss(sent)
        dev_loss += my_loss.item()
        dev_words += len(sent)
        
    print("iter {}: dev loss/word={}, ppl={}, time={}".format(
        ITER,
        dev_loss/dev_words,
        math.exp(dev_loss/dev_words),
        time.time()-start))

started iter 0
--finished 5000 sentences
--finished 10000 sentences
--finished 15000 sentences
--finished 20000 sentences
--finished 25000 sentences
--finished 30000 sentences
--finished 35000 sentences
--finished 40000 sentences
iter 0: train loss/word=nan, ppl=nan, time=403.1145212650299
iter 0: train loss/word=nan, ppl=nan, time=18.368483543395996
saving embedding files


NameError: name 'embedings_location' is not defined

In [28]:
print("saving embedding files")
with open(embeddings_location, "w") as outfile:
    W_w_np = model.embedding.weight.data.cpu().numpy()
    for i in range(nwords):
        ith_embedding = "\t".join(map(str, W_w_np[i]))
        outfile.write(ith_embedding + "\n")

saving embedding files
