### [`word2vec` Explained: Deriving Mikolov et al.’s Negative-Sampling Word-Embedding Method](https://arxiv.org/pdf/1402.3722.pdf)

In [1]:
from collections import defaultdict
import math
import numpy as np
import time
import random
import torch
import torch.nn.functional as F

In [26]:
class WordEmbSkip(torch.nn.Module):
    """Skip-gram model with negative sampling.

    Two separate embedding tables are kept: one for target words and one
    for context words. ``forward`` returns the negated sum of
    log-sigmoid scores between one target word and a set of context
    (or negative-sample) words.
    """

    def __init__(self, nwords, emb_size):
        """Create both embedding tables.

        Args:
            nwords: number of rows (vocabulary size) of each table.
            emb_size: dimensionality of each embedding vector.
        """
        super().__init__()

        # Target-word table first, then the context table, so that the
        # random initialisation happens in a fixed order.
        self.word_embedding = self._xavier_embedding(nwords, emb_size)
        self.context_embedding = self._xavier_embedding(nwords, emb_size)

    @staticmethod
    def _xavier_embedding(num_rows, dim):
        """Return a sparse-gradient embedding table with Xavier-uniform weights."""
        table = torch.nn.Embedding(num_rows, dim, sparse=True)
        torch.nn.init.xavier_uniform_(table.weight)
        return table

    def forward(self, word_pos, context_positions, negative_sample=False):
        """Compute the loss for one target word against a set of words.

        Args:
            word_pos: index tensor of the target word; for a length-1
                tensor the lookup below is 1 x emb_size.
            context_positions: index tensor of the N words to score
                against the target (lookup is N x emb_size).
            negative_sample: when true, the scores are negated before the
                log-sigmoid, so the words are treated as negative samples.

        Returns:
            A scalar tensor: ``-sum(logsigmoid(score))``.
        """
        target_vec = self.word_embedding(word_pos)
        context_vecs = self.context_embedding(context_positions)

        # Dot product of every context vector with the target vector.
        logits = context_vecs.matmul(target_vec.transpose(0, 1))

        # Negative samples should score low, so flip the sign for them.
        if negative_sample:
            logits = -logits

        # The log-likelihood is to be maximised; negating it gives a loss
        # that an optimizer can minimise.
        return -F.logsigmoid(logits).sum()

In [27]:
# Skip-gram hyper-parameters.
K = 3  # number of negative samples
N = 2  # window length on each side (N=2 spans t-2 t-1 t t+1 t+2, i.e. 5 positions in total)
EMB_SIZE = 128  # dimensionality of each embedding vector

In [5]:
embeddings_location = "embeddings.txt"  # the file to write the word embeddings to
labels_location = "labels.txt"  # the file to write the labels to

# We reuse the data reading from the language modeling class.
# While the training data is read, every unseen word gets the next free id.
w2i = defaultdict(lambda: len(w2i))

# word counts for negative sampling (word id -> number of occurrences)
word_counts = defaultdict(int)

S = w2i["<s>"]
UNK = w2i["<unk>"]


def read_dataset(filename, count_words=True):
    """Yield each line of *filename* as a list of word ids.

    Lines are stripped and split on single spaces; every token is mapped
    through the module-level ``w2i``.

    Args:
        filename: path of the text file to read.
        count_words: when true (the default), each token also increments
            the module-level ``word_counts`` used for negative sampling.
            Pass ``False`` for held-out data so that it does not influence
            the training noise distribution.

    Yields:
        A list of integer word ids, one list per input line.
    """
    with open(filename, "r") as f:
        for line in f:
            word_ids = [w2i[word] for word in line.strip().split(" ")]
            if count_words:
                for word_id in word_ids:
                    word_counts[word_id] += 1
            yield word_ids


# Read in the data
train = list(read_dataset("../data/ptb/train.txt"))

# Snapshot the vocabulary BEFORE reading the dev set. Once w2i falls back to
# UNK, looking up an out-of-vocabulary word still inserts that word as a new
# key (mapped to UNK). Computing these afterwards would inflate len(w2i) and
# could replace i2w[UNK] with an arbitrary dev word.
i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)

# From here on, unknown words map to UNK instead of receiving a fresh id.
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt", count_words=False))

In [7]:
# Noise distribution for negative sampling: raise every word count to the
# 3/4 power and normalise so the probabilities sum to one.
count_table = np.array([[word_id, count] for word_id, count in word_counts.items()])
counts = count_table[:, 1] ** .75
normalizing_constant = sum(counts)

# Words that never occurred keep probability zero.
word_probabilities = np.zeros(nwords)
for word_id, count in word_counts.items():
    word_probabilities[word_id] = count ** .75 / normalizing_constant

In [8]:
# Write the vocabulary to the labels file, one word per line, in word-id order.
with open(labels_location, 'w') as labels_file:
    labels_file.writelines(i2w[word_id] + '\n' for word_id in range(nwords))

In [30]:
# initialize the model
model = WordEmbSkip(nwords, EMB_SIZE)

# Tensor type used for all index tensors; switched to the CUDA variant below
# when a GPU is available.
data_type = torch.LongTensor
use_cuda = torch.cuda.is_available()

if use_cuda:
    data_type = torch.cuda.LongTensor
    model.cuda()

# Build the optimizer only after the model is on its final device, so the
# parameters it tracks are the ones that are actually trained.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

### [Combining multiple criterions](https://discuss.pytorch.org/t/how-to-combine-multiple-criterions-to-a-loss-function/348)

In [15]:
def calc_sent_loss(sent):
    """Return the summed skip-gram negative-sampling loss for one sentence.

    For every word in *sent*, the N words on each side form the positive
    context (positions outside the sentence are filled with ``S``), and
    ``2*N*K`` negative words are taken from a single up-front draw from
    ``word_probabilities``.

    Args:
        sent: list of word ids.

    Returns:
        A scalar tensor: the sum over all words of positive + negative loss.
    """
    sent_len = len(sent)
    negs_per_word = K * 2 * N

    # Draw the negatives for the whole sentence in one call.
    noise_ids = np.random.choice(
        nwords,
        size=negs_per_word * sent_len,
        replace=True,
        p=word_probabilities)

    def token_at(position):
        # Positions that fall off either end of the sentence are padded with S.
        return sent[position] if 0 <= position < sent_len else S

    word_losses = []
    for idx, center in enumerate(sent):
        # Window of N positions on each side, skipping the centre word itself.
        context_ids = [token_at(p) for p in range(idx - N, idx + N + 1) if p != idx]
        negative_ids = noise_ids[idx * negs_per_word:(idx + 1) * negs_per_word]

        center_tensor = torch.tensor([center]).type(data_type)
        context_tensor = torch.tensor(context_ids).type(data_type)
        negative_tensor = torch.tensor(negative_ids).type(data_type)

        # NOTE: Technically, the negatives should be checked against the
        # context words, but a collision is very unlikely.
        positive_term = model(center_tensor, context_tensor)
        negative_term = model(center_tensor, negative_tensor, negative_sample=True)
        word_losses.append(positive_term + negative_term)

    # One scalar per word; add them up for the sentence-level loss.
    return torch.stack(word_losses).sum()

In [32]:
for ITER in range(1):
    print("started iter %r" % ITER)

    # Start training
    random.shuffle(train)
    train_words, train_loss = 0, 0.0
    start = time.time()
    model.train()

    for sent_id, sent in enumerate(train[:5000]):
        my_loss = calc_sent_loss(sent)
        train_loss += my_loss.item()
        train_words += len(sent)

        # Take step after calculating loss for all words in sent
        optimizer.zero_grad()  # Zero the gradients
        my_loss.backward()
        optimizer.step()

        if (sent_id+1) % 5000 == 0:
            print("--finished {} sentences".format(sent_id+1))

    # math.exp overflows for arguments above roughly 709.78 (the natural log
    # of the largest float64), so report infinity instead of raising.
    train_ppl = float('inf') if train_loss / train_words > 709 else math.exp(train_loss / train_words)
    print("iter {}: train loss/word={}, ppl={}, time={}".format(
        ITER,
        train_loss/train_words,
        train_ppl,
        time.time()-start))

    # Evaluate on dev set
    dev_words, dev_loss = 0, 0.0
    start = time.time()
    model.eval()
    # No parameter update happens here, so skip building autograd graphs.
    with torch.no_grad():
        for sent_id, sent in enumerate(dev[:5000]):
            my_loss = calc_sent_loss(sent)
            dev_loss += my_loss.item()
            dev_words += len(sent)

    # Same overflow guard as for the training perplexity.
    dev_ppl = float('inf') if dev_loss / dev_words > 709 else math.exp(dev_loss / dev_words)
    print("iter {}: dev loss/word={}, ppl={}, time={}".format(
        ITER,
        dev_loss/dev_words,
        dev_ppl,
        time.time()-start))

started iter 0
--finished 5000 sentences
iter 0: train loss/word=9.013635528645256, ppl=8214.330489782013, time=249.01472759246826
iter 0: dev loss/word=8.362590095129418, ppl=4283.775796565716, time=31.382995128631592
