Natural Language Processing 
Word 2 Vec
Skip Grams 

In [16]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
from nltk.corpus import gutenberg
import pickle

In [17]:
#1. Load the data

In [18]:
#load the data from the pickle file
with open('corpus.pkl', 'rb') as f:
    tokenized_corpus, vocab, word2index, index2word = pickle.load(f)
flatten = lambda l: [item for sublist in l for item in sublist]
voc_size = len(vocab)


In [19]:
#2. pre train the publish_display_data

def random_batch(batch_size, word_sequence):
    
    # Make skip gram of one size window
    skip_grams = []
    # loop each word sequence
    # we starts from 1 because 0 has no context
    # we stop at second last for the same reason
    for sent in tokenized_corpus:
        for i in range(1, len(sent) - 1):
            target = word2index[sent[i]]
            context = [word2index[sent[i - 1]], word2index[sent[i + 1]]]
            for w in context:
                skip_grams.append([target, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

In [20]:
# Testing the random_batch function first
batch_size = 20
word_sequence = flatten(tokenized_corpus)
random_inputs, random_labels = random_batch(batch_size, word_sequence)
print(random_inputs)



[[ 513]
 [ 941]
 [1083]
 [ 456]
 [3186]
 [1600]
 [2213]
 [ 236]
 [2780]
 [ 236]
 [2280]
 [ 513]
 [ 513]
 [2228]
 [2078]
 [1279]
 [ 467]
 [1642]
 [1407]
 [2078]]


In [21]:
#3 Model 
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs) #   [batch_size, voc_size, emb_size]
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood


In [22]:
#4. Train the model
# Hyperparameters
batch_size = 100
embedding_size = 100
epochs = 1000
learning_rate = 0.001

# Model
model = Skipgram(voc_size, embedding_size)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
all_vocabs.shape

torch.Size([100, 3203])

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [24]:
import time 
start_time = time.time()
for epoch in range(epochs):
    avg_loss = 0
    batch_inputs, batch_labels = random_batch(batch_size, word_sequence)
    batch_inputs = prepare_sequence(batch_inputs, word2index)
    batch_labels = prepare_sequence(batch_labels, word2index)
    optimizer.zero_grad()
    loss = model(batch_inputs, batch_labels, all_vocabs)
    loss.backward()
    optimizer.step()
    avg_loss += loss.item()
    if (epoch+1) % 100 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, avg_loss))
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)
print('Epoch Time: {}m {}s'.format(epoch_mins, epoch_secs))
 



TypeError: unhashable type: 'numpy.ndarray'