## Word2Vec
This is a minimalist implementation of word2vec. First, we will go through our data and make:
 * A corpus
 * A vocabulary
 * Dictionaries to convert between the two

In [None]:
import os
import torch
import numpy as np

# Build the corpus: a flat list of lower-cased, punctuation-free tokens read
# from (at most) ten files in text_files/.
corpus = []
table = str.maketrans('','','!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
# os.listdir returns names in arbitrary order; sorting makes the choice of the
# ten books reproducible from run to run.
for book in sorted(os.listdir('text_files/'))[:10]:
    try:
        # The context manager closes the file even if reading/decoding fails.
        with open('text_files/'+book,'r') as handle:
            words = handle.read().translate(table).split()
    except (OSError, UnicodeDecodeError):
        # Best effort: skip unreadable/undecodable files, but do not swallow
        # unrelated errors such as KeyboardInterrupt.
        print('Failed for book: '+book)
    else:
        corpus.extend(w.lower() for w in words)

# Sorting the distinct tokens gives a stable word <-> index mapping
# (plain set iteration order changes between interpreter runs).
vocab = sorted(set(corpus))
print('The total number of words is: ',len(vocab))
word2idx = {w: idx for (idx, w) in enumerate(vocab)}
idx2word = dict(enumerate(vocab))

We can now make the actual dataset that will be used to train the "neural network"

In [None]:
# Translate the whole corpus to vocabulary indices once, then slice windows
# out of the index list instead of looking up each word per window.
token_ids = [word2idx[token] for token in corpus]

# Every position that has two neighbours on each side becomes one sample.
positions = range(2, len(token_ids) - 2)

# Index of the word in the middle of each 5-word window.
center_word = [token_ids[p] for p in positions]
# The four surrounding indices: two to the left, then two to the right.
target_words = [token_ids[p - 2:p] + token_ids[p + 1:p + 3] for p in positions]

Again, we will write our own batching function. Note: **we only put out full sized batches**

In [None]:
def batch_generator(length, batch_size):
    """Yield shuffled index batches covering ``range(length)``.

    The indices 0..length-1 are shuffled with NumPy's global random state and
    handed out in consecutive groups of ``batch_size``. Only full batches are
    produced; any leftover indices at the end are dropped.

    Args:
        length: Number of samples to draw indices for.
        batch_size: Number of indices in every yielded batch.

    Yields:
        A list of ``batch_size`` NumPy integer indices.
    """
    order = np.arange(length)
    np.random.shuffle(order)
    # A non-positive batch size can never be filled, so nothing is yielded.
    if batch_size <= 0:
        return
    last_start = len(order) - batch_size
    for start in range(0, last_start + 1, batch_size):
        yield list(order[start:start + batch_size])

Now we build the actual model class. This is the place where the CBOW and skip-gram networks differ. Note that we apply the soft-max here, inside the network (and not in the loss function)!

In [None]:
class wrd2vec(torch.nn.Module):
    """Two stacked linear layers followed by a softmax over the vocabulary.

    ``u`` maps a vocabulary-sized input down to ``embedding_dim`` values and
    ``v`` maps it back up to vocabulary size; the result is normalised with a
    softmax along the last dimension.

    Args:
        vocab_size: Width of the input and of the output.
        embedding_dim: Width of the hidden (embedding) layer.
    """

    def __init__(self, vocab_size, embedding_dim=40):
        super().__init__()
        self.u = torch.nn.Linear(in_features=vocab_size, out_features=embedding_dim)
        self.v = torch.nn.Linear(in_features=embedding_dim, out_features=vocab_size)
        self.soft = torch.nn.Softmax(dim=-1)

    def forward(self, x):
        """Return softmax probabilities over the vocabulary for input ``x``."""
        hidden = self.u(x)
        scores = self.v(hidden)
        return self.soft(scores)

We can now set up some essential parameters, write a missing function and then train the network!

In [None]:
# Training state and hyper-parameters.
globStep = 0            # number of batches processed so far
totWords = len(vocab)   # vocabulary size = width of the network input/output
batch_size = 50

net = wrd2vec(totWords)
# 'batchmean' sums the divergence and divides by the batch size, which matches
# the mathematical definition of KL divergence. The previous 'mean' averaged
# over every element instead and does not return the true KL value.
criterion = torch.nn.KLDivLoss(reduction='batchmean')
optimizer = torch.optim.AdamW(net.parameters())

In [None]:
def torchify(batch):
    """Turn a batch of sample indices into input/target tensors.

    Reads the module-level ``totWords``, ``center_word`` and ``target_words``.

    Args:
        batch: Sequence of indices into ``center_word`` / ``target_words``.

    Returns:
        A tuple ``(x, y)`` of float tensors, both of shape
        ``(totWords, len(batch))``. Column ``c`` of ``x`` is the one-hot
        encoding of the centre word of sample ``batch[c]``; column ``c`` of
        ``y`` puts equal weight on each distinct context word of that sample
        and sums to 1.
    """
    # Size the tensors from the batch itself rather than from the global
    # batch_size, so a batch of any length is encoded correctly.
    n_samples = len(batch)
    x = torch.zeros((totWords, n_samples))
    y = torch.zeros((totWords, n_samples))
    if n_samples == 0:
        return x, y

    center_rows = [center_word[k] for k in batch]
    x[center_rows, list(range(n_samples))] = 1

    # One (row, column) pair per context word; the number of context words is
    # taken from the data instead of being fixed at 4.
    context_rows = []
    context_cols = []
    for col, k in enumerate(batch):
        for ctx in target_words[k]:
            context_rows.append(ctx)
            context_cols.append(col)
    y[context_rows, context_cols] = 1

    # Normalise every column so it forms a probability distribution.
    y = y / y.sum(dim=0)
    return x, y

In [None]:
# Train for 200 passes over the dataset, accumulating gradients over two
# batches before every optimizer update.
for epoch in range(200):
    # Draw indices from the whole dataset. The number of samples is
    # len(center_word), not the vocabulary size; using totWords here meant
    # only the first totWords samples were ever trained on.
    for batch in batch_generator(len(center_word), batch_size):
        x, y = torchify(batch)
        out = net(x.T)
        # The network ends in a softmax, but KLDivLoss expects its input as
        # log-probabilities. Clamp before the log so probabilities that
        # underflowed to 0 cannot produce -inf / NaN in the loss.
        loss = criterion(torch.log(out.clamp_min(1e-12)), y.T)
        loss.backward()
        globStep += 1
        if globStep % 2 == 0:
            optimizer.step()
            optimizer.zero_grad()
            if globStep % 10 == 0:
                print('Step: ',globStep, ' Loss: ',loss.item())

## Extra Stuff 
This is for saving the results and maybe exploring the output, depending on how this session goes!

In [None]:
import scipy.io as sio

# Pull the trained weight matrices of both linear layers out of the network
# as NumPy arrays (detached from the autograd graph).
u = net.u.weight.detach().numpy()
v = net.v.weight.detach().numpy()

# Store both matrices together with the vocabulary list in one MATLAB file.
matrices = {'u': u, 'v': v, 'vocab': vocab}
sio.savemat('matrices.mat', matrices)

In [None]:
def topWords(word, embeddings=None, top_k=10):
    """Return the words whose embeddings have the largest dot product with ``word``.

    Args:
        word: A word present in ``word2idx``.
        embeddings: Optional 2-D array with one row per vocabulary index.
            When omitted, the module-level ``v`` (the weight matrix of
            ``net.v``) is used. The original code referenced an undefined
            name ``w`` here.
            NOTE(review): confirm ``v`` is the intended matrix; ``u.T`` has
            the same layout and can be passed explicitly.
        top_k: Number of words to return.

    Returns:
        A list of up to ``top_k`` words, ordered by decreasing dot product.

    Raises:
        KeyError: If ``word`` is not in the vocabulary.
    """
    if embeddings is None:
        embeddings = v
    idx = word2idx[word]
    # Dot product of the query row against every row of the matrix.
    scores = np.dot(embeddings[idx], embeddings.T)
    # Negate so argsort's ascending order puts the largest scores first.
    nearest = np.argsort(-scores)[:top_k]
    return [idx2word[k] for k in nearest]