Resources used to write this: 
* https://github.com/bollu/bollu.github.io#everything-you-know-about-word2vec-is-wrong
* https://www.youtube.com/watch?v=kEMJRjEdNzM&list=PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z&index=2

In [29]:
import torch 
import pandas as pd, numpy as np, datetime, scipy.misc
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from scipy.spatial import distance_matrix
from sklearn.metrics import pairwise_distances
from IPython.core.debugger import set_trace
import plotly_express as px
import umap 
import torch.nn as nn
import torch.nn.functional as F
path_results = './results/word2vec/'
path_runs = path_results + 'runs/'
path_data = './data/'

Some thoughts on where to go from here: 
* ~~revamp data with the Dataset and DataLoader architecture~~ 
* put algorithm into a class that calls torch.nn
* ~~simplify code into functions~~
* ~~shuffle sentences on the training set~~ 
* work out how to make this algorithm in batches 
* adjust scaling of the starting parameters
* ~~do an average loss function over a few batches rather than one~~

Tensorboard
* ~~hook it up to TensorBoard and print out some training graphs in real time~~
* ~~visualise the gradients and the weights of the model~~
* plot performance on a holdout set over time (??)
* ~~log hyperparameters (m,lr, batch_sz)~~
* ~~plot embeddings in tensorboard~~

* profile the code: where are the bottlenecks?
* ~~put this on github~~
* manually calculate the gradients with finite differences and compare them to the autograd version
* intuitively understand the softmax part of this. what's the motivation, what are strengths and weaknesses going to be
* revamp using nn.Module 
* try out Pytorch Lightning

In [165]:
torch.manual_seed(420)
emb_sz = 3  # number of embedding dimensions

In [62]:
class SentenceData(Dataset): 
    def __init__(self, emb_sz): 
        self.emb_sz = emb_sz
        self.corpus,self.corpus_test = self._load_train(),self._load_test()
        self.X,self.X_test = self._sen2tkn(self.corpus),self._sen2tkn(self.corpus_test)
        self.words,self.n_words  = list(set(self.X)),len(list(set(self.X)))
        self.word2idx,self.idx2word = self._get_word_idx_mappings()

    def _sen2tkn(self, l):
        """Convert list of sentences l into a concatenated list of tokens, adding <SOL> 
        and <EOL> tokens as needed."""
        def add_line_tokens(x):  return '<SOL> ' + x + ' <EOL>'
        X_l = [add_line_tokens(o) + ' ' if i != (len(l)-1) 
                        else add_line_tokens(o) for i,o in enumerate(l)]
        X = ''.join(X_l).split(' ')
        return X

    def _load_train(self): 
        ## read sentences, return list with one sentence per line
        corpus = open(path_data + 'simple_sentences.txt').read().split('\n')
        return corpus
    

    
    def _get_word_idx_mappings(self): 
        word2idx,idx2word = dict(),dict()
        for i,o in enumerate(self.words): word2idx[o] = i; idx2word[i] = o
        return word2idx,idx2word
    
    def __getitem__(self, idx): 
        if type(idx) is not slice:     idx = slice(idx,idx+1)
        tmp = self.corpus[idx]
        # convert to index 
        return [self.word2idx[o] for o in self._sen2tkn(tmp)]
    
    def __len__(self): 
        return len(self.corpus)
    
    def _load_test(self): 
        corpus_test = """fluffy cute dog happy
big happy dog cute
small fluffy cat good fun
bank good bank big
withdraw money bank good
deposit bank money""".split('\n')
        return corpus_test
    

What you want is for shuffle to shuffle sentences, but leave word order unchanged within a sentence. So you want shuffle to work on corpus, and not X. You want the indices to refer to words from the shuffled batch, I guess, since we are concatenating all the sentences together. 

In [175]:
train_ds = SentenceData(emb_sz)
def collate_fn(x):
    """Flatten out list of lists"""
    l = []
    for o in x: l += o
    return l
# Batch size has become how many sentences to include in one row 
train_dl = DataLoader(train_ds, batch_size=2, shuffle=True, collate_fn = collate_fn)

In [176]:
for i,o in enumerate(train_dl): 
    if i ==3: x=o

In [184]:
# For now, we do the preprocessing in the SentenceData class, 
# and we assume that processed data is passed to this class. 
class Word2Vec(nn.Module): 
    def __init__(self, m, emb_sz, ds):
        """If a layer has trainable parameters, it is defined in init. 
        ds = dataset"""
        super(Word2Vec, self).__init__()
        self.m,self.emb_sz,self.ds = m,emb_sz,ds
        # U holds context words, V holds center words
        self.U,self.V = self._init_embedding(),self._init_embedding()  
        
    def _init_embedding(self): 
        return nn.Parameter(torch.rand((self.emb_sz, self.ds.n_words))*1)
    
    def _softmax(self, x): 
        """Numerically stable softmax"""
        z = x - max(x)
        return torch.exp(z) / torch.sum(torch.exp(z))
    
    def forward(self, x): 
        """x is a tensor of some sort""" 
        loss_sum = 0
        for i, c in enumerate(x):   # enumerates for center word c 
            # use in calculating p below
            dotprods = (self.V[:,c].reshape(-1,1)*self.U).sum(0)  
            min_idx,max_idx = max(0,i-self.m), min(len(x)-1,i+self.m)
            for j in range(min_idx, max_idx+1):   # enumerate context words 
                if j == i: continue   # don't eval the center word
                p = self._softmax(dotprods)[x[j]]
                loss_sum += torch.log(p)
        # last term below compensates for sliding window hitting the end of batch. 
        # it doesn't matter much.
        # batch_sz*2*m <--> number of center words * number of context words
        # sum(range(m,0,-1))*2 <--> adjust for end words 
        loss_sum = -loss_sum / (batch_sz*2*m - sum(range(m,0,-1))*2) 
        return loss_sum
    
    def predict(self, c, o):
        """Takes current values of U and V and predicts the probability of context word o 
        appearing in context window of center word c """
        c_idx,o_idx = self.ds.word2idx[c],self.ds.word2idx[o]
        v = self.V[:,c_idx].reshape(-1,1)
        dotprods = (v*self.U).sum(0)
        return self._softmax(dotprods)[o_idx].item()
    

In [185]:
def add_run_info_to_tensorboard(hparam_d, metric_d, model): 
    writer.add_hparams(hparam_d, metric_d)
    # Umap embeddings
    emb = ((model.V+model.U) /2).T
    writer.add_embedding(emb,metadata=model.ds.words,
                     tag='final_embeddings')

def update_tensorboard(global_step, loss, model, predict_sentences=False):
    # loss, weights, grads
    writer.add_scalar(tag='training_loss', scalar_value=loss, 
                          global_step=global_step)
    writer.add_histogram(tag='V_weights', values=model.V, global_step=global_step)
    writer.add_histogram(tag='U_weights', values=model.U, global_step=global_step)
    writer.add_histogram(tag='V_grad', values=model.V.grad, global_step=global_step)
    writer.add_histogram(tag='U_grad', values=model.U.grad, global_step=global_step)
    if predict_sentences:
        # some example predictions
        preds_d = {
                'p_dog_cute':          model.predict('dog','cute'),
                'p_cute_dog':          model.predict('cute', 'dog'),
                'p_dog_dog':           model.predict('dog','dog'),
                'p_eol_sol':           model.predict('<EOL>','<SOL>'),
                'p_bank_money':        model.predict('bank','money'),
                'p_withdraw_money':    model.predict('withdraw','money'),
                'p_money_withdraw':    model.predict('money','withdraw'),
                "p_dog_money":         model.predict('dog','money'),
                "p_bank_cat":          model.predict('bank','cat')
            }
        writer.add_scalars('example_predictions', preds_d, global_step=global_step)

In [186]:
m = 2  # number of context words
model = Word2Vec(m, emb_sz, train_ds)
model(x)

## the word2vec algorithm with SGD 
# with SGDa
lr = 0.1
n_epochs = 400
printfreq = 5
batch_sz = 16
idx = 0 
epoch = 0 
log_dir = path_runs + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = SummaryWriter(log_dir = log_dir)
opt = torch.optim.Adam(model.parameters(), lr=lr)
for epoch in range(n_epochs): 
    thisloss = 0
    for i,x in enumerate(train_dl): 
        opt.zero_grad()
        loss = model(x)
        thisloss += loss.item() / len(train_dl)
        loss.backward()
        opt.step()
    loss_l.append(thisloss)
    if epoch % printfreq == 0:
        global_step = epoch * len(train_dl)
        loss = thisloss
        update_tensorboard(global_step, loss, model, predict_sentences=True)
        print(epoch, loss_l[epoch])

# Hparams
hparam_d = {'emb_sz': emb_sz, 'm': m, 'lr': lr, 'batch_sz':batch_sz,
            'n_epochs':n_epochs, 'printfreq': printfreq}
metric_d = {'hparam/loss': loss}   
add_run_info_to_tensorboard(hparam_d, metric_d, model)
writer.close()

0 1.5959683284163475
5 1.4460309259593487
10 1.4356357231736183
15 1.4386391751468182
20 1.4148786589503288
25 1.4268550388514996
30 1.4139582067728043
35 1.4261057190597057
40 1.4144185669720173
45 1.4180114232003689
50 1.4246969409286976
55 1.407885730266571
60 1.4177313819527626
65 1.4271469712257385
70 1.4182912148535252
75 1.414852563291788
80 1.413858037441969
85 1.4156530760228634
90 1.4223078079521656
95 1.418634794652462
100 1.4289653450250626
105 1.4194122701883316
110 1.412411816418171
115 1.4112755618989468
120 1.4291682876646519
125 1.4030862376093864
130 1.5149525590240955
135 1.4302466958761215
140 1.4165453165769577
145 1.4234930872917175
150 1.4210439585149288
155 1.4387550204992294
160 1.490160770714283
165 1.4186476469039917
170 1.4181873425841331
175 1.4591817110776901
180 1.4080413058400154
185 1.4344502463936806
190 1.4190863035619259
195 1.414304543286562
200 1.4156725592911243
205 1.420191828161478
210 1.4128944873809814
215 1.4169982075691223
220 1.415448434650

TypeError: add_run_info_to_tensorboard() takes 3 positional arguments but 4 were given

In [6]:
# while epoch < n_epochs: 
#     loss_sum = 0
#     X_batch,new_idx = get_batch(X,batch_sz,idx)
#     if new_idx <= idx: 
#         X = shuffle_X(X_l)
#         epoch += 1
#         printflag = True 
#     else: printflag = False
#     idx = new_idx
#     for i, c in enumerate(X_batch):   # enumerates for center word c 
#         dotprods = (V[:,word2idx[c]].reshape(-1,1)*U).sum(0)   # use in calculating p below
#         min_idx,max_idx = max(0,i-m), min(len(X_batch)-1,i+m)
#         for j in range(min_idx, max_idx+1):   # enumerate context words 
#             if j == i: continue   # don't eval the center word
#             p = softmax(dotprods)[word2idx[X_batch[j]]] 
#             loss_sum += torch.log(p)
#           #  print("center:",c,"| context:",o, " ", p.item(), torch.log(p).item())   
        
#     # last term below compensates for sliding window hitting the end of batch. 
#     # it doesn't matter much.
#     # batch_sz*2*m <--> number of center words * number of context words
#     # sum(range(m,0,-1))*2 <--> adjust for end words 
#     loss_sum = -loss_sum / (batch_sz*2*m - sum(range(m,0,-1))*2) 
#     loss_l.append(loss_sum.item())
#     loss_sum.backward()
#     #if epoch % printfreq == (printfreq-1) and printflag: 
#     if epoch % printfreq == 0 and printflag:
#         global_step = epoch*len(X) + i
#         loss = np.mean(loss_l)
#         update_tensorboard()
#         print(epoch, global_step, np.mean(loss_l))
#         loss_l = []
#     U.data = U.data - lr * U.grad
#     V.data = V.data - lr * V.grad
#     U.grad.data.zero_()
#     V.grad.data.zero_()
# add_run_info_to_tensorboard()
# writer.close()


NameError: name 'X' is not defined

In [5]:
## prepare vectors and accessor dictionaries
# def get_batch(X, batch_sz, curr_idx=0):
#     """Allows for batch updating
#     X: corpus list of words, batch_sz: batch size, curr_idx: a running index 
#     Returns tuple: X_batch (to run word2vec over), and new_idx (the next 
#     iteration of running index )"""
#     idx = curr_idx 
#     new_idx = idx + batch_sz 
#     X_sz = len(X) - 1 
#     if new_idx > X_sz: 
#         new_idx = (new_idx % X_sz) - 1
#         X_batch = X[idx:] + X[0:new_idx] 
#     else: 
#         X_batch = X[idx:new_idx]
#     return X_batch,new_idx

# def shuffle_X(X_l):
#     """X_l: list of sentences of X"""
#     idxs = np.random.choice(range(len(X_l)),size = len(X_l),replace=False)
#     X_l_new = np.array(X_l)[idxs].tolist()
#     # make sure every sentence (except last) ends in space
#     X_l_new = [o + ' ' if o[-1] != ' ' else o for o in X_l_new]
#     X_l_new[-1] = X_l_new[-1][:-1]
#     return ''.join(X_l_new).split(' ')


    

## Diagnostics

In [260]:
class EmbeddingDiagnostics: 
    def __init__(self, model): 
        self.model = model
        # We average together V and U vectors to get the final embedding 
        self.emb = ((self.model.U + self.model.V) / 2).detach().numpy()
        self.df_emb = pd.DataFrame(self.emb, columns =  self.model.ds.words)
        self.df_emb_normalised = self._normalise_embedding_df(self.df_emb)

    def _normalise_embedding_df(self, df):  
        return df /np.linalg.norm(df.values,ord=2,axis=0,keepdims=True)
    
    def get_word_input_counts(self):   
        """Count how often each word appears in input"""
        return pd.Series(model.ds.X).value_counts()

    def get_word_cooccurences(self): 
        """
        Count how often a word appears within distance m of another word 
        Returns a symmetrical data frame, where the column is the center word, 
         and the row is how often that word  appears in the m-window 
         around the center word
        """ 
        m,n_words,X = self.model.m,self.model.ds.n_words,self.model.ds.X
        words,word2idx = self.model.ds.words,self.model.ds.word2idx
        count_df = pd.DataFrame(np.zeros((n_words,n_words)),dtype=int,
                               index = words, columns = words)
        # X is the list of words
        for i,c in enumerate(X): 
            min_idx,max_idx = max(0,i-m),min(len(X)-1,i+m)
            for j in range(min_idx,max_idx+1): 
                if j == i: continue
                o_idx = word2idx[X[j]]
                count_df.iloc[o_idx][word2idx[c]] += 1
        return count_df
    
    def _get_dist_mat(self, df, metric): 
        """df: pandas data frame; metric: anything fitting in pairwise distances"""
        from sklearn.metrics import pairwise_distances
        return pd.DataFrame(pairwise_distances(df.values.T, metric=metric).round(3), 
                           columns = self.df_emb.columns, index = self.df_emb.columns)
    
    def get_similar_n_words(self, w, n=5, direction='closest', metric='cosine'): 
        """
        Given a word w, what word is closest/furthest to it? 
            w: word, emb: embeddings as np array
            direction: one of closest or furthest"""
        dist_mat = self._get_dist_mat(self.df_emb, metric=metric)
        v = dist_mat[w]
        if    direction == 'closest' : return v.nsmallest(n+1)[1:]
        elif  direction == 'furthest': return v.nlargest(n)
        else: raise Exception("direction must be one of 'closest' or 'furthest'")
             
    def plot_umap(self, n_dims=3, n_neighbours=3): 
        """
        Perform dim reduction with umap and plot results
        n_dims: number of output dimensions 
        n_neighbours: local/global parameter for UMAP"""
        if   n_dims == 2:   cols = ['dim1','dim2']
        elif n_dims == 3:   cols = ['dim1','dim2','dim3']
        else:               raise Exception('dims should be 2 or 3')
        # put your data values in the fit_transform bit 
        emb_umap = umap.UMAP(n_neighbors=n_neighbours, n_components=n_dims).fit_transform(self.emb.T)
        # put in pandas dataframe to help plotting with plotly express
        emb_umap_df = pd.DataFrame(emb_umap, columns = cols)
        emb_umap_df['word'] = self.model.ds.words
        if n_dims == 2: 
            return px.scatter(emb_umap_df,    x='dim1', y='dim2',          hover_name='word')
        elif n_dims ==3:
            return px.scatter_3d(emb_umap_df, x='dim1', y='dim2',z='dim3', hover_name='word')

    def write2file(self, path_results): 
        """Write some distance matrices and embedding matrices to file to inspect manually"""
        df_U = pd.DataFrame(self.model.U.detach().numpy())
        df_V = pd.DataFrame(self.model.V.detach().numpy())
        df_U.columns = df_V.columns = self.df_emb.columns

        # distance matrices 
        df_emb_cosine = self._get_dist_mat(self.df_emb,'cosine')
        df_emb_l2     = self._get_dist_mat(self.df_emb,'euclidean')
        df_emb_cosine.columns = df_emb_l2.columns = self.df_emb.columns
        df_emb_cosine.index   = df_emb_l2.index   = self.df_emb.columns

        with pd.ExcelWriter(path_results + 'word2vec_vectors.xlsx') as writer: 
            df_V.to_excel(         writer, sheet_name='center_words'          )
            df_U.to_excel(         writer, sheet_name='context_words'         )  
            self.df_emb.to_excel(  writer, sheet_name='emb_words'             )
            df_emb_cosine.to_excel(writer, sheet_name='emb_cosine_distance'   )
            df_emb_l2.to_excel(    writer, sheet_name='emb_euclidean_distance')

In [268]:
d = EmbeddingDiagnostics(model)
d.get_word_input_counts()
d.get_word_cooccurences()
d.get_similar_n_words('<SOL>',direction = 'closest', metric='euclidean')
d.write2file(path_results)
d.plot_umap(n_dims=2)

<EOL>    0.181
good     0.278
big      0.506
money    1.005
dog      1.201
Name: <SOL>, dtype: float32