Resources used to write this: 
* https://github.com/bollu/bollu.github.io#everything-you-know-about-word2vec-is-wrong
* https://www.youtube.com/watch?v=kEMJRjEdNzM&list=PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z&index=2

In [139]:
import torch 
import torchtest
import pandas as pd, numpy as np, datetime, scipy.misc
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from scipy.spatial import distance_matrix
from sklearn.metrics import pairwise_distances
from IPython.core.debugger import set_trace
import plotly_express as px
import umap 
import torch.nn as nn
import torch.nn.functional as F
path_results = './results/word2vec/'
path_runs = path_results + 'runs/'
path_data = './data/'
torch.manual_seed(420)

<torch._C.Generator at 0x7fe35112d2f0>

Some thoughts on where to go from here: 
* ~~revamp data with the Dataset and DataLoader architecture~~ 
* ~~put algorithm into a class that calls torch.nn~~
* ~~simplify code into functions~~
* ~~shuffle sentences on the training set~~ 
* some kind of test set testing 
* work out how to make this algorithm in batches 
* adjust scaling of the starting parameters
* ~~do an average loss function over a few batches rather than one~~
* put on GPU somehow (UTS? Colab?) 
* try on a real dataset
* GloVe
* something to handle unknown words, or words outside the top n 
* train/test split for dataset
* where can you integrate tensorboard code?
* where do you put tests?

Tensorboard
* ~~hook it up to TensorBoard and print out some training graphs in real time~~
* ~~visualise the gradients and the weights of the model~~
* plot performance on a holdout set over time (??)
* ~~log hyperparameters (m,lr, batch_sz)~~
* ~~plot embeddings in tensorboard~~

* profile the code: where are the bottlenecks?
* ~~put this on github~~
* manually calculate the gradients with finite differences and compare them to the autograd version
* intuitively understand the softmax part of this. what's the motivation, what are strengths and weaknesses going to be
* try out Pytorch Lightning

In [9]:
class SentenceData(Dataset): 
    def __init__(self): 
        self.corpus = self._load_train()
        self.X = self._sen2tkn(self.corpus)
        self.words,self.n_words  = list(set(self.X)),len(list(set(self.X)))
        self.word2idx,self.idx2word = self._get_word_idx_mappings()

    def _sen2tkn(self, l):
        """Convert list of sentences l into a concatenated list of tokens, adding <SOL> 
        and <EOL> tokens as needed."""
        def add_line_tokens(x):  return '<SOL> ' + x + ' <EOL>'
        X_l = [add_line_tokens(o) + ' ' if i != (len(l)-1) 
                        else add_line_tokens(o) for i,o in enumerate(l)]
        X = ''.join(X_l).split(' ')
        return X

    def _load_train(self): 
        ## read sentences, return list with one sentence per line
        corpus = open(path_data + 'simple_sentences.txt').read().split('\n')
        return corpus

    def _get_word_idx_mappings(self): 
        word2idx,idx2word = dict(),dict()
        for i,o in enumerate(self.words): word2idx[o] = i; idx2word[i] = o
        return word2idx,idx2word
    
    def __getitem__(self, idx): 
        if type(idx) is not slice:     idx = slice(idx,idx+1)
        tmp = self.corpus[idx]
        # convert to index 
        return [self.word2idx[o] for o in self._sen2tkn(tmp)]
    
    def __len__(self): 
        return len(self.corpus)
    

In [11]:
train_ds = SentenceData()
def collate_fn(x):
    """Flatten out list of lists"""
    l = []
    for o in x: l += o
    return l
# Batch size has become how many sentences to include in one row 
train_dl = DataLoader(train_ds, batch_size=2, shuffle=True, collate_fn = collate_fn)

In [40]:
# For now, we do the preprocessing in the SentenceData class, 
# and we assume that processed data is passed to this class. 
class Word2Vec(nn.Module): 
    def __init__(self, m, emb_sz, ds):
        """If a layer has trainable parameters, it is defined in init. 
        ds = dataset"""
        super(Word2Vec, self).__init__()
        self.m,self.emb_sz,self.ds = m,emb_sz,ds
        # U holds context words, V holds center words
        self.U,self.V = self._init_embedding(),self._init_embedding()  
        
    def _init_embedding(self): 
        return nn.Parameter(torch.rand((self.emb_sz, self.ds.n_words))*1)
    
    def _softmax(self, x): 
        """Numerically stable softmax"""
        z = x - max(x)
        return torch.exp(z) / torch.sum(torch.exp(z))
    
    def forward(self, x): 
        """x is a tensor of some sort""" 
        loss_sum = 0
        z = 0  # counts how many losses we add up (required for loss_sum)
        for i, c in enumerate(x):   # enumerates for center word c 
            # use in calculating p below
            dotprods = (self.V[:,c].reshape(-1,1)*self.U).sum(0)  
            min_idx,max_idx = max(0,i-self.m), min(len(x)-1,i+self.m)
            for j in range(min_idx, max_idx+1):   # enumerate context words 
                if j == i: continue   # don't eval the center word
                p = self._softmax(dotprods)[x[j]]
                z +=1
                loss_sum += torch.log(p)
        loss_sum = -loss_sum / z
        return loss_sum
    
    def predict(self, c, o):
        """Takes current values of U and V and predicts the probability of context word o 
        appearing in context window of center word c """
        c_idx,o_idx = self.ds.word2idx[c],self.ds.word2idx[o]
        v = self.V[:,c_idx].reshape(-1,1)
        dotprods = (v*self.U).sum(0)
        return self._softmax(dotprods)[o_idx].item()
    

In [41]:
def add_run_info_to_tensorboard(hparam_d, metric_d, model): 
    writer.add_hparams(hparam_d, metric_d)
    # Umap embeddings
    emb = ((model.V+model.U) /2).T
    writer.add_embedding(emb,metadata=model.ds.words,
                     tag='final_embeddings')

def update_tensorboard(global_step, loss, model, predict_sentences=False):
    # loss, weights, grads
    writer.add_scalar(tag='training_loss', scalar_value=loss, 
                          global_step=global_step)
    writer.add_histogram(tag='V_weights', values=model.V, global_step=global_step)
    writer.add_histogram(tag='U_weights', values=model.U, global_step=global_step)
    writer.add_histogram(tag='V_grad', values=model.V.grad, global_step=global_step)
    writer.add_histogram(tag='U_grad', values=model.U.grad, global_step=global_step)
    if predict_sentences:
        # some example predictions
        preds_d = {
                'p_dog_cute':          model.predict('dog','cute'),
                'p_cute_dog':          model.predict('cute', 'dog'),
                'p_dog_dog':           model.predict('dog','dog'),
                'p_eol_sol':           model.predict('<EOL>','<SOL>'),
                'p_bank_money':        model.predict('bank','money'),
                'p_withdraw_money':    model.predict('withdraw','money'),
                'p_money_withdraw':    model.predict('money','withdraw'),
                "p_dog_money":         model.predict('dog','money'),
                "p_bank_cat":          model.predict('bank','cat')
            }
        writer.add_scalars('example_predictions', preds_d, global_step=global_step)

In [42]:
### Run model 

m = 2  # number of context words
emb_sz = 15  # number of embedding dimensions
model = Word2Vec(m, emb_sz, train_ds)

## the word2vec algorithm with SGD 
# with SGDa
lr = 0.01
n_epochs = 400
printfreq = 50
batch_sz = 16
idx = 0 
epoch = 0 
log_dir = path_runs + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = SummaryWriter(log_dir = log_dir)
opt = torch.optim.Adam(model.parameters(), lr=lr)
loss_movingavg = []     # list to work out moving average of loss
for epoch in range(n_epochs): 
    thisloss = 0
    for i,x in enumerate(train_dl): 
        opt.zero_grad()
        loss = model(x)
        thisloss += loss.item() / len(train_dl)
        loss_movingavg.append(thisloss)
        loss.backward()
        opt.step()
    if epoch % printfreq == 0:
        global_step = epoch * len(train_dl)
        loss = thisloss
        update_tensorboard(global_step, loss, model, predict_sentences=True)
        print(epoch, thisloss )
        loss_movingavg = []

# Hparams
hparam_d = {'emb_sz': emb_sz, 'm': m, 'lr': lr, 'batch_sz':batch_sz,
            'n_epochs':n_epochs, 'printfreq': printfreq}
metric_d = {'hparam/loss': np.mean(loss_movingavg)}   
add_run_info_to_tensorboard(hparam_d, metric_d, model)
writer.close()

0 2.727652800710578
50 2.1705847664883264
100 2.1590568642867245
150 2.16153799860101
200 2.157130379425852
250 2.1623246481544096
300 2.147075251529091
350 2.1446270879946256


## Unit tests

Making sure everything works as it is supposed to 

#### Check variables actually get trained

In [180]:
def check_variables_update(model, opt, x): 
    """m: model
    opt: optimiser
    x: some input"""
    params = [o for o in model.named_parameters() if o[1].requires_grad]
    # copy initial values
    initial_params = [ (name, p.clone()) for (name, p) in params]
    print("---- Parameters with 'requires_grad' and their sizes ------")
    for (name, p) in initial_params:  print(name, p.size())
    # take a step
    opt.zero_grad()
    loss = model(x)
    loss.backward()
    opt.step()
    print("---- Matrix norm of parameter update for one step ------")
    for (_,old_p), (name, new_p) in zip(initial_params, params): 
        print (name, torch.norm(new_p - old_p).item())       
check_variables_update(model, opt, x)

---- Parameters with 'requires_grad' and their sizes ------
U torch.Size([15, 16])
V torch.Size([15, 16])
---- Matrix norm of parameter update for one step ------
U 0.0557783804833889
V 0.05241953581571579


#### Check performance against a baseline 



In [37]:
### Uniform prediction 


## Diagnostics

In [332]:
class EmbeddingDiagnostics: 
    def __init__(self, model, dl): 
        """dl: data_loader"""
        self.model,self.dl = model,dl
        # We average together V and U vectors to get the final embedding 
        self.emb = ((self.model.U + self.model.V) / 2).detach().numpy()
        self.df_emb = pd.DataFrame(self.emb, columns =  self.model.ds.words)
        self.df_emb_normalised = self._normalise_embedding_df(self.df_emb)
        
    def get_baseline_loss(self, baseline='random'): 
        """
        Returns baseline loss to compare our model against. 
        If we don't get better than these losses, our model isn't learning anything. 
        
        We run through one pass of the data, assuming that we predict each
               word with equal probability. 
        Params
            baseline: 
                "random": p(word) = 1/n_words, i.e. a uniform dist
                "bag_of_words": p(word) = p(word|bag of words(train_set)) 
                "coocurrences": p(word) = p(word|center word c, distance m, train set)
                                  Basically looking at word cofrequencies to get probability. 
        """
        z = 0  
        loss_sum = 0
        if baseline == "bag_of_words": 
            freqs = self.get_word_input_counts() / sum(self.get_word_input_counts())
            freqs.index = [self.model.ds.word2idx[o] for o in freqs.index]
            idx2freq = dict(freqs)
        if baseline == 'coocurrences':
            counts = self.get_word_cooccurences()
            prob_mat = (counts / counts.values.sum(axis=1))
        for x in self.dl:  # enumerate each example
            for i, c in enumerate(x):  # enumerate center words 
                min_idx = max(0,          i - self.model.m)
                max_idx = min(len(x) - 1, i + self.model.m)
                for j in range(min_idx, max_idx + 1):   # enumerate context words 
                    if j == i: continue   # don't eval the center word
                    if baseline == "random":            p = 1 / model.ds.n_words
                    elif baseline == "bag_of_words":    p = idx2freq[x[j]]
                    elif baseline == "coocurrences":
                        p = prob_mat.iloc[:,c].loc[self.model.ds.idx2word[x[j]]]
                    else:
                        raise Exception('Baseline keyword not recognised')
                    z += 1
                    loss_sum += torch.log(torch.tensor(p))
        loss_sum = -loss_sum / z
        return loss_sum

    def _normalise_embedding_df(self, df):  
        return df /np.linalg.norm(df.values,ord=2,axis=0,keepdims=True)
    
    def get_word_input_counts(self):   
        """Count how often each word appears in input"""
        return pd.Series(model.ds.X).value_counts()

    def get_word_cooccurences(self): 
        """
        Count how often a word appears within distance m of another word 
        Returns a symmetrical data frame, where the column is the center word, 
         and the row is how often that word  appears in the m-window 
         around the center word
        """ 
        m,n_words,X = self.model.m,self.model.ds.n_words,self.model.ds.X
        words,word2idx = self.model.ds.words,self.model.ds.word2idx
        count_df = pd.DataFrame(np.zeros((n_words,n_words)),dtype=int,
                               index = words, columns = words)
        # X is the list of words
        for i,c in enumerate(X): 
            min_idx,max_idx = max(0,i-m),min(len(X)-1,i+m)
            for j in range(min_idx,max_idx+1): 
                if j == i: continue
                o_idx = word2idx[X[j]]
                count_df.iloc[o_idx][word2idx[c]] += 1
        return count_df
    
    def _get_dist_mat(self, df, metric): 
        """df: pandas data frame; metric: anything fitting in pairwise distances"""
        from sklearn.metrics import pairwise_distances
        return pd.DataFrame(pairwise_distances(df.values.T, metric=metric).round(3), 
                           columns = self.df_emb.columns, index = self.df_emb.columns)
    
    def get_similar_n_words(self, w, n=5, direction='closest', metric='cosine'): 
        """
        Given a word w, what word is closest/furthest to it? 
            w: word, emb: embeddings as np array
            direction: one of closest or furthest"""
        dist_mat = self._get_dist_mat(self.df_emb, metric=metric)
        v = dist_mat[w]
        if    direction == 'closest' : return v.nsmallest(n+1)[1:]
        elif  direction == 'furthest': return v.nlargest(n)
        else: raise Exception("direction must be one of 'closest' or 'furthest'")
             
    def plot_umap(self, n_dims=3, n_neighbours=3): 
        """
        Perform dim reduction with umap and plot results
        n_dims: number of output dimensions 
        n_neighbours: local/global parameter for UMAP"""
        if   n_dims == 2:   cols = ['dim1','dim2']
        elif n_dims == 3:   cols = ['dim1','dim2','dim3']
        else:               raise Exception('dims should be 2 or 3')
        # put your data values in the fit_transform bit 
        emb_umap = umap.UMAP(n_neighbors=n_neighbours, n_components=n_dims).fit_transform(self.emb.T)
        # put in pandas dataframe to help plotting with plotly express
        emb_umap_df = pd.DataFrame(emb_umap, columns = cols)
        emb_umap_df['word'] = self.model.ds.words
        if n_dims == 2: 
            return px.scatter(emb_umap_df,    x='dim1', y='dim2',          hover_name='word')
        elif n_dims ==3:
            return px.scatter_3d(emb_umap_df, x='dim1', y='dim2',z='dim3', hover_name='word')

    def write2file(self, path_results): 
        """Write some distance matrices and embedding matrices to file to inspect manually"""
        df_U = pd.DataFrame(self.model.U.detach().numpy())
        df_V = pd.DataFrame(self.model.V.detach().numpy())
        df_U.columns = df_V.columns = self.df_emb.columns

        # distance matrices 
        df_emb_cosine = self._get_dist_mat(self.df_emb,'cosine')
        df_emb_l2     = self._get_dist_mat(self.df_emb,'euclidean')
        df_emb_cosine.columns = df_emb_l2.columns = self.df_emb.columns
        df_emb_cosine.index   = df_emb_l2.index   = self.df_emb.columns

        with pd.ExcelWriter(path_results + 'word2vec_vectors.xlsx') as writer: 
            df_V.to_excel(         writer, sheet_name='center_words'          )
            df_U.to_excel(         writer, sheet_name='context_words'         )  
            self.df_emb.to_excel(  writer, sheet_name='emb_words'             )
            df_emb_cosine.to_excel(writer, sheet_name='emb_cosine_distance'   )
            df_emb_l2.to_excel(    writer, sheet_name='emb_euclidean_distance')

In [333]:
d = EmbeddingDiagnostics(model, train_dl)
#d.get_word_input_counts()
#d.get_word_cooccurences()
#d.get_similar_n_words('<SOL>',direction = 'closest', metric='euclidean')
#d.write2file(path_results)
#d.plot_umap(n_dims=2)

<EOL>    0.948
big      1.108
happy    1.326
dog      1.401
good     1.463
Name: <SOL>, dtype: float32

In [334]:
d.get_baseline_loss(baseline='coocurrences')

tensor(2.1541)