This is a simplified implementation of word2vec. It is missing negative sampling and it is only run on a toy dataset. 

The main idea behind this notebook was to get experience with Pytorch and in visualising and debugging an algorithm. 

Enhancements for next time: 
* work out how to make this algorithm in batches 
* try on a real dataset
* GloVe implementation
* something to handle unknown words, or words outside the top n 
* profile the code: where are the bottlenecks?
* GPU integration

Resources used to write this: 
* https://github.com/bollu/bollu.github.io#everything-you-know-about-word2vec-is-wrong
* https://www.youtube.com/watch?v=kEMJRjEdNzM&list=PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z&index=2

In [1]:
import torch 
import torchtest
import pandas as pd, numpy as np, datetime, scipy.misc
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from scipy.spatial import distance_matrix
from sklearn.metrics import pairwise_distances
from IPython.core.debugger import set_trace
import plotly_express as px
import umap 
import torch.nn as nn
import torch.nn.functional as F
path_results = './results/word2vec/'
path_runs = path_results + 'runs/'
path_data = './data/'
torch.manual_seed(420)

<torch._C.Generator at 0x7f7fc282d2f0>

In [12]:
class SentenceData(Dataset): 
    def __init__(self, m): 
        self.m = m 
        self.corpus = self._load_train()
        self.X = self._sen2tkn(self.corpus)
        self.words,self.n_words  = list(set(self.X)),len(list(set(self.X)))
        self.word2idx,self.idx2word = self._get_word_idx_mappings()

    def _sen2tkn(self, l):
        """Convert list of sentences l into a concatenated list of tokens, adding <SOL> 
        and <EOL> tokens as needed."""
        def add_line_tokens(x):  return '<SOL> ' + x + ' <EOL>'
        X_l = [add_line_tokens(o) + ' ' if i != (len(l)-1) 
                        else add_line_tokens(o) for i,o in enumerate(l)]
        X = ''.join(X_l).split(' ')
        return X

    def _load_train(self): 
        ## read sentences, return list with one sentence per line
        corpus = open(path_data + 'simple_sentences.txt').read().split('\n')
        return corpus

    def _get_word_idx_mappings(self): 
        word2idx,idx2word = dict(),dict()
        for i,o in enumerate(self.words): word2idx[o] = i; idx2word[i] = o
        return word2idx,idx2word
    
    def __getitem__(self, idx): 
        if type(idx) is not slice:     idx = slice(idx,idx+1)
        tmp = self.corpus[idx]
        # convert to index 
        x = [self.word2idx[o] for o in self._sen2tkn(tmp)]
        return x
    
    def __len__(self): 
        return len(self.corpus)
    

In [13]:
m = 2
train_ds = SentenceData(m=m)
def collate_fn(x):
    """Flatten out list of lists"""
    l = []
    for o in x: l += o
    return l
# Batch size has become how many sentences to include in one row 
train_dl = DataLoader(train_ds, batch_size=2, shuffle=True, collate_fn = collate_fn)

In [170]:
class Word2Vec(nn.Module): 
    def __init__(self, m, emb_sz, ds):
        """If a layer has trainable parameters, it is defined in init. 
        ds = dataset"""
        super(Word2Vec, self).__init__()
        self.m,self.emb_sz,self.ds = m,emb_sz,ds
        # U holds context words, V holds center words
        self.U,self.V = self._init_embedding(),self._init_embedding()  
        
    def _init_embedding(self): 
        return nn.Parameter(torch.randn((self.emb_sz, self.ds.n_words)) *\
                            np.sqrt(1/self.ds.n_words))
    
    def forward(self, x, input_independent=False): 
        """ Returns tensor of log probabilities
        x is a tensor of some sort""" 
        logp_l = []
        if input_independent: x = [0 for o in x]
        for i, c in enumerate(x):   # enumerates for center word c 
            dotprods = self._get_dotprods(c) # use in calculating p below
            min_idx,max_idx = max(0,i-self.m), min(len(x)-1,i+self.m)
            for j in range(min_idx, max_idx+1):   # enumerate context words 
                if j != i:    # don't eval the center word
                    logp = self.predict(c, o=x[j], dotprods=dotprods)
                    logp_l.append(logp)
        return torch.stack(logp_l)
    
    def _get_dotprods(self, c): 
        v,U = self.V[:,c].reshape(-1,1),self.U
        dotprods = (v * U).sum(0)
        return dotprods
    
    def predict(self, c, o, dotprods=None):
        """
        Takes current values of U and V and predicts log probability of context word o 
        appearing in context window of center word c 
        c: index of center word 
        o: index of context word
        dotprods: saves computation if you pass those in"""
        if dotprods is None: dotprods = self._get_dotprods(c)
        return F.log_softmax(dotprods, dim=0)[o]
    

In [171]:
# I wasn't quite sure where to put this code. 

def add_run_info_to_tensorboard(hparam_d, metric_d, model): 
    writer.add_hparams(hparam_d, metric_d)
    # Umap embeddings
    emb = ((model.V+model.U) /2).T
    writer.add_embedding(emb,metadata=model.ds.words,
                     tag='final_embeddings')

def update_tensorboard(global_step, loss, model, predict_sentences=False):
    # loss, weights, grads
    writer.add_scalar(tag='training_loss', scalar_value=loss, 
                          global_step=global_step)
    writer.add_histogram(tag='V_weights', values=model.V, global_step=global_step)
    writer.add_histogram(tag='U_weights', values=model.U, global_step=global_step)
    writer.add_histogram(tag='V_grad', values=model.V.grad, global_step=global_step)
    writer.add_histogram(tag='U_grad', values=model.U.grad, global_step=global_step)
    if predict_sentences:
        word_pairs = [('dog','cute'), ('cute', 'dog'), ('dog','dog'), ('<EOL>','<SOL>'),
                      ('bank','money'), ('withdraw','money'), ('money','withdraw'),
                      ('dog','money'), ('bank','cat')]
        pred_d = dict()
        for c, o in word_pairs: 
            key = "p_" + c + "_" + o
            c_idx,o_idx = model.ds.word2idx[c],model.ds.word2idx[o]
            pred_d[key] = torch.exp(model.predict(c_idx, o_idx)).item()
        writer.add_scalars('example_predictions', pred_d, global_step=global_step)

In [173]:
### Run model 
m = 2  # number of context words
emb_sz = 15  # number of embedding dimensions
model = Word2Vec(m, emb_sz, train_ds)

## the word2vec algorithm with SGD 
# with SGDa
lr = 0.01
n_epochs = 400
printfreq = 1
batch_sz = 100
idx = 0 
epoch = 0 
log_dir = path_runs + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = SummaryWriter(log_dir = log_dir)
opt = torch.optim.Adam(model.parameters(), lr=lr)
def loss_fn(logp_l):    return -torch.mean(logp_l)
loss_movingavg = []     # list to work out moving average of loss
for epoch in range(n_epochs): 
    thisloss = 0
    for i,x in enumerate(train_dl): 
        opt.zero_grad()
        logp_l = model(x)  # log preds 
        loss = loss_fn(logp_l)
        thisloss += loss.item() / len(train_dl)
        loss_movingavg.append(thisloss)
        loss.backward()
        opt.step()
    if epoch % printfreq == 0:
        global_step = epoch * len(train_dl)
        loss = thisloss
        update_tensorboard(global_step, loss, model, predict_sentences=True)
        print(epoch, thisloss )
        loss_movingavg = []

# Hparams
hparam_d = {'emb_sz': emb_sz, 'm': m, 'lr': lr, 'batch_sz':batch_sz,
            'n_epochs':n_epochs, 'printfreq': printfreq}
metric_d = {'hparam/loss': np.mean(loss_movingavg)}   
add_run_info_to_tensorboard(hparam_d, metric_d, model)
writer.close()

0 2.749580747202824
1 2.619847034153185
2 2.495410216482062
3 2.3872072822169255
4 2.347351036573711
5 2.2759331778476115
6 2.2536559606853284
7 2.242249325702065
8 2.2260925393355517
9 2.2234162154950594
10 2.216376097578751
11 2.2052205612784936
12 2.211290767318324
13 2.205070834410818
14 2.1938008509184184
15 2.1901651934573527
16 2.186603596335963
17 2.182089767957989
18 2.169069327806172
19 2.1922519583451114
20 2.1808404985227083
21 2.188620385370757
22 2.1638218352669165
23 2.1752220956902755
24 2.1752996319218685
25 2.172049610238327
26 2.1792183800747527
27 2.1476881692284033
28 2.1664692226209135
29 2.1794854590767305
30 2.1572324476744003
31 2.1705891395870007
32 2.1804998673890768
33 2.1665854077590145


KeyboardInterrupt: 

## Diagnostics

In this class are methods to check statistics of the input, visualise the input, get loss of some simple baselines, check the variables in the model actually update,  visualise the output via UMAP, create distance matrices between the output embeddings, find most similar and dissimilar words to a given word, and so on.  

In [155]:
class EmbeddingDiagnostics: 
    def __init__(self, model, dl, opt,loss_fn, seed=420): 
        """dl: data_loader"""        
        torch.manual_seed(seed)
        self.model,self.dl,self.opt,self.loss_fn = model,dl,opt,loss_fn
        # We average together V and U vectors to get the final embedding 
        self.emb = ((self.model.U + self.model.V) / 2).detach().numpy()
        self.df_emb = pd.DataFrame(self.emb, columns =  self.model.ds.words)
        self.df_emb_normalised = self._normalise_embedding_df(self.df_emb)
             
    def check_variables_update(self): 
        """
        This checks that parameters are being updated. 
        We run one forward pass+backward pass, and then update the parameters once, 
        and look at what changed. 
        """
        x = next(iter(self.dl))  # get one batch from data loader 
        params = [o for o in self.model.named_parameters() if o[1].requires_grad]
        # copy initial values
        initial_params = [(name, p.clone()) for (name, p) in params]
        print("---- Parameters with 'requires_grad' and their sizes ------")
        for (name, p) in initial_params:  print(name, p.size())
        # take a step
        self.opt.zero_grad()
        logp_l = self.model(x)
        loss = self.loss_fn(logp_l)
        loss.backward()
        self.opt.step()
        print("---- Matrix norm of parameter update for one step ------")
        for (_,old_p), (name, new_p) in zip(initial_params, params): 
            print (name, torch.norm(new_p - old_p).item())       
            
    def check_inputs(self): 
        """Select some inputs, translate them, print them"""
        idxs = self.model.ds[1:4]
        print(idxs)
        print([ds.idx2word[o] for o in idxs])
    
    def get_baseline_loss(self, baseline='random'): 
        """
        Returns baseline loss to compare our model against. 
        These let us know how good or bad we are doing.  
        
        We run through one pass of the data, assuming that we predict each
               word with equal probability. 
        Params
            baseline: 
                "random": p(word) = 1/n_words, i.e. a uniform dist
                "input_independent": replace all sentences with all one character
                "bag_of_words": p(word) = p(word|bag of words(train_set)) 
                "coocurrences": p(word) = p(word|center word c, distance m, train set)
                                Basically looking at word cofrequencies to get probability.
                                I think this is the best loss possible for our model.   
        """
        baseline_vals = ["random", "bag_of_words", "coocurrences"]
        if baseline not in baseline_vals:   raise Exception('Baseline keyword not recognised')  
        logp_l = []
        if baseline == "random": return torch.tensor(-np.log(1/self.model.ds.n_words))
        if baseline == "bag_of_words": 
            freqs = self.get_word_input_counts() / sum(self.get_word_input_counts())
            freqs.index = [self.model.ds.word2idx[o] for o in freqs.index]
            idx2freq = dict(freqs)
        if baseline == 'coocurrences':
            counts = self.get_word_cooccurences()
            prob_mat = (counts / counts.values.sum(axis=1))
        for x in self.dl:  # one epoch, going through all examples
            for i, c in enumerate(x):  # enumerate center words 
                min_idx = max(0,          i - self.model.m)
                max_idx = min(len(x) - 1, i + self.model.m)
                for j in range(min_idx, max_idx + 1):   # enumerate context words 
                    if j != i:  # don't eval the center word
                        if baseline == "bag_of_words":    
                            logp = torch.log(torch.tensor(idx2freq[x[j]]))
                        elif baseline == "coocurrences":
                            logp = torch.log(torch.tensor(
                                prob_mat.iloc[:,c].loc[self.model.ds.idx2word[x[j]]])) 
                        else: 
                            logp = self.model.predict(c, o=x[j])
                        logp_l.append(logp)
        return loss_fn(torch.stack(logp_l))

    def _normalise_embedding_df(self, df):  
        return df /np.linalg.norm(df.values,ord=2,axis=0,keepdims=True)
    
    def get_word_input_counts(self):   
        """Count how often each word appears in input"""
        return pd.Series(model.ds.X).value_counts()

    def get_word_cooccurences(self): 
        """
        Count how often a word appears within distance m of another word 
        Returns a symmetrical data frame, where the column is the center word, 
         and the row is how often that word  appears in the m-window 
         around the center word
        """ 
        m,n_words,X = self.model.m,self.model.ds.n_words,self.model.ds.X
        words,word2idx = self.model.ds.words,self.model.ds.word2idx
        count_df = pd.DataFrame(np.zeros((n_words,n_words)),dtype=int,
                               index = words, columns = words)
        # X is the list of words
        for i,c in enumerate(X): 
            min_idx,max_idx = max(0,i-m),min(len(X)-1,i+m)
            for j in range(min_idx,max_idx+1): 
                if j == i: continue
                o_idx = word2idx[X[j]]
                count_df.iloc[o_idx][word2idx[c]] += 1
        return count_df
    
    def _get_dist_mat(self, df, metric): 
        """df: pandas data frame; metric: anything fitting in pairwise distances"""
        from sklearn.metrics import pairwise_distances
        return pd.DataFrame(pairwise_distances(df.values.T, metric=metric).round(3), 
                           columns = self.df_emb.columns, index = self.df_emb.columns)
    
    def get_similar_n_words(self, w, n=5, direction='closest', metric='cosine'): 
        """
        Given a word w, what word is closest/furthest to it? 
            w: word, emb: embeddings as np array
            direction: one of closest or furthest"""
        dist_mat = self._get_dist_mat(self.df_emb, metric=metric)
        v = dist_mat[w]
        if    direction == 'closest' : return v.nsmallest(n+1)[1:]
        elif  direction == 'furthest': return v.nlargest(n)
        else: raise Exception("direction must be one of 'closest' or 'furthest'")
             
    def plot_umap(self, n_dims=3, n_neighbours=3): 
        """
        Perform dim reduction with umap and plot results
        n_dims: number of output dimensions 
        n_neighbours: local/global parameter for UMAP"""
        if   n_dims == 2:   cols = ['dim1','dim2']
        elif n_dims == 3:   cols = ['dim1','dim2','dim3']
        else:               raise Exception('dims should be 2 or 3')
        # put your data values in the fit_transform bit 
        emb_umap = umap.UMAP(n_neighbors=n_neighbours, n_components=n_dims).fit_transform(self.emb.T)
        # put in pandas dataframe to help plotting with plotly express
        emb_umap_df = pd.DataFrame(emb_umap, columns = cols)
        emb_umap_df['word'] = self.model.ds.words
        if n_dims == 2: 
            return px.scatter(emb_umap_df,    x='dim1', y='dim2',          hover_name='word')
        elif n_dims ==3:
            return px.scatter_3d(emb_umap_df, x='dim1', y='dim2',z='dim3', hover_name='word')

    def write2file(self, path_results): 
        """Write some distance matrices and embedding matrices to file to inspect manually"""
        df_U = pd.DataFrame(self.model.U.detach().numpy())
        df_V = pd.DataFrame(self.model.V.detach().numpy())
        df_U.columns = df_V.columns = self.df_emb.columns

        # distance matrices 
        df_emb_cosine = self._get_dist_mat(self.df_emb,'cosine')
        df_emb_l2     = self._get_dist_mat(self.df_emb,'euclidean')
        df_emb_cosine.columns = df_emb_l2.columns = self.df_emb.columns
        df_emb_cosine.index   = df_emb_l2.index   = self.df_emb.columns

        with pd.ExcelWriter(path_results + 'word2vec_vectors.xlsx') as writer: 
            df_V.to_excel(         writer, sheet_name='center_words'          )
            df_U.to_excel(         writer, sheet_name='context_words'         )  
            self.df_emb.to_excel(  writer, sheet_name='emb_words'             )
            df_emb_cosine.to_excel(writer, sheet_name='emb_cosine_distance'   )
            df_emb_l2.to_excel(    writer, sheet_name='emb_euclidean_distance')

In [156]:
d = EmbeddingDiagnostics(model, train_dl, opt, loss_fn)
#d.get_word_input_counts()
#d.get_word_cooccurences()
#d.get_similar_n_words('<SOL>',direction = 'closest', metric='euclidean')
#d.write2file(path_results)
#d.plot_umap(n_dims=2)

In [157]:
#d.get_baseline_loss()
#d.check_variables_update()    
# d.check_inputs()

[15, 11, 14, 11, 12, 13, 15, 14, 11, 3, 13, 15, 4, 11, 3, 11, 14, 11, 13]
['<SOL>', 'dog', 'fluffy', 'dog', 'fun', '<EOL>', '<SOL>', 'fluffy', 'dog', 'small', '<EOL>', '<SOL>', 'cute', 'dog', 'small', 'dog', 'fluffy', 'dog', '<EOL>']
