Resources used to write this: 
* https://github.com/bollu/bollu.github.io#everything-you-know-about-word2vec-is-wrong
* https://www.youtube.com/watch?v=kEMJRjEdNzM&list=PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z&index=2

In [1]:
import torch 
import pandas as pd, numpy as np, datetime, scipy.misc
from torch.utils.tensorboard import SummaryWriter
from scipy.spatial import distance_matrix
from sklearn.metrics import pairwise_distances
import plotly_express as px
import umap 
path_results = './results/word2vec/'
path_runs = path_results + 'runs/'
path_data = './data/'

Some thoughts on where to go from here: 
* revamp this with the Dataset and DataLoader architecture 
* ~~simplify code into functions~~
* ~~shuffle sentences on the training set~~ 
* work out how to make this algorithm in batches 
* adjust scaling of the starting parameters
* ~~do an average loss function over a few batches rather than one~~

Tensorboard
* ~~hook it up to TensorBoard and print out some training graphs in real time~~
* ~~visualise the gradients and the weights of the model~~
* plot performance on a holdout set over time (??)
* ~~log hyperparameters (m,lr, batch_sz)`~
* ~~plot embeddings in tensorboard~~

* profile the code: where are the bottlenecks?
* put this on github
* manually calculate the gradients with finite differences and compare them to the autograd version
* intuitively understand the softmax part of this. what's the motivation, what are strengths and weaknesses going to be
* revamp using nn.Module 
* try out Pytorch Lightning

In [9]:
torch.manual_seed(420)
emb_sz = 3  # number of embedding dimensions

In [10]:
## read sentences, add start/end of line tokens, merge sentences together
corpus = open(path_data + 'simple_sentences.txt').read().split('\n')
def add_line_tokens(x):  return '<SOL> ' + x + ' <EOL>'
X_l = [add_line_tokens(o) + ' ' if i != (len(corpus)-1) 
            else add_line_tokens(o) for i,o in enumerate(corpus) ]
X = ''.join(X_l).split(' ')

In [11]:
X_test_ = """fluffy cute dog happy
big happy dog cute
small fluffy cat good fun
bank good bank big
withdraw money bank good
deposit bank money""".split('\n')
#[add_line_tokens(o) + ' ' for o in X_test]

# sen_l = X_test[0]
# sen = sen_l.split(' ')
# m = 1 
# idx_c=2  # idx_c
# c = sen[idx_c]
# min_idx,max_idx = max(0,sen_l-m), min(len(sen_l)-1,idx_c+m)
# for j in range(min_idx, max_idx+1): 
#     if j == idx_c:  continue 
#         sen[j]

In [12]:
## prepare vectors and accessor dictionaries
words = list(set(X))
n_words = len(words)
word2idx,idx2word = dict(),dict()
for i,o in enumerate(words): word2idx[o] = i ; idx2word[i] = o
def init_emb_mat(): 
    tmp = torch.rand((emb_sz, n_words))*1
    tmp.requires_grad=True
    return tmp
U,V = init_emb_mat(),init_emb_mat()  # U holds context words, V holds center words
def softmax(x): 
    """Numerically stable softmax"""
    z = x - max(x)
    return torch.exp(z) / torch.sum(torch.exp(z))

def get_batch(X, batch_sz, curr_idx=0):
    """Allows for batch updating
    X: corpus list of words, batch_sz: batch size, curr_idx: a running index 
    Returns tuple: X_batch (to run word2vec over), and new_idx (the next 
    iteration of running index )"""
    idx = curr_idx 
    new_idx = idx + batch_sz 
    X_sz = len(X) - 1 
    if new_idx > X_sz: 
        new_idx = (new_idx % X_sz) - 1
        X_batch = X[idx:] + X[0:new_idx] 
    else: 
        X_batch = X[idx:new_idx]
    return X_batch,new_idx

def shuffle_X(X_l):
    """X_l: list of sentences of X"""
    idxs = np.random.choice(range(len(X_l)),size = len(X_l),replace=False)
    X_l_new = np.array(X_l)[idxs].tolist()
    # make sure every sentence (except last ends in space)
    X_l_new = [o + ' ' if o[-1] != ' ' else o for o in X_l_new]
    X_l_new[-1] = X_l_new[-1][:-1]
    return ''.join(X_l_new).split(' ')

def predict(c,o):
    """Takes current values of U and V and predicts the probability of context word o 
    appearing in context window of center word c """
    c_idx,o_idx = word2idx[c],word2idx[o]
    v = V[:,c_idx].reshape(-1,1)
    dotprods = (v*U).sum(0)
    return softmax(dotprods)[o_idx].item()

def add_run_info_to_tensorboard(): 
    # Hparams
    hparam_d = {'emb_sz': emb_sz, 'm': m, 'lr': lr, 'batch_sz':batch_sz,
                'n_epochs':n_epochs, 'printfreq': printfreq}
    metric_d = {'hparam/loss': loss}
    writer.add_hparams(hparam_d, metric_d)
    # Umap embeddings
    emb = ((V+U) /2).T
    writer.add_embedding(emb,metadata=words,
                     tag='final_embeddings' )

def update_tensorboard(): 
    # loss, weights, grads
    writer.add_scalar(tag='training_loss', scalar_value=loss, 
                          global_step=global_step)
    writer.add_histogram(tag='V_weights', values=V, global_step=global_step)
    writer.add_histogram(tag='U_weights', values=U, global_step=global_step)
    writer.add_histogram(tag='V_grad', values=V.grad, global_step=global_step)
    writer.add_histogram(tag='U_grad', values=U.grad, global_step=global_step)
    
    # some example predictions
    preds_d = {
            'p_dog_cute':          predict('dog','cute'),
            'p_cute_dog':          predict('cute', 'dog'),
            'p_dog_dog':           predict('dog','dog'),
            'p_eol_sol':           predict('<EOL>','<SOL>'),
            'p_bank_money':        predict('bank','money'),
            'p_withdraw_money':    predict('withdraw','money'),
            'p_money_withdraw':    predict('money','withdraw'),
            "p_dog_money":         predict('dog','money'),
            "p_bank_cat":          predict('bank','cat')
        }
    writer.add_scalars('example_predictions', preds_d, global_step=global_step)
    
    

In [13]:
## the word2vec algorithm with SGD 
# with SGDa
lr = 0.1
m = 2  # number of context words
n_epochs = 400
printfreq = 50
batch_sz = 16
idx = 0 
epoch = 0 
loss_l = []  # holds losses of each batch, we'll take an average for eval 
log_dir = path_runs + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = SummaryWriter(log_dir = log_dir)
while epoch < n_epochs: 
    loss_sum = 0
    X_batch,new_idx = get_batch(X,batch_sz,idx)
    if new_idx <= idx: 
        X = shuffle_X(X_l)
        epoch += 1
        printflag = True 
    else: printflag = False
    idx = new_idx
    for i, c in enumerate(X_batch):   # enumerates for center word c 
        dotprods = (V[:,word2idx[c]].reshape(-1,1)*U).sum(0)   # use in calculating p below
        min_idx,max_idx = max(0,i-m), min(len(X_batch)-1,i+m)
        for j in range(min_idx, max_idx+1):   # enumerate context words 
            if j == i: continue   # don't eval the center word
            p = softmax(dotprods)[word2idx[X_batch[j]]] 
            loss_sum += torch.log(p)
          #  print("center:",c,"| context:",o, " ", p.item(), torch.log(p).item())   
        
    # last term below compensates for sliding window hitting the end of batch. 
    # it doesn't matter much.
    # batch_sz*2*m <--> number of center words * number of context words
    # sum(range(m,0,-1))*2 <--> adjust for end words 
    loss_sum = -loss_sum / (batch_sz*2*m - sum(range(m,0,-1))*2) 
    loss_l.append(loss_sum.item())
    loss_sum.backward()
    #if epoch % printfreq == (printfreq-1) and printflag: 
    if epoch % printfreq == 0 and printflag:
        global_step = epoch*len(X) + i
        loss = np.mean(loss_l)
        update_tensorboard()
        print(epoch, global_step, np.mean(loss_l))
        loss_l = []
    U.data = U.data - lr * U.grad
    V.data = V.data - lr * V.grad
    U.grad.data.zero_()
    V.grad.data.zero_()
add_run_info_to_tensorboard()
writer.close()


50 8315 2.521885723047863
100 16615 2.3806907666671253
150 24915 2.3162625005938873
200 33215 2.2803790251720826
250 41515 2.2599090018483716
300 49815 2.248839707273509
350 58115 2.2443694553853
400 66415 2.239168651775964


## Diagnostics

In [14]:
# We average together V and U vectors to get the final embedding 
emb = ((U + V) / 2).detach().numpy()
df_emb = pd.DataFrame(emb, columns =  words)
def normalise_embedding_df(df):  
    return df /np.linalg.norm(df.values,ord=2,axis=0,keepdims=True)
df_emb_normalised = normalise_embedding_df(df_emb)

In [15]:
### Functions 
## Count how often each word appears in input
def get_word_input_counts():   return pd.Series(X).value_counts()

## Count how often a word appears within distance m of another word 
def get_word_cooccurences(): 
    """
    Returns a symmetrical data frame, where the column is the center word, 
     and the row is how often that word  appears in the m-window 
     around the center word
    """ 
    count_df = pd.DataFrame(np.zeros((n_words,n_words)),dtype=int,
                           index = words, columns = words)
    # X is the list of words
    for i,c in enumerate(X): 
        min_idx,max_idx = max(0,i-m),min(len(X)-1,i+m)
        for j in range(min_idx,max_idx+1): 
            if j == i: continue
            o_idx = word2idx[X[j]]
            count_df.iloc[o_idx][word2idx[c]] += 1
    return count_df

def get_dist_mat(df, metric): 
    """df: pandas data frame; metric: anything fitting in pairwise distances"""
    return pd.DataFrame(pairwise_distances(df.values.T, metric=metric).round(3), 
                       columns = df_emb.columns, index = df_emb.columns)
    
# Given a word w, what word is closest/furthest to it? 
def get_similar_n_words(w, n=5, direction='closest', metric='cosine'): 
    """w: word, emb: embeddings as np array
    direction: one of closest or furthest"""
    dist_mat = get_dist_mat(df_emb,metric=metric)
    v = dist_mat[w]
    if    direction == 'closest' : return v.nsmallest(n+1)[1:]
    elif  direction == 'furthest': return v.nlargest(n)
    else: raise Exception("direction must be one of 'closest' or 'furthest'")

## Do dim reduction with umap and plot results
def plot_umap(emb, n_dims=3, n_neighbours=3): 
    """n_dims: number of output dimensions 
    n_neighbours: local/global parameter for UMAP"""
    if   n_dims == 2:   cols = ['dim1','dim2']
    elif n_dims == 3:   cols = ['dim1','dim2','dim3']
    else:               raise Exception('dims should be 2 or 3')
    # put your data values in the fit_transform bit 
    emb_umap = umap.UMAP(n_neighbors=n_neighbours, n_components=n_dims).fit_transform(emb.T)
    # put in pandas dataframe to help plotting with plotly express
    emb_umap_df = pd.DataFrame(emb_umap, columns = cols)
    emb_umap_df['word'] = words
    if n_dims == 2: 
        return px.scatter(emb_umap_df,    x='dim1', y='dim2',          hover_name='word')
    elif n_dims ==3:
        return px.scatter_3d(emb_umap_df, x='dim1', y='dim2',z='dim3', hover_name='word')
         

In [16]:
print(get_word_input_counts())
get_word_cooccurences()

<EOL>       31
<SOL>       31
dog         18
money       14
bank        13
good         9
fluffy       9
cute         7
small        6
put          5
fun          5
big          5
cat          4
happy        3
withdraw     3
deposit      3
dtype: int64


Unnamed: 0,cat,dog,money,big,<SOL>,<EOL>,fun,bank,deposit,cute,fluffy,withdraw,good,put,happy,small
cat,0,1,0,1,3,4,0,0,0,2,2,0,1,0,0,2
dog,1,10,0,1,12,14,5,0,0,7,9,0,4,0,4,5
money,0,0,0,2,14,14,0,12,3,0,0,3,3,4,0,0
big,1,1,2,0,5,5,0,3,0,0,2,0,1,0,0,0
<SOL>,3,12,14,5,0,30,5,12,3,7,7,3,8,5,3,5
<EOL>,4,14,14,5,30,0,5,13,3,5,8,3,8,5,2,3
fun,0,5,0,0,5,5,0,0,0,0,2,0,3,0,0,0
bank,0,0,12,3,12,13,0,0,2,0,0,2,3,4,0,0
deposit,0,0,3,0,3,3,0,2,0,0,0,1,0,0,0,0
cute,2,7,0,0,7,5,0,0,0,2,2,0,0,0,1,2


In [17]:
get_similar_n_words('cute',direction = 'closest', metric='euclidean')

small     0.255
fluffy    0.274
happy     0.456
dog       0.500
fun       0.507
Name: cute, dtype: float32

In [18]:
get_similar_n_words('cute',direction = 'furthest', metric='euclidean')

put         2.910
withdraw    2.811
deposit     2.790
money       2.786
bank        2.691
Name: cute, dtype: float32

In [19]:
def get_len(x): b

In [295]:
#get_dist_mat(df_emb,'cosine')
plot_umap(df_emb_normalised.values, n_dims=3)


Embedding a total of 3 separate connected components using meta-embedding (experimental)



In [None]:
# write results to file to inspect
df_U,df_V = pd.DataFrame(U.detach().numpy()),pd.DataFrame(V.detach().numpy())
df_U.columns = df_V.columns = df_emb.columns = words

# distance matrices 
df_emb_cosine,df_emb_l2 = get_dist_mat(df_emb,'cosine'),get_dist_mat(df_emb,'euclidean')
df_emb_cosine.columns = df_emb_l2.columns = words
df_emb_cosine.index = df_emb_l2.index= words

In [None]:
with pd.ExcelWriter(path_results + 'word2vec_vectors.xlsx') as writer: 
    df_V.to_excel(writer, sheet_name='center_words')
    df_U.to_excel(writer, sheet_name='context_words')
    df_emb.to_excel(writer, sheet_name='emb_words')
    df_final_cosine.to_excel(writer, sheet_name='emb_cosine_distance')
    df_final_l2.to_excel(writer, sheet_name='emb_euclidean_distance')