Implementation of a model that generates word2vec embeddings for the Telugu language. The model is trained on a corpus of Wikipedia articles.

## Imports

In [None]:
%config Completer.use_jedi=False

In [None]:
import numpy as np

In [None]:
from tqdm.notebook import trange, tqdm

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
from itertools import chain, groupby

In [None]:
from collections import Counter

In [None]:
import string

In [None]:
import time

In [None]:
import re

In [None]:
import gc

In [None]:
import random
import math

In [None]:
import matplotlib.pyplot as plt

# 'seaborn-colorblind' was renamed to 'seaborn-v0_8-colorblind' in Matplotlib 3.6 and the
# old name was removed in later releases, so use whichever name this install provides
# and fall back to the default style if neither exists.
_STYLE_CANDIDATES = ('seaborn-v0_8-colorblind', 'seaborn-colorblind')
plt.style.use(next((name for name in _STYLE_CANDIDATES if name in plt.style.available), 'default'))

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Seed PyTorch's global RNG. NumPy's RNG (used by
# generate_word_contextword_pairs_dynamic) is not seeded in this cell.
torch.manual_seed(42)

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device 

## Generate skipgrams

In [None]:
# Run a garbage-collection pass before the memory-heavy preprocessing below.
gc.collect()

In [None]:
# Two small English sentences, lower-cased and whitespace-tokenised, used to
# smoke-test the pair generators.
_sample_texts = (
    'This is a notebook',
    'Another good notebook exists somewhere else as far as I know',
)
sentences = [text.lower().split() for text in _sample_texts]

In [None]:
# def generate_word_contextword_pairs(sentence, n_of_ngram=3):
#     n = n_of_ngram
#     indx = int((n_of_ngram-1)/2)
#     all_word_context_pairs = []
#     for i in np.arange(len(sentence)-n + 1):
#         all_word_context_pairs.append([(sentence[i+indx], context_word) for context_word in sentence[i:i+indx]])
#         all_word_context_pairs.append([(sentence[i+indx], context_word) for context_word in sentence[i+indx+1:i+n]])
    
#     return list(chain.from_iterable(all_word_context_pairs))

# def generate_word_contextword_pairs(sentence, window_size=5): 
#     all_word_context_pairs = []
#     for i in np.arange(window_size, len(sentence)-window_size):
#         all_word_context_pairs.append([(sentence[i], context_word) for context_word in sentence[i-window_size:i]])
#         all_word_context_pairs.append([(sentence[i], context_word) for context_word in sentence[i+1:i+window_size+1]])
    
#     return list(chain.from_iterable(all_word_context_pairs))

def generate_word_contextword_pairs(sentence, window_size=5):
    """Return every (word, context_word) skip-gram pair for one tokenised sentence.

    For each position ``i`` the context is the up-to-``window_size`` tokens on
    either side of ``sentence[i]``; windows are truncated at the sentence
    boundaries.

    Args:
        sentence: Sequence of tokens.
        window_size: Number of tokens taken on each side of the centre word.

    Returns:
        A flat list of ``(word, context_word)`` tuples, ordered by centre
        position, left context before right context.
    """
    pairs = []
    for i, word in enumerate(sentence):
        # Clamp the left edge at 0: a negative slice start wraps around to the
        # end of the sentence and silently drops the left context of early words.
        left = sentence[max(0, i - window_size):i]
        right = sentence[i + 1:i + window_size + 1]
        pairs.extend((word, context_word) for context_word in chain(left, right))
    return pairs

def generate_word_contextword_pairs_dynamic(sentence, window_size=5):
    """Return (word, context_word) skip-gram pairs using a randomly sized window.

    ``window_size`` is the maximum window: for every centre word the actual
    window is drawn uniformly from ``[1, window_size]`` with NumPy's global RNG
    (ref: Goldberg and Levy 2014). Windows are truncated at the sentence
    boundaries.

    Args:
        sentence: Sequence of tokens.
        window_size: Largest number of tokens taken on each side of a word.

    Returns:
        A flat list of ``(word, context_word)`` tuples, ordered by centre
        position, left context before right context.
    """
    pairs = []
    for i, word in enumerate(sentence):
        # `high` is exclusive, so this samples from 1..window_size inclusive.
        win_size = np.random.randint(low=1, high=window_size + 1)
        # Clamp the left edge at 0: a negative slice start wraps around to the
        # end of the sentence and silently drops the left context of early words.
        left = sentence[max(0, i - win_size):i]
        right = sentence[i + 1:i + win_size + 1]
        pairs.extend((word, context_word) for context_word in chain(left, right))
    return pairs

In [None]:
# Smoke test: dynamic-window pairs for the second sample sentence (maximum window 2).
generate_word_contextword_pairs_dynamic(sentences[1],2)

In [None]:
# Smoke test: fixed-window pairs for the second sample sentence (window 2).
generate_word_contextword_pairs(sentences[1],2)

In [None]:
# Read the corpus, one entry per line. `with` closes the file handle, and the
# explicit encoding avoids decoding the Telugu text with a platform-dependent
# default (assumes the file is stored as UTF-8).
with open('Data/te_wiki_cleaned_dedup.txt', encoding='utf-8') as corpus_file:
    te_corpus = corpus_file.read().splitlines()

_DIGITS_RE = re.compile(r'\d+')
_ZWNJ_RE = re.compile(r'\u200c')
# Remove digits and surrounding whitespace, then replace zero-width non-joiners
# (U+200C) with an ordinary space.
te_corpus = [_ZWNJ_RE.sub(' ', _DIGITS_RE.sub('', line).strip()) for line in te_corpus]
print(f'Length of corpus: {len(te_corpus)}')

In [None]:
# Peek at the first ten cleaned lines (still plain strings at this point).
print(te_corpus[:10])

In [None]:
# Remove punctuation
# The deletion table is built once instead of once per line.
# NOTE: string.punctuation only contains ASCII punctuation characters.
_punctuation_table = str.maketrans('', '', string.punctuation)
te_corpus = [line.translate(_punctuation_table) for line in te_corpus]

In [None]:
# Drop empty lines, then lower-case the ones that remain
non_empty_lines = filter(None, te_corpus)
te_corpus = list(map(str.lower, non_empty_lines))

In [None]:
# Keep only lines that have at least 15 whitespace-separated words
stripped_lines = (line.strip() for line in te_corpus)
te_corpus = [line for line in stripped_lines if len(line.split()) >= 15]
print(f'Length of corpus: {len(te_corpus)}')

In [None]:
# Tokenize: split every line on whitespace into a list of words
te_corpus = list(map(str.split, map(str.strip, te_corpus)))

In [None]:
# Peek at the first ten sentences, now lists of tokens.
print(te_corpus[:10])

In [None]:
# Occurrence count of every distinct token in the corpus
all_tokens = chain.from_iterable(te_corpus)
unigram_freq = dict(Counter(all_tokens))
print(f'Number of tokens: {len(unigram_freq)}')

In [None]:
min_occurrences = 50
# Keep only the words seen strictly more than `min_occurrences` times. A dict
# comprehension replaces the list comprehension that was run only for its
# `dict.update` side effects.
unigram_freq_filtered = {
    word: count for word, count in unigram_freq.items() if count > min_occurrences
}
print(f'Number of tokens: {len(unigram_freq_filtered)}')

In [None]:
# Show ten (word, count) entries of the filtered frequency table.
list(unigram_freq_filtered.items())[:10]

In [None]:
# Vocabulary: the frequent words, in the insertion order of the filtered table
# (no '<unk>' entry is added)
vocab = list(unigram_freq_filtered)
vocab_size = len(vocab)

In [None]:
# Lookup tables between vocabulary positions and words
indx_to_word = dict(enumerate(vocab))
word_to_idx = {token: position for position, token in indx_to_word.items()}

In [None]:
# Handle low freq words
# Out-of-vocabulary words are dropped rather than mapped to '<unk>'
# (ref: Goldberg and Levy 2014). Every sentence is kept, possibly shortened.
# Membership is tested against a set: `word in vocab` on the list is a linear
# scan for every token of the corpus.
_vocab_lookup = set(vocab)
te_corpus_N = [
    [word for word in sentence if word in _vocab_lookup]
    for sentence in te_corpus
]

In [None]:
# Sentence counts before and after filtering; the filter removes words, not
# sentences, so one filtered sentence is appended per input sentence.
len(te_corpus), len(te_corpus_N)

In [None]:
# Count the tokens that survived filtering and lay the counts out in `vocab`
# order, so entry i is the frequency of the word with index i. Taking
# `Counter(...).values()` directly only lines up with `vocab` because both
# happen to share the same first-occurrence order.
_token_counts = Counter(chain.from_iterable(te_corpus_N))
vocab_freq = torch.Tensor([_token_counts[word] for word in vocab])

In [None]:
# Smooth the raw counts with a 0.75 exponent; the result is later passed to the
# model as its negative-sampling weights.
sampling_dist = vocab_freq**0.75

In [None]:
# Normalise so the weights sum to 1.
sampling_dist = sampling_dist/sampling_dist.sum()

In [None]:
# Build the (word, context word) training pairs with a fixed window of 5.
# generate_word_contextword_pairs_dynamic(sentence, 5) is the random-window alternative.
pairs_per_sentence = (generate_word_contextword_pairs(sentence, 5) for sentence in te_corpus_N)
data = list(chain.from_iterable(pairs_per_sentence))
print(f'A few word-context pairs: {data[0:10]}')
# Swap each word for its vocabulary index
data = [(word_to_idx[center], word_to_idx[context]) for center, context in data]
print(f'A few word-context pairs interms of vocab indices: {data[0:10]}')

In [None]:
# Shuffled mini-batches of 512 (word index, context index) pairs.
dataloader = torch.utils.data.DataLoader(data, batch_size=512, shuffle=True)

In [None]:
class SkipGramNegativeSampling(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Two embedding tables are learned: ``input_embedding`` for centre words and
    ``output_embedding`` for context (and negative) words.

    Args:
        embedding_size: Dimensionality of each embedding vector.
        vocab_size: Number of rows in each embedding table.
        num_neg_samples: Default number of negative samples per training pair;
            stored for callers that size their negative batches from it.
        sampling_weights: Optional 1-D tensor of ``vocab_size`` non-negative
            weights used to draw negative samples. Defaults to a uniform
            distribution over the vocabulary.
    """

    def __init__(self, embedding_size, vocab_size, num_neg_samples=5, sampling_weights=None):
        super().__init__()
        self.input_embedding = nn.Embedding(vocab_size, embedding_size)
        self.output_embedding = nn.Embedding(vocab_size, embedding_size)

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_neg_samples = num_neg_samples
        if sampling_weights is None:
            sampling_weights = torch.ones(vocab_size) / vocab_size
        # Kept as a plain attribute (not a buffer), so it stays on its original
        # device when the module is moved.
        self.sampling_weights = sampling_weights

    def forward(self, inputs, outputs):
        """Return the dot product of each input/output embedding pair.

        Training does not need this method; it calls :meth:`loss` directly.

        Args:
            inputs: Index tensor of centre words.
            outputs: Index tensor of context words, same shape as ``inputs``.

        Returns:
            The element-wise product of the two embeddings summed over dim 1.
        """
        out = self.input_embedding(inputs) * self.output_embedding(outputs)
        return out.sum(1)

    def negativeSampling(self, num_samples):
        """Draw ``num_samples`` word indices (with replacement) from ``sampling_weights``."""
        return torch.multinomial(self.sampling_weights, num_samples, replacement=True)

    def loss(self, inputs, outputs, negative_samples):
        """Return the negative-sampling loss summed over the batch.

        Args:
            inputs: 1-D index tensor of centre words, shape ``(batch,)``.
            outputs: 1-D index tensor of true context words, shape ``(batch,)``.
            negative_samples: 2-D index tensor of shape ``(batch, k)``; ``k``
                may be any positive number of negatives per pair.

        Returns:
            Scalar tensor: minus the sum over the batch of
            ``logsigmoid(u . v_pos) + sum_k logsigmoid(-u . v_neg_k)``.
        """
        input_em = self.input_embedding(inputs)                       # (batch, dim)
        output_em = self.output_embedding(outputs)                    # (batch, dim)
        neg_samples_em = self.output_embedding(negative_samples)      # (batch, k, dim)

        positive_term = F.logsigmoid(torch.sum(input_em * output_em, dim=1))
        # Broadcast the centre embedding across the k negatives instead of
        # materialising `num_neg_samples` copies with `.repeat`; this also lets
        # `negative_samples` carry any number of negatives per row.
        negative_scores = torch.sum(input_em.unsqueeze(1) * neg_samples_em, dim=2)
        negative_term = torch.sum(F.logsigmoid(-negative_scores), dim=1)

        return -torch.sum(positive_term + negative_term)

    
    
    

In [None]:
# Alias for the filtered, tokenised corpus (same list object, no copy).
corpus = te_corpus_N

In [None]:
# torch.save(corpus,'teWikiCorpus.pkl')

In [None]:
# 100-dimensional embeddings, 5 negatives per pair, negatives drawn from the
# smoothed frequency distribution computed above.
model = SkipGramNegativeSampling(embedding_size= 100, vocab_size=vocab_size, num_neg_samples= 5, sampling_weights=sampling_dist)

In [None]:
# data = data[:1000000]
# NUM_BATCHES=100
LEARNING_RATE =0.01
NUM_EPOCHS = 100
LOG_INTERVAL = 10

# Free memory
gc.collect()
torch.cuda.empty_cache()

start = time.time() # timer

# training
model = model.to(device)
model.train()

# Summed loss of every epoch, kept for plotting afterwards
loss_values =[]

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
# dataloader = torch.utils.data.DataLoader(data, batch_size=int(len(data)/NUM_BATCHES), shuffle=True)
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[ 100, 150, 200], gamma=0.1)
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, 
#                                                 max_lr=0.025, 
#                                                 steps_per_epoch=len(dataloader), 
# #                                                 anneal_strategy = 'linear',
#                                                 epochs=NUM_EPOCHS
#                                                )
scheduler = None


for epoch in trange(NUM_EPOCHS):
    loss_total = 0
    
    for word_indx, context_word_indx in dataloader:
         
        # zero the gradients
        optimizer.zero_grad()     

        # compute loss
        inp = word_indx.to(device)
        outp = context_word_indx.to(device)
        # Draw num_neg_samples negatives per pair in one call, then shape them
        # to (batch, num_neg_samples); inp.shape[0] is the batch size.
        negative_samples = model.negativeSampling(model.num_neg_samples*inp.shape[0])
        negative_samples = negative_samples.reshape(inp.shape[0],model.num_neg_samples)
        negative_samples = negative_samples.to(device)
        
        loss = model.loss(inp, outp,negative_samples)
        loss_total += loss.item()

        # backward pass
        loss.backward()
        
        # optimization step
        optimizer.step()
        
        # when a scheduler is configured it is stepped once per batch
        if scheduler is not None:
            scheduler.step()
        
    loss_values.append(loss_total)
            
    if epoch%LOG_INTERVAL ==0:
        print("Epoch " + str(epoch) + " done. Loss: " + str(loss_total))
#         torch.save(model, 'saved_model.pkl')
        
print("Epoch " + str(epoch) + " done. Loss: " + str(loss_total))
elapsed = time.time() - start
# Report NUM_EPOCHS: the previously referenced `n_epochs` is never defined, so
# this line raised NameError only after all training had finished.
print(f"Model trained for {NUM_EPOCHS} in {elapsed/60: .2f} minutes")

In [None]:
# torch.save(model, 'saved_model_1.pkl')

In [None]:
# Plot the summed training loss of each epoch.
plt.plot(loss_values)

In [None]:
# Move the model back to the CPU and wrap the learned input embeddings in a
# frozen nn.Embedding.
model.to('cpu')
em = nn.Embedding.from_pretrained(model.input_embedding.weight, freeze=True)
# The embedding weight matrix itself: one row per vocabulary index.
word_emb = em.weight

In [None]:
def cosineSimilarity(w1, w2):
    """Return the cosine of the angle between the vectors `w1` and `w2`."""
    norm_product = np.linalg.norm(w1) * np.linalg.norm(w2)
    return np.dot(w1, w2) / norm_product

In [None]:
def findSimilar(word, word_emb, k=5):
    """Return the `k` vocabulary words most cosine-similar to `word`.

    Args:
        word: Query word; must be a key of the module-level `word_to_idx`.
        word_emb: Embedding matrix with one row per vocabulary index, given
            either as a tensor or as an `nn.Embedding` (its `weight` is used).
        k: Number of neighbours to return.

    Returns:
        A list of up to `k` `(word, similarity)` tuples in decreasing order of
        similarity. The single best match is skipped on the assumption that it
        is the query word itself.
    """
    # Accept an nn.Embedding as well as a bare weight tensor, and detach so
    # trainable parameters can be passed directly.
    weights = getattr(word_emb, 'weight', word_emb).detach()
    query = weights[word_to_idx[word]]

    # One vectorised pass over the whole vocabulary.
    sim = F.cosine_similarity(weights, query.unsqueeze(0), dim=1)
    # topk raises if asked for more entries than exist.
    top_n = min(k + 1, sim.shape[0])
    val, ind = torch.topk(sim, top_n)
    return [(vocab[i], v) for i, v in zip(ind.tolist()[1:], val.tolist()[1:])]


In [None]:

# findSimilar('queen', word_emb,10)
# word = 'తక్కువ' #'లోపల'#'పాలు'#'సమీప'#'దివ్యాంగుల'#'జైపూర్'
# word = 'పంటలు'#'నీరు'#'తక్కువ' #'పాలు'#'అన్న' #'అక్కడ' #'దేశం'
# word = 'దక్షిణ'#'తండ్రి'
word = 'బంగారు'#'విద్య'#'రాజ్యాంగం'#'ఇథియోపియా'#'బిలియన్ల'#'నదులు'#'యుద్ధం'#'మొక్కలు'#'గాంధీ'#'విజయం'#'అభివృద్ధి'
# word = 'ఎనిమిది'#'కుటుంబం'#'ఆలయ'#'సాయంత్రం'#'నక్షత్రం' #'దిగుమతి'#'పాఠశాల'#'వినియోగం'
# word = 'గ్రామం'
# word = 'లక్ష'#'ప్రముఖ'#'జర్మన్'#'ప్రజలు'#'గుండె'#'మట్టి'#'ఆధిక్యత'#'చెట్లు'#'పెద్ద'#'ఆధారిత'#'ఇతర'#'జిల్లా'#'ఒకటి'#'దూరం' #'కణాలు'
# model.input_embedding.requires_grad_(False)
# Pass the weight matrix, not the nn.Embedding module: findSimilar indexes its
# argument and reads `.shape`, neither of which the module supports.
findSimilar(word, word_emb, 10)

In [None]:
# English example set: ['king','queen', 'daughter', 'son', 'wife', 'husband',  'mother','father', 'this','that']

# Words whose embeddings will be plotted
plot_words = ['కింద', 'మీద']

# Vocabulary indices of those words, then the matching rows of the embedding matrix
ww_idx = list(map(word_to_idx.__getitem__, plot_words))
embedding_matrix = em.weight.numpy()
word_emb1 = embedding_matrix[ww_idx, :]

In [None]:
# Project the selected word vectors onto two principal components.
pca = PCA(n_components=2,)
ww = pca.fit_transform(word_emb1)

# NOTE(review): `px` and `HTML` are not imported in any cell visible here —
# presumably `import plotly.express as px` and `from IPython.display import HTML`;
# confirm and add them to the imports, otherwise this cell raises NameError.
fig = px.scatter(ww, x=0, y=1)
# fig.show()

# Label every plotted point with its word.
for i, idx in enumerate(ww_idx):
    fig.add_annotation(x=ww[i,0],y=ww[i,1], text=indx_to_word[idx])
HTML(fig.to_html())

# pca = PCA(n_components=2)
# pca.fit(em.weight.numpy())
# ww = pca.transform(em.weight.numpy())

# fig = px.scatter(ww[ww_idx,:], x=0, y=1)
# # fig.show()

# for idx in ww_idx:
#     fig.add_annotation(x=ww[idx,0],y=ww[idx,1], text=indx_to_word[idx])
# HTML(fig.to_html())