Implementation of a model that generates fastText embeddings for the Telugu language. The model is trained on a corpus of Wikipedia articles.

Note: Preprocessing is skipped; the processed corpus from word2vec_sg_te is loaded directly. See the word2vec_sg_te notebook for the processing details.

## Imports

In [1]:
%config Completer.use_jedi=False

In [2]:
import numpy as np

In [3]:
from tqdm.notebook import trange, tqdm

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [5]:
from itertools import chain
from collections import Counter
import string
import time
import re
import gc
import random
import math

In [6]:
# Seed every random number generator available to this notebook so that runs
# are repeatable: Python's `random`, NumPy (generate_word_contextword_pairs_dynamic
# draws its window sizes from np.random) and PyTorch (parameter initialisation,
# DataLoader shuffling and torch.multinomial negative sampling).
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

In [7]:
import matplotlib.pyplot as plt

In [8]:
# Use the first CUDA GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression: shows the selected device as the cell output.
device

## Generate skipgrams

In [9]:

def generate_word_contextword_pairs(sentence, window_size=5):
    """Build skip-gram ``(word, context_word)`` pairs using a fixed window.

    For every position in ``sentence`` the contexts are the up-to
    ``window_size`` items before it followed by the up-to ``window_size``
    items after it.

    Args:
        sentence: sequence of tokens (must support slicing).
        window_size: number of neighbours taken on each side.

    Returns:
        A flat list of ``(word, context_word)`` tuples, ordered by word
        position, left contexts before right contexts.
    """
    all_word_context_pairs = []
    for i, word in enumerate(sentence):
        # Clamp the left edge at 0. A negative start index does not mean
        # "before the beginning": it counts from the END of the sequence, which
        # made the left slice empty for words near the start of the sentence.
        left = sentence[max(0, i - window_size):i]
        right = sentence[i + 1:i + window_size + 1]
        all_word_context_pairs.extend(
            (word, context_word) for context_word in chain(left, right)
        )
    return all_word_context_pairs

def generate_word_contextword_pairs_dynamic(sentence, window_size=5):
    """Build skip-gram ``(word, context_word)`` pairs using a dynamic window.

    ``window_size`` is the maximum window: for each word the effective window
    is drawn uniformly from ``[1, window_size]`` with ``np.random.randint``
    (one draw per word).
    Ref: Goldberg and Levy 2014

    Args:
        sentence: sequence of tokens (must support slicing).
        window_size: largest number of neighbours taken on each side.

    Returns:
        A flat list of ``(word, context_word)`` tuples, ordered by word
        position, left contexts before right contexts.
    """
    all_word_context_pairs = []
    for i, word in enumerate(sentence):
        win_size = np.random.randint(low=1, high=window_size + 1)
        # Clamp the left edge at 0. A negative start index counts from the END
        # of the sequence, which made the left slice empty for words closer to
        # the sentence start than the sampled window.
        left = sentence[max(0, i - win_size):i]
        right = sentence[i + 1:i + win_size + 1]
        all_word_context_pairs.extend(
            (word, context_word) for context_word in chain(left, right)
        )
    return all_word_context_pairs

## Telugu wikipedia corpus

In [10]:
# Load the already-processed corpus. The rest of the notebook iterates it as a
# collection of sentences, each a sequence of word tokens.
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code -- only load corpus files from a trusted source.
te_corpus = torch.load('Data/teWikiCorpus.pkl')

In [11]:
# Sanity check: show the first 10 entries of the corpus.
print(te_corpus[:10])

In [12]:
# Occurrence count of every token across all sentences of the corpus.
unigram_freq = Counter(chain.from_iterable(te_corpus))

In [13]:
# Word vocabulary: '<unk>' takes index 0, followed by every distinct corpus
# token in the counter's own iteration order.
vocab_w = ['<unk>', *unigram_freq]
vocab_w_size = len(vocab_w)

In [14]:
# Generate character ngrams for each word
# Ref: page 3 of  Enriching Word Vectors with Subword Information
# https://arxiv.org/pdf/1607.04606.pdf

# with hash
import hashlib


def _stable_hash(text):
    """Return a 64-bit integer hash of ``text`` that is identical in every process."""
    digest = hashlib.blake2b(text.encode('utf-8'), digest_size=8).digest()
    return int.from_bytes(digest, 'big')


def extract_ngrams(str, n1, n2=None):
    """Return the hashed character n-grams of a word.

    The word is wrapped in the boundary markers ``<`` and ``>`` before the
    n-grams are cut, and the hash of the whole wrapped word is always
    included.

    The built-in ``hash()`` is deliberately not used: for strings it is salted
    per interpreter process, so the ids (and every table derived from them)
    would change from one session to the next and a saved embedding matrix
    could not be matched to its n-grams again.

    Args:
        str: the word (name kept for compatibility; it shadows the builtin
            inside this function only).
        n1: n-gram length, or the smallest length when ``n2`` is given.
        n2: optional EXCLUSIVE upper bound on the length, i.e. lengths
            ``n1 .. n2 - 1`` are produced (``3, 7`` yields 3- to 6-grams).

    Returns:
        A list of distinct integer hashes, in no particular order.
    """
    lengths = [n1] if n2 is None else range(n1, n2)
    wrapped = '<' + str + '>'
    hashed = {
        _stable_hash(wrapped[i:i + n])
        for n in lengths
        # an n longer than the wrapped word gives an empty range: no n-grams
        for i in range(len(wrapped) - n + 1)
    }
    hashed.add(_stable_hash(wrapped))
    return list(hashed)

# without hash
def extract_ngrams_nohash(str, n1, n2=None):
    """Return the distinct character n-grams of a word as plain strings.

    The word is wrapped in ``<`` and ``>`` first; the wrapped word itself is
    always part of the result. With ``n2`` given, n-gram lengths run from
    ``n1`` up to but excluding ``n2``; otherwise only length ``n1`` is used.
    The order of the returned list is unspecified.
    """
    lengths = [n1] if n2 is None else np.arange(n1, n2)
    padded = '<' + str + '>'
    pieces = {padded}
    for size in lengths:
        start_count = len(padded) - size + 1
        pieces.update(padded[start:start + size] for start in range(start_count))
    return list(pieces)


In [15]:
# Example: the 4-grams (plus the whole wrapped word) of one Telugu word.
print(extract_ngrams_nohash('విస్తీర్ణంలో', 4))

In [16]:
# Example: the 3-grams (plus the whole wrapped word) of one Telugu word.
print(extract_ngrams_nohash('రెండవ', 3))

In [17]:
# Hashed character n-grams (lengths 3 to 6, since the upper bound 7 is
# exclusive) for every entry of the word vocabulary.
char_ngrams_dict = {word: extract_ngrams(word, 3, 7) for word in vocab_w}

In [18]:
# vocab of character ngrams: every distinct n-gram id used by any word
vocab = {ngram for ngrams in char_ngrams_dict.values() for ngram in ngrams}

In [19]:
# Corpus counts aligned with the word indices: entry i is the count of
# vocab_w[i]. Using unigram_freq.values() directly is one element short and
# shifted by one position, because '<unk>' was inserted at index 0 of vocab_w,
# while the indices sampled from this distribution are looked up in the
# word-level output embedding. A word absent from the counter gets weight 0.
vocab_freq = torch.Tensor([unigram_freq[word] for word in vocab_w])
# Number of distinct character n-grams (rows of the input embedding).
vocab_size  = len(vocab)

In [20]:
# Negative-sampling distribution: counts raised to the 3/4 power, then
# normalised so the weights sum to one.
sampling_dist = torch.pow(vocab_freq, 0.75)
sampling_dist = sampling_dist.div(sampling_dist.sum())

In [21]:
# Lookup tables between a word and its position in vocab_w (both directions).
indx_to_word = dict(enumerate(vocab_w))
word_to_idx = {word: i for i, word in indx_to_word.items()}

In [22]:
# Lookup tables between a character n-gram id and its row index (both
# directions); row order follows the iteration order of the `vocab` set.
indx_to_subword = dict(enumerate(vocab))
subword_to_idx = {subword: i for i, subword in indx_to_subword.items()}

In [23]:
# Entry i lists the n-gram row indices of the word whose index is i
# (kept as plain Python lists, not tensors).
word_indx_to_char = [
    [subword_to_idx[subword] for subword in char_ngrams_dict[word]]
    for word in indx_to_word.values()
]

In [24]:
# Fixed-window (word, context) pairs for the whole corpus, first as tokens
# and then as word-index tuples.
win_size=2
data_sh = [
    pair
    for sentence in te_corpus
    for pair in generate_word_contextword_pairs(sentence, win_size)
]
data = [(word_to_idx[center], word_to_idx[context]) for center, context in data_sh]

In [25]:
# Shuffled loader over the fixed-window pairs in batches of 4096. This is the
# loader handed to find_learning_rate below; train() builds its own loaders.
# NOTE: BATCH_SIZE is reassigned to 1024 in a later cell.
BATCH_SIZE = 1024*4
dataloader = torch.utils.data.DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)

## Model

In [26]:
def get_inp_and_offsets(inputs):
    """Flatten the n-gram indices of a batch of words for ``nn.EmbeddingBag``.

    ``inputs`` is an iterable of word indices. Returns a pair of tensors moved
    to the global ``device``: the concatenated n-gram indices of all words, and
    the start position of each word's run inside that flat tensor.
    """
    flat_ids = []
    bag_starts = []
    for word_idx in inputs:
        # the next word's n-grams begin where the flat list currently ends
        bag_starts.append(len(flat_ids))
        flat_ids.extend(word_indx_to_char[word_idx])
    return torch.tensor(flat_ids).to(device), torch.tensor(bag_starts).to(device)

In [27]:
class FastTextTorch(nn.Module):
    """Skip-gram model whose input word vectors are pooled character n-grams.

    The input side is an ``nn.EmbeddingBag`` over n-gram indices (one bag per
    word); the output side is a plain ``nn.Embedding`` over word indices.

    Args:
        embedding_size: dimensionality of both embedding tables.
        in_vocab_size: number of rows of the n-gram (input) table.
        out_vocab_size: number of rows of the word (output) table.
        num_neg_samples: negative samples per positive pair.
        char_ngrams_dict: accepted for interface compatibility; not used.
    """

    def __init__(self, embedding_size, in_vocab_size, out_vocab_size, num_neg_samples, char_ngrams_dict):
        super().__init__()
        self.embedding_size = embedding_size
        self.in_vocab_size = in_vocab_size
        self.out_vocab_size = out_vocab_size
        self.num_neg_samples = num_neg_samples

        self.input_embedding = nn.EmbeddingBag(in_vocab_size, embedding_size)
        self.output_embedding = nn.Embedding(out_vocab_size, embedding_size)

    def forward(self, inputs, outputs):
        """Return one raw score (dot product) per (input word, output word) pair.

        ``inputs`` holds word indices, expanded into n-gram bags through the
        module-level ``get_inp_and_offsets``; ``outputs`` is a tensor of word
        indices into the output table.
        """
        ngram_ids, bag_offsets = get_inp_and_offsets(inputs)
        word_vecs = self.input_embedding(ngram_ids, bag_offsets)
        context_vecs = self.output_embedding(outputs)
        return torch.sum(word_vecs * context_vecs, dim=1)

    def negativeSampling(self, num_samples, sampling_weights):
        """Draw ``num_samples`` indices (with replacement) from ``sampling_weights``."""
        return torch.multinomial(sampling_weights, num_samples, replacement=True)

    def loss(self, inputs, outputs, negative_samples):
        """Return the summed negative-sampling loss of a batch.

        NOTE(review): ``inputs`` goes to the EmbeddingBag without offsets, so
        this presumably expects a 2-D tensor of n-gram indices rather than the
        word indices ``forward`` takes -- confirm before use. The training
        code in this notebook does not call this method.
        """
        input_em = self.input_embedding(inputs)
        positive_scores = (input_em * self.output_embedding(outputs)).sum(dim=1)
        # one copy of each input vector per negative sample (a view, no data copy)
        tiled_input = input_em.unsqueeze(1).expand(-1, self.num_neg_samples, -1)
        negative_scores = (tiled_input * self.output_embedding(negative_samples)).sum(dim=2)
        per_example = F.logsigmoid(positive_scores) + F.logsigmoid(-negative_scores).sum(dim=1)
        return -per_example.sum()

In [28]:
# 100-dimensional model: one input row per character n-gram, one output row
# per vocabulary word, 5 negative samples per positive pair.
model = FastTextTorch(embedding_size=100, in_vocab_size=vocab_size,out_vocab_size=vocab_w_size, char_ngrams_dict=char_ngrams_dict, num_neg_samples=5)

In [29]:
# Move the model's parameters to the selected device.
model = model.to(device)

In [30]:
# Run the garbage collector, then ask PyTorch to release its cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [31]:
# Training hyper-parameters.
# NOTE: BATCH_SIZE replaces the 1024*4 value used when the first `dataloader`
# was built; LEARNING_RATE is overwritten after the learning-rate sweep.
BATCH_SIZE = 1024
LEARNING_RATE =0.01
NUM_EPOCHS = 30
LOG_INTERVAL = 1

# NOTE: find_learning_rate() and train() each create their own Adam optimizer,
# so this module-level one is not the optimizer that updates the model there.
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
# Summed (not averaged) binary cross-entropy on raw scores; read as a global
# by find_learning_rate() and train().
loss_func = nn.BCEWithLogitsLoss(reduction='sum')

In [32]:
# Batches per epoch for the fixed-window pairs at the current BATCH_SIZE
# (rounded up); train() uses it as the scheduler's steps_per_epoch.
len_dataloader = math.ceil(len(data) / BATCH_SIZE)

In [33]:
def getInputVector(inp_word_indx):
    """Multi-hot encode the character n-grams of each word in a batch.

    Args:
        inp_word_indx: sequence or tensor of word indices, one per row of the
            result.

    Returns:
        A float tensor of shape ``(len(inp_word_indx), len(subword_to_idx))``
        with 1 in the columns of each word's n-grams and 0 elsewhere.
    """
    inp_vec = torch.zeros(len(inp_word_indx), len(subword_to_idx))

    for i, word_i in enumerate(inp_word_indx):
        # word_indx_to_char stores plain Python lists, which have no .long();
        # as_tensor accepts a list or a tensor and yields int64 indices.
        ngram_ids = torch.as_tensor(word_indx_to_char[int(word_i)], dtype=torch.long)
        inp_vec[i, ngram_ids] = 1

    return inp_vec
    
    

In [34]:
def get_inp_and_offsets_transform(inputs):
    """Flatten the n-gram indices of a batch of words, without moving devices.

    ``inputs`` is an iterable of word indices. Returns a tuple of two tensors
    as created by ``torch.tensor``: the concatenated n-gram indices of all
    words, and the start position of each word's run in that flat tensor.
    """
    ngram_ids = []
    starts = []
    for word_idx in inputs:
        starts.append(len(ngram_ids))
        ngram_ids += word_indx_to_char[word_idx]
    return (torch.tensor(ngram_ids), torch.tensor(starts))

In [35]:
# Find optimal Learning rate
def find_learning_rate(model, dataloader, initial_lr = 1e-5, max_lr = 1e2,  num_lr_finder_steps=100):
    """Sweep the learning rate geometrically and record the loss at each step.

    One batch is consumed per step. The recorded loss is the same objective
    that ``train`` optimises: summed BCE of the true pairs (target 1) plus the
    summed BCE of ``model.num_neg_samples`` draws of noise words (target 0).
    The model is updated during the sweep, so run it on a freshly initialised
    model.

    Relies on the notebook globals ``device``, ``loss_func`` and
    ``sampling_dist``.

    Args:
        model: model to probe; moved to ``device`` and put in training mode.
        dataloader: yields ``(word_indx, context_word_indx)`` index batches.
        initial_lr: base learning rate of the sweep.
        max_lr: learning rate reached by the multiplier after the last step.
        num_lr_finder_steps: number of optimisation steps in the sweep.

    Returns:
        ``np.ndarray`` of shape ``(num_lr_finder_steps, 2)`` whose rows are
        ``[learning_rate, loss]``.
    """
    # Free memory
    gc.collect()
    torch.cuda.empty_cache()

    # training mode
    model.to(device)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr)
    # constant factor by which the learning rate grows every step
    growth = np.exp(np.log(max_lr / initial_lr) / num_lr_finder_steps)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=lambda step: growth ** (step + 1))

    batches = iter(dataloader)
    lr_list = []
    for _step in trange(num_lr_finder_steps):
        try:
            word_indx, context_word_indx = next(batches)
        except StopIteration:
            # fewer batches than sweep steps: start another pass over the data
            batches = iter(dataloader)
            word_indx, context_word_indx = next(batches)

        # zero the gradients
        optimizer.zero_grad()

        # positive pairs
        outp = context_word_indx.to(device)
        inp = word_indx.tolist()
        pred_p = model(inp, outp)
        loss = loss_func(pred_p, torch.ones_like(pred_p))

        # Negative pairs: every draw must be added to the loss. Previously the
        # predictions were computed and thrown away, so the sweep only measured
        # the positive term.
        for _draw in range(model.num_neg_samples):
            negative_samples = model.negativeSampling(outp.shape[0], sampling_dist)
            negative_samples = negative_samples.to(device)
            pred_n = model(inp, negative_samples)
            loss = loss + loss_func(pred_n, torch.zeros_like(pred_n))

        lr_list.append([scheduler.get_last_lr()[0], loss.item()])

        # backward pass
        loss.backward()

        # optimization step
        optimizer.step()

        # scheduler step
        scheduler.step()

    return np.array(lr_list)
        

In [36]:
# Remember to run lr_finder on just initialised model
# (find_learning_rate takes optimizer steps, so it changes the model's weights).
model = FastTextTorch(embedding_size=100, in_vocab_size=vocab_size,out_vocab_size=vocab_w_size, char_ngrams_dict=char_ngrams_dict, num_neg_samples=5)
# Sweep 100 steps from a base rate of 1e-4 towards 1e2; rows are [lr, loss].
lr_list = find_learning_rate(model, dataloader, initial_lr=1e-4,max_lr=1e2,num_lr_finder_steps=100)

In [37]:
# Loss (column 1) against learning rate (column 0), both axes logarithmic.
plt.loglog(lr_list[:,0], lr_list[:,1])

In [38]:
# Hard-coded learning rate used as the peak rate of the one-cycle schedule in
# train(). NOTE(review): presumably read off the sweep plot above -- confirm.
optimal_lr = 0.005
LEARNING_RATE = optimal_lr

In [39]:
# Total loss of every finished epoch; train() appends to this list.
loss_values =[]

def train(model, n_epochs):
    """Train ``model`` with skip-gram negative sampling and save a checkpoint.

    Every epoch re-samples the (word, context) pairs with a dynamic window of
    at most ``win_size``. For each batch the loss is the summed binary
    cross-entropy of the true pairs (target 1) plus the summed binary
    cross-entropy of ``model.num_neg_samples`` independent draws of noise
    words (target 0).

    Relies on the notebook globals ``LEARNING_RATE``, ``BATCH_SIZE``,
    ``LOG_INTERVAL``, ``len_dataloader``, ``win_size``, ``te_corpus``,
    ``word_to_idx``, ``sampling_dist``, ``loss_func`` and ``device``.

    Args:
        model: network to optimise; called as
            ``model(list_of_word_indices, tensor_of_word_indices)`` and
            expected to provide ``num_neg_samples`` and ``negativeSampling``.
        n_epochs: number of passes over the corpus.

    Returns:
        The same ``model`` object, after training.

    Side effects:
        Appends each epoch's total loss to the global ``loss_values``, prints
        progress, and writes ``FastText_te_{n_epochs}_epochs.pt``.
    """
    start = time.time()
    model.train()
    gc.collect()
    torch.cuda.empty_cache()

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
#     scheduler =None
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                max_lr=LEARNING_RATE,
                                                steps_per_epoch=len_dataloader,
                                                anneal_strategy = 'linear',
                                                epochs=n_epochs,
                                                div_factor = 10,
                                                final_div_factor = 1000,
                                                pct_start = 0.4,
                                                three_phase = True,
                                               )
    # The schedule is sized from len_dataloader, but the number of batches per
    # epoch varies because the pairs are re-sampled. OneCycleLR raises
    # ValueError when stepped beyond its total, so stop stepping at the end of
    # the schedule and keep the final learning rate for any extra batches.
    total_sched_steps = len_dataloader * n_epochs
    sched_steps_taken = 0

    for epoch in trange(n_epochs):
        loss_total = 0

        # dynamically generated context word pairs
        data_sh = chain.from_iterable(
            generate_word_contextword_pairs_dynamic(sentence, win_size)
            for sentence in te_corpus)
        data = [(word_to_idx[a], word_to_idx[b]) for a, b in data_sh]
        dataloader = torch.utils.data.DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)

        for word_indx, context_word_indx in tqdm(dataloader, total = len(dataloader)):

            # zero the gradients
            optimizer.zero_grad()

            # positive pairs
            outp = context_word_indx.to(device)
            inp = word_indx.tolist()
            pred_p = model(inp, outp)
            loss = loss_func(pred_p, torch.ones_like(pred_p))

            # Negative pairs: accumulate the loss of EVERY draw. The update
            # used to sit after the loop, so only the last of the
            # num_neg_samples draws contributed to the objective.
            for _ in range(model.num_neg_samples):
                negative_samples = model.negativeSampling(outp.shape[0], sampling_dist)
                negative_samples = negative_samples.to(device)
                pred_n = model(inp, negative_samples)
                loss = loss + loss_func(pred_n, torch.zeros_like(pred_n))

            loss_total += loss.item()

            # backward pass
            loss.backward()

            # optimization step
            optimizer.step()

            if scheduler is not None and sched_steps_taken < total_sched_steps - 1:
                scheduler.step()
                sched_steps_taken += 1

        loss_values.append(loss_total)

        if epoch%LOG_INTERVAL ==0:
            print("Epoch " + str(epoch) + " done. Loss: " + str(loss_total))
#             torch.save(model, 'saved_model_ft.pkl')
    elapsed = time.time() - start
    print(f"Model trained for {n_epochs} in {elapsed/60: .2f} minutes")

    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss_values': loss_values,
            }, f'FastText_te_{n_epochs}_epochs.pt')

    return model

In [40]:
# Release memory, then train a freshly initialised model for NUM_EPOCHS epochs.
gc.collect()
torch.cuda.empty_cache()
model = FastTextTorch(embedding_size=100, in_vocab_size=vocab_size,out_vocab_size=vocab_w_size, char_ngrams_dict=char_ngrams_dict, num_neg_samples=5)
model.to(device)
# train() returns the model it was given, and also writes a checkpoint file.
model = train(model, n_epochs=NUM_EPOCHS)

In [41]:
# Total training loss per epoch, as collected by train().
plt.plot(loss_values)

In [42]:
# Bring the trained model back to the CPU and switch it to evaluation mode.
model.to('cpu')
model.eval()
# Frozen EmbeddingBag holding the trained n-gram vectors, used for similarity
# queries below. Neither this layer nor the model's input layer is given a
# `mode` argument, so both pool with the same default.
em = nn.EmbeddingBag.from_pretrained(model.input_embedding.weight, freeze=True)


In [43]:
def cosineSimilarity(w1, w2):
    """Return the cosine of the angle between the vectors ``w1`` and ``w2``."""
    norm_product = np.linalg.norm(w1) * np.linalg.norm(w2)
    return np.dot(w1, w2) / norm_product

In [44]:
def findSimilar(word, em, k=5):
    """Return the ``k`` vocabulary words most cosine-similar to ``word``.

    All word vectors are computed with a single ``em`` call (one bag of
    character n-grams per word) instead of one call per word, and the query
    word is excluded explicitly rather than assumed to rank first.

    Relies on the notebook globals ``word_to_idx``, ``indx_to_word`` and
    ``get_inp_and_offsets``.

    Args:
        word: query word; must be a key of ``word_to_idx``.
        em: ``nn.EmbeddingBag`` holding the n-gram vectors.
        k: number of neighbours to return (capped at vocabulary size - 1).

    Returns:
        A list of ``(word, similarity)`` tuples, most similar first.

    Raises:
        KeyError: if ``word`` is not in the vocabulary.
    """
    query_idx = word_to_idx[word]
    inp, offsets = get_inp_and_offsets(range(len(indx_to_word)))
    target = em.weight.device

    with torch.no_grad():
        vectors = em(inp.to(target), offsets.to(target))
        # clamp so an all-zero vector scores 0 instead of producing NaN
        norms = vectors.norm(dim=1).clamp_min(1e-12)
        sim = (vectors @ vectors[query_idx]) / (norms * norms[query_idx])
        # never report the query word as its own neighbour
        sim[query_idx] = float('-inf')
        val, ind = torch.topk(sim, min(k, sim.numel() - 1))

    return [(indx_to_word[i], v) for i, v in zip(ind.tolist(), val.tolist())]

In [45]:
# Show the 10 vocabulary words most similar to the query word.
findSimilar('దక్షిణ', em, 10)

In [47]:
# Show the 10 vocabulary words most similar to the query word.
findSimilar('పంటలు', em, 10)

In [48]:
# Show the 10 vocabulary words most similar to the query word.
findSimilar('ఎనిమిది', em, 10)

In [49]:
# Show the 10 vocabulary words most similar to the query word.
findSimilar('తక్కువ', em, 10)

In [50]:
# Show the 10 vocabulary words most similar to the query word.
findSimilar('గ్రామం', em, 10)