In [None]:
pip install -q bpemb  # byte pair encoding module

In [None]:
pip install -q torchsummary

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch 
import torch.nn as nn
from bpemb import BPEmb
from torchsummary import summary
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
# Load the Hindi-English parallel corpus; the CSV is read from the current working directory.
data = pd.read_csv('hindi_english_parallel.csv')
data.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [3]:
# Remove rows with a missing sentence and exact duplicate rows, then report what is left.
data = data.dropna().drop_duplicates()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1353912 entries, 0 to 1561839
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   hindi    1353912 non-null  object
 1   english  1353912 non-null  object
dtypes: object(2)
memory usage: 31.0+ MB


In [4]:
# Whitespace-separated word counts for both sides of every sentence pair.
data['english_len'] = [len(sentence.split()) for sentence in data['english']]
data['hindi_len'] = [len(sentence.split()) for sentence in data['hindi']]
data.head()

Unnamed: 0,hindi,english,english_len,hindi_len
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout,6,8
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer,3,3
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel,8,7
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel,8,7
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default,9,12


In [5]:
# Keep only pairs whose English and Hindi sides both have between 5 and 15 words (inclusive).
english_ok = data.english_len.between(5, 15)
hindi_ok = data.hindi_len.between(5, 15)
data = data[english_ok & hindi_ok]
data.head()

Unnamed: 0,hindi,english,english_len,hindi_len
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout,6,8
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel,8,7
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel,8,7
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default,9,12
6,पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्स...,The duration of the highlight box when selecti...,10,10


In [6]:
# Work on a reproducible 25k-pair subsample and hold out 10% of it for testing.
data = data.sample(n=25000, random_state=0)
train_split, test_split = train_test_split(data, test_size=0.1, random_state=0)
# Renumber both splits from 0 and discard the old row labels.
train_split = train_split.reset_index(drop=True)
test_split = test_split.reset_index(drop=True)

In [7]:
# Sanity check: sizes of the two splits and a peek at the first training rows.
print(len(train_split), len(test_split))
train_split.head()

22500 2500


Unnamed: 0,hindi,english,english_len,hindi_len
0,एक खूब छोटा अंश जिस में तत्वों के गुण होते है।,A very small component acquiring a quality of ...,9,11
1,कोटा जानकारी समर्थित नहीं फ़ोल्डर '% s' के लिए,No IMAP mailbox available for folder '% s',8,9
2,परन्तु यह शीघ्र ही अपर्याप्त प्रतीत हुआ...।,But soon this was found rather inadequate.,7,7
3,उसने गिरफ्तार व्यक्ति के लिए प्रतिभू की भूमिका...,He acted as ad - promisor for the arrested per...,10,9
4,अल्लाह से क्षमा की प्रार्थना करो। निस्संदेह अल...,Ask God for forgiveness: He is most forgiving ...,10,12


# Tokenization approach
Common approaches to subword tokenization are Byte-Pair Encoding (BPE, originally a data compression algorithm), 
WordPiece (used by BERT), and SentencePiece.


In [8]:
# this is basically the module by which we're gonna perform tokenization according to the Byte-Level Byte Pair Encoding
# Subword tokenizers from BPEmb (Byte-Pair Encoding), one per language, created with the library defaults.
# Only the tokenizers are used in this notebook: the model below learns its own nn.Embedding tables.
bpemb_en = BPEmb(lang='en')
bpemb_hi = BPEmb(lang='hi')

# Dataset and Dataloader

In [9]:
class CustomDataset(torch.utils.data.Dataset):
    """Parallel English -> Hindi dataset that yields fixed-length id tensors.

    Each item is tokenized with the module-level ``bpemb_en`` / ``bpemb_hi``
    tokenizers (BOS/EOS added), truncated if necessary and right-padded with
    id 0 so that every returned tensor has exactly ``max_seq_len`` entries.

    Args:
        data: DataFrame with ``english`` and ``hindi`` text columns.
        max_seq_len: length every returned tensor is padded/truncated to.
    """

    def __init__(self, data, max_seq_len=64):
        super(CustomDataset, self).__init__()
        self.data = data
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.data)

    def _pad(self, ids):
        """Right-pad ``ids`` with zeros up to ``max_seq_len``."""
        return ids + [0] * (self.max_seq_len - len(ids))

    def __getitem__(self, index):
        """Return ``(src, trg_input, trg_output, src_mask, trg_mask)`` for one pair.

        ``trg_input`` is the Hindi sequence without its last token and
        ``trg_output`` the same sequence shifted left by one (teacher forcing).
        The masks hold 1 for real tokens and 0 for padding.
        """
        eng_sen = self.data.english.iloc[index]
        hin_sen = self.data.hindi.iloc[index]

        # Truncate over-long sequences: without this the padding count goes
        # negative, the tensors come out longer than max_seq_len and the
        # DataLoader can no longer stack them into a batch.
        eng_tokens = bpemb_en.encode_ids_with_bos_eos(eng_sen)[:self.max_seq_len]
        # One extra token is kept on the target side because the input/output
        # shift below consumes one position.
        hin_tokens = bpemb_hi.encode_ids_with_bos_eos(hin_sen)[:self.max_seq_len + 1]

        trg_input_tokens = hin_tokens[:-1]
        trg_output_tokens = hin_tokens[1:]

        eng_mask = [1] * len(eng_tokens)
        hin_mask = [1] * len(trg_input_tokens)

        return (
            torch.tensor(self._pad(eng_tokens)),
            torch.tensor(self._pad(trg_input_tokens)),
            torch.tensor(self._pad(trg_output_tokens)),
            torch.tensor(self._pad(eng_mask)),
            torch.tensor(self._pad(hin_mask)),
        )

# Encoder
The solution is to throw away all recurrence and base EACH encoder OUTPUT on ALL the ENCODER INPUTS, i.e. to use 
self-attention.
For example, let there be n inputs, let vocab_size be 10 and embedding_dim be m. First we compute the attention weights for
position i, a_i = softmax([<x_i,x_1>, <x_i,x_2>,..., <x_i,x_n>]); the new vector for that position is x_i' = sum_j(a_ij*x_j), so we
again obtain n vectors, each of dimension m.
With this mechanism we avoid recurrence and get all the attention-based outputs in a single pass, 
just by multiplying matrices: softmax(X@X.T, axis=-1)@X (where the input X is of shape (n, embedding_dim) and n is the number of
time steps, i.e. seq_len). (Note that this is for one sentence only.) 
Because the output has the same shape as the input, attention layers can be stacked. 

But the problem with the above is that there are no learnable weights, so we have to rely only on the quality of the
embeddings. We therefore introduce weight matrices, which gives scaled dot-product self-attention.


Attention(Q, K, V) = softmax(Q@K.T/sqrt(d_k))@V, where Q = X@W_q, K = X@W_k, V = X@W_v. Each of W_q, W_k, W_v is of shape 
(d x d) and X is of shape (t x d). This is known as single-head attention. Dividing by sqrt(d_k) keeps the dot products from 
growing with the dimension, which would otherwise saturate the softmax and leave it with very small gradients. 

There is still a problem with the above: a single self-attention tends to concentrate its weight on one kind of 
relationship, but a single statement can carry several pieces of information. For example, 'I went to a restaurant to meet my 
friend that night' answers the questions 'why, where, who, when'. We therefore need several attentions so that the 
mechanism can take care of all such queries; the solution is multi-headed attention. In this method each head has its own
weights W_q, W_k, W_v, but the catch is that each of these matrices
is of shape (d x d/h), so the computational cost stays the same as for single-headed attention. The outputs of 
all the heads are then concatenated and passed through a feed-forward (linear) layer. 

In all this mechanism, we got the information on how the words are dependent on one another, but we lost the positional 
information, so here comes the positional encoding for rescue. We will add the positional embedding layer (which can be treated
as learnable weights) and it is of the same shape as our inputs.

At this point there is still no non-linearity in the system, so we insert a feed-forward network with a non-linear activation
function after the multi-head self-attention. This multi-head attention + feed-forward combination is referred to as an encoder block, and 
we can stack these blocks.

But stacking brings another problem, i.e., the positional informations gets lost over if multiple encoder blocks are used and 
this also leads to vanishing gradient problem.


And the solutions to both these problems are LayerNormalization in between feedforward networks and skip-connections between 
the blocks (same as resnet).


# Decoder
In the decoder we can apply the same process, but there is an issue: in the encoder we have the whole input sentence at once,
whereas while decoding we only have the previous time steps, so we have to somehow zero out the attention weights given to 
the subsequent words in the sentence. Thus we have to mask them; this is known as masked multi-head self-attention.

# Shapes of inputs and outputs of different layers
m - number of examples or batch_size if batched inputs

t - sequence length, i.e. the number of time steps

d - model shape, or embedding dimension shape 

h - number of heads

Then, 

1. input shape - (m, t, d)
2. W_q, W_k, W_v - (d, d/h)
3. Q, K, V shapes - XW_q.shape, i.e. (m, t, d/h)
4. Shape of K.T will be (m, d/h, t)
5. Output from a single head - softmax(Q@K.T/sqrt(d_k))@V, where d_k = d/h, will be (m, t, d/h); the attention weights softmax(Q@K.T/sqrt(d_k)) themselves are (m, t, t)


In [10]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """Scaled dot-product attention over the last two dimensions.

    Args:
        query, key, value: tensors whose last two dims are (time, head_dim);
            the caller in this file passes (b, h, t, d/h).
        mask: optional tensor broadcastable to the (..., t_q, t_k) score
            tensor; positions where ``mask == 0`` receive zero attention.

    Returns:
        Tensor with the leading dims of ``query`` and the last dim of ``value``.
    """
    head_dim = query.shape[-1]
    # similarity of every query position with every key position -> (..., t_q, t_k)
    scores = (query @ key.transpose(-2, -1)) / np.sqrt(head_dim)

    if mask is not None:
        # -inf logits turn into exactly zero weight after the softmax
        scores = torch.where(mask == 0, -np.inf, scores)

    attn = torch.softmax(scores, dim=-1)
    return attn @ value

# MODEL

In [15]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head attention layer (used for self- and cross-attention in this file).

    Queries, keys and values are projected with bias-free linear maps, split
    into ``num_heads`` heads of size ``d_model // num_heads``, attended with
    ``scaled_dot_product_attention``, concatenated again and mixed by a final
    linear layer.

    Args:
        d_model: model / embedding dimension.
        num_heads: number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: if ``num_heads`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        # Fail early with a clear message; otherwise the head split in
        # forward() dies with an opaque reshape error.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads

        # plain projection matrices (no bias) applied to inputs of shape (b, t, d)
        self.wq = nn.Linear(in_features = self.d_model, out_features = self.d_model, bias = False)
        self.wk = nn.Linear(in_features = self.d_model, out_features = self.d_model, bias = False)
        self.wv = nn.Linear(in_features = self.d_model, out_features = self.d_model, bias = False)

        # output projection that mixes the concatenated heads (linear, no activation)
        self.fc = nn.Linear(self.d_model, self.d_model)

    def _split_heads(self, x):
        """Reshape (b, t, d) into (b, h, t, d/h)."""
        batch, steps = x.shape[0], x.shape[1]
        head_dim = self.d_model // self.num_heads
        return x.reshape(batch, steps, self.num_heads, head_dim).permute(0, 2, 1, 3)

    def forward(self, q, k, v, mask=None):
        """Attend ``q`` over ``k``/``v``.

        Args:
            q, k, v: tensors of shape (b, t, d); ``k`` and ``v`` share their length.
            mask: optional mask forwarded to ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape (b, t_q, d).
        """
        q = self._split_heads(self.wq(q))
        k = self._split_heads(self.wk(k))
        v = self._split_heads(self.wv(v))

        op = scaled_dot_product_attention(q, k, v, mask)  # (b, h, t, d/h)
        op = op.permute(0, 2, 1, 3)                       # (b, t, h, d/h)
        op = op.reshape(op.shape[0], op.shape[1], self.d_model)  # heads concatenated -> (b, t, d)
        return self.fc(op)

In [16]:
class feedforward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear.

    Maps the last dimension d_model -> hidden_dim -> d_model, so the output
    has the same shape as the input.
    """

    def __init__(self, d_model, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(d_model, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, d_model)

    def forward(self, x):
        expanded = self.fc1(x)
        activated = self.relu(expanded)
        return self.fc2(activated)

In [17]:
class encoder_block(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection) and is then layer-normalized.
    Input and output are both of shape (b, t, d).
    """

    def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
        super().__init__()
        self.mhsa = MultiHeadSelfAttention(d_model, num_heads)
        self.fc = feedforward(d_model, hidden_dim)  # keeps the (b, t, d) shape

        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.dropout2 = nn.Dropout(p=dropout_rate)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        # self-attention sub-layer: queries, keys and values all come from x
        attended = self.dropout1(self.mhsa(x, x, x, mask))
        normed = self.layernorm1(attended + x)

        # feed-forward sub-layer with its own residual connection
        transformed = self.dropout2(self.fc(normed))
        return self.layernorm2(normed + transformed)
        
        

In [18]:
class encoder_transformer(nn.Module):
    """Encoder stack: token + learned positional embeddings followed by encoder blocks.

    Args:
        num_blocks: number of stacked ``encoder_block`` layers.
        d_model: embedding / model dimension.
        num_heads: attention heads per block.
        hidden_dim: width of the feed-forward layer inside each block.
        src_vocab_size: size of the source token embedding table.
        max_seq_len: number of positions in the positional embedding table,
            i.e. the longest sequence (t) the encoder accepts.
        dropout_rate: dropout applied to the embeddings and inside the blocks.
    """

    def __init__(self, num_blocks, d_model, num_heads, hidden_dim, src_vocab_size, max_seq_len, dropout_rate=0.1):
        super(encoder_transformer, self).__init__()
        self.num_blocks = num_blocks
        self.d_model = d_model
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.vocab_size = src_vocab_size
        self.max_seq_len = max_seq_len

        self.token_embeds = nn.Embedding(src_vocab_size, d_model)
        self.pos_embeds = nn.Embedding(max_seq_len, d_model)
        self.dropout = nn.Dropout(p = dropout_rate)
        self.blocks = nn.ModuleList([encoder_block(d_model, num_heads, hidden_dim, dropout_rate=dropout_rate)
                      for _ in range(self.num_blocks)])

    def forward(self, source, mask=None):
        """Encode a batch of token ids.

        Args:
            source: (b, t) token ids with t <= max_seq_len; any numeric dtype,
                it is cast to int64 and moved to the module's device.
            mask: optional attention mask passed unchanged to every block.

        Returns:
            Tensor of shape (b, t, d).

        Raises:
            ValueError: if t exceeds ``max_seq_len``.
        """
        # Follow the device the weights live on instead of a notebook-level global.
        module_device = self.token_embeds.weight.device
        source = source.to(device=module_device, dtype=torch.long)

        seq_len = source.shape[1]
        if seq_len > self.max_seq_len:
            raise ValueError(f"sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}")

        t_embeds = self.token_embeds(source)  # (b, t, d)
        # The batch size was previously read from an undefined name `x`; positions
        # are identical for every example, so a (t, d) table simply broadcasts.
        pos_ids = torch.arange(seq_len, device=module_device)
        p_embeds = self.pos_embeds(pos_ids)  # (t, d)

        op = self.dropout(t_embeds + p_embeds)  # (b, t, d)

        for block in self.blocks:
            op = block(op, mask)

        return op
            

# Decoder

In [19]:
class decoder_block(nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention, feed-forward.

    Every sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection) and is then layer-normalized.
    """

    def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
        super().__init__()
        self.mhsa1 = MultiHeadSelfAttention(d_model, num_heads)  # attention over the target
        self.mhsa2 = MultiHeadSelfAttention(d_model, num_heads)  # attention over the encoder output

        self.fc = feedforward(d_model, hidden_dim)

        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.dropout2 = nn.Dropout(p=dropout_rate)
        self.dropout3 = nn.Dropout(p=dropout_rate)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.layernorm3 = nn.LayerNorm(d_model)

    def forward(self, encoder_ouptut, target, decoder_mask=None, memory_mask=None):
        # 1) self-attention over the target sequence, restricted by decoder_mask
        self_attended = self.dropout1(self.mhsa1(target, target, target, decoder_mask))
        hidden = self.layernorm1(self_attended + target)

        # 2) queries come from the decoder, keys/values from the encoder output
        cross_attended = self.dropout2(
            self.mhsa2(hidden, encoder_ouptut, encoder_ouptut, memory_mask)
        )
        hidden = self.layernorm2(hidden + cross_attended)

        # 3) position-wise feed-forward
        transformed = self.dropout3(self.fc(hidden))
        return self.layernorm3(transformed + hidden)

In [20]:
class decoder_transformer(nn.Module):
    """Decoder stack: token + learned positional embeddings followed by decoder blocks.

    Args:
        num_blocks: number of stacked ``decoder_block`` layers.
        d_model: embedding / model dimension.
        num_heads: attention heads per block.
        hidden_dim: width of the feed-forward layer inside each block.
        trg_vocab_size: size of the target token embedding table.
        max_seq_len: number of positions in the positional embedding table,
            i.e. the longest target sequence (t) the decoder accepts.
        dropout_rate: dropout applied to the embeddings and inside the blocks.
    """

    def __init__(self, num_blocks, d_model, num_heads, hidden_dim, trg_vocab_size, max_seq_len, dropout_rate=0.1):
        super(decoder_transformer, self).__init__()
        self.token_embeds = nn.Embedding(trg_vocab_size, d_model)
        self.pos_embeds = nn.Embedding(max_seq_len, d_model)
        self.max_seq_len = max_seq_len
        self.dropout = nn.Dropout(p=dropout_rate)

        self.blocks = nn.ModuleList([decoder_block(d_model, num_heads, hidden_dim, dropout_rate)
                                    for _ in range(num_blocks)])

    def forward(self, encoder_output, target, decoder_mask=None, memory_mask=None):
        """Decode a batch of target ids against the encoder output.

        Args:
            encoder_output: encoder states handed to every block.
            target: (b, t) token ids with t <= max_seq_len; any numeric dtype,
                it is cast to int64 and moved to the module's device.
            decoder_mask: optional mask for the target self-attention.
            memory_mask: optional mask for the attention over ``encoder_output``.

        Returns:
            Tensor of shape (b, t, d).

        Raises:
            ValueError: if t exceeds ``max_seq_len``.
        """
        # Follow the device the weights live on instead of a notebook-level global.
        module_device = self.token_embeds.weight.device
        target = target.to(device=module_device, dtype=torch.long)

        seq_len = target.shape[1]
        if seq_len > self.max_seq_len:
            raise ValueError(f"sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}")

        t_embeds = self.token_embeds(target)  # (b, t, d)
        # The batch size was previously read from an undefined name `x`; positions
        # are identical for every example, so a (t, d) table simply broadcasts.
        pos_ids = torch.arange(seq_len, device=module_device)
        p_embeds = self.pos_embeds(pos_ids)  # (t, d)

        op = self.dropout(t_embeds + p_embeds)  # (b, t, d)

        for block in self.blocks:
            op = block(encoder_output, op, decoder_mask, memory_mask)

        return op

In [21]:
class transformer(nn.Module):
    """Encoder-decoder transformer producing per-position target-vocabulary logits.

    Args:
        num_blocks: number of blocks in the encoder and in the decoder.
        d_model: embedding / model dimension.
        num_heads: attention heads per block.
        hidden_dim: width of the feed-forward layers.
        src_vocab_size: size of the source (encoder) embedding table.
        trg_vocab_size: size of the target (decoder) embedding table and of
            the output layer.
        max_seq_len: longest sequence supported by either stack.
        dropout_rate: dropout used throughout the encoder and decoder.
    """

    def __init__(self, num_blocks, d_model, num_heads, hidden_dim, src_vocab_size, trg_vocab_size, max_seq_len,
                dropout_rate=0.1):
        super(transformer, self).__init__()
        # dropout_rate is now forwarded; it used to be hard-coded to 0.1 here.
        self.encoder = encoder_transformer(num_blocks, d_model, num_heads, hidden_dim, src_vocab_size, max_seq_len,
                            dropout_rate=dropout_rate)
        # The decoder embeds *target* tokens, so it needs the target vocabulary
        # size (the source size was passed here before).
        self.decoder = decoder_transformer(num_blocks, d_model, num_heads, hidden_dim, trg_vocab_size, max_seq_len,
                            dropout_rate=dropout_rate)

        self.fc = nn.Linear(d_model, trg_vocab_size)
        # Lower-triangular look-ahead mask. Registered as a non-persistent buffer so
        # it follows .to()/.cuda() with the module while staying out of state_dict
        # (checkpoints saved without it still load).
        self.register_buffer('lookahead', torch.tril(torch.ones((max_seq_len, max_seq_len))), persistent=False)

    def forward(self, src, trg, src_pad_mask=None, trg_pad_mask=None):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: (b, t_src) source token ids.
            trg: (b, t_trg) decoder input token ids.
            src_pad_mask: optional (b, t_src) mask, 1 for real tokens and 0 for padding.
            trg_pad_mask: optional (b, t_trg) mask, 1 for real tokens and 0 for padding.

        Returns:
            Tensor of shape (b * t_trg, trg_vocab_size).
        """
        mask_device = self.lookahead.device

        if src_pad_mask is not None:
            # (b, t) -> (b, 1, 1, t) so it broadcasts over heads and query positions
            src_pad_mask = src_pad_mask.to(mask_device).unsqueeze(1).unsqueeze(1)

        # The decoder self-attention is always causal, also when no padding mask
        # is supplied (previously no mask at all was applied in that case).
        trg_len = trg.shape[1]
        dec_mask = self.lookahead[:trg_len, :trg_len]
        if trg_pad_mask is not None:
            trg_pad_mask = trg_pad_mask.to(mask_device).unsqueeze(1).unsqueeze(1)
            # a key is visible only if it is both a real token and not in the future
            dec_mask = torch.minimum(trg_pad_mask, dec_mask)  # (b, 1, t, t)

        enc_op = self.encoder(src, src_pad_mask)
        op = self.decoder(enc_op, trg, dec_mask, src_pad_mask)  # (b, t, d)
        op = op.reshape(-1, op.shape[-1])  # (b*t, d)
        return self.fc(op)  # (b*t, trg_vocab_size)

# UTILS

In [22]:
def BLEU():
    """Placeholder for a BLEU metric; not implemented yet, so it returns None."""
    return None

In [23]:
def METEOR():
    """Placeholder for a METEOR metric; not implemented yet, so it returns None."""
    return None

In [24]:
def save_checkpoint(state, filename='my_checkpoint(1).pth'):
    """Announce the save on stdout and serialize ``state`` to ``filename`` with ``torch.save``.

    The training loop in this notebook passes a dict holding the model and
    optimizer state dicts and calls this once per epoch.
    """
    print("-> Saving CheckPoint")
    torch.save(obj=state, f=filename)

In [25]:
def load_checkpoint(checkpoint, model):
    """Announce the load on stdout and copy ``checkpoint["state_dict"]`` into ``model``.

    Only the weights are restored; what happens next (further training,
    inference, ...) is up to the caller.
    """
    print("-> Loading CheckPoint")
    weights = checkpoint["state_dict"]
    model.load_state_dict(weights)

In [26]:
def train(loader, model, optimizer, scaler, scheduler, loss_fn, epoch, device=device):
    '''
    Run one training epoch and return the mean per-batch loss.

    loader    : yields batches indexable as (src, trg_inp, trg_op, src_pad_mask, trg_pad_mask)
    scaler    : gradient scaler used together with autocast for mixed precision
    scheduler : accepted for interface compatibility; it is not stepped here
    loss_fn   : called with the model output and the flattened targets; per the
                original note it should ignore the padding index (ignore_index=0)
    epoch     : only shown in the progress bar
    '''
    model.train()
    num_batches = len(loader)
    progress = tqdm(loader)  # progress bar over the batches
    running_loss = 0

    for batch in progress:
        # every field is (batch_size, max_len)
        src, trg_inp, trg_op, src_pad_mask, trg_pad_mask = (batch[i].to(device) for i in range(5))

        # clear the gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass under automatic mixed precision
        with torch.cuda.amp.autocast():
            op = model(src, trg_inp, src_pad_mask, trg_pad_mask)  # one row of logits per target position
            n_rows, n_cols = trg_op.shape[0], trg_op.shape[1]
            trg_op = trg_op.reshape(n_rows * n_cols)  # flatten targets to line up with op
            loss = loss_fn(op, trg_op)

        # scaled backward pass and optimizer step
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        batch_loss = loss.item()
        progress.set_postfix(loss = batch_loss, epoch=epoch)  # loss of the current batch
        running_loss += batch_loss

    #scheduler.step()
    return running_loss / num_batches

In [27]:
def test():
    """Placeholder for evaluating the model with metrics such as METEOR and BLEU.

    Not implemented yet, so it returns None.
    """
    return None

In [28]:
def translate_sentence(sentence, model, max_seq_len=64, trans_len=50, beam_search = None):
    """Translate one English sentence token by token.

    The sentence is tokenized with ``bpemb_en``, padded (or truncated) to
    ``max_seq_len`` and run through ``model`` repeatedly; after each pass the
    token predicted for the next free target position is appended to the
    decoder input, until <eos> (id 2) is produced or the length limit is hit.

    Args:
        sentence: source text.
        model: callable as ``model(src, trg, src_mask, trg_mask)`` returning
            scores of shape (max_seq_len, trg_vocab_size) for a batch of one.
        max_seq_len: fixed length of the source and target buffers.
        trans_len: maximum number of tokens to generate; capped at
            ``max_seq_len - 1`` so the target buffer is never overrun.
        beam_search: optional callable taking the score matrix and returning an
            index tensor whose first row is the best token sequence; when it is
            falsy, the arg-max token of every position is used instead.

    Returns:
        Whatever ``bpemb_hi.decode`` returns for the generated ids (any
        terminating <eos> id is included, as before).
    """
    run_device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    # Source side: ids and padding mask, both of shape (1, max_seq_len).
    # Over-long inputs are truncated so the buffers keep their fixed size.
    src_ids = bpemb_en.encode_ids_with_bos_eos(sentence)[:max_seq_len]
    pad = max_seq_len - len(src_ids)
    inp = torch.tensor(src_ids + [0] * pad).unsqueeze(0).to(run_device)
    enc_mask = torch.tensor([1] * len(src_ids) + [0] * pad).unsqueeze(0).to(run_device)

    # Target side starts with only <sos> (id 1) visible.
    trg_inp = torch.zeros(max_seq_len).unsqueeze(0).to(run_device)
    trg_mask = torch.zeros(max_seq_len).unsqueeze(0).to(run_device)
    trg_inp[0, 0] = 1
    trg_mask[0, 0] = 1

    # Position pos + 1 is written after every step, so at most
    # max_seq_len - 1 tokens fit into the target buffer.
    max_new_tokens = min(trans_len, max_seq_len - 1)

    translation = []
    last_token = 1
    try:
        with torch.no_grad():  # inference only: no autograd graph needed
            while len(translation) < max_new_tokens and last_token != 2:  # 2 means <eos>
                pos = len(translation)
                op = model(inp, trg_inp, enc_mask, trg_mask)  # (max_seq_len, trg_vocab_size)

                if beam_search:
                    step_ids = beam_search(op)[0]  # best beam, one id per position
                else:
                    # Greedy: argmax already yields a 1-D tensor of ids, so it is
                    # indexed by position directly (indexing [0] first raised IndexError).
                    step_ids = op.argmax(dim=1)

                last_token = step_ids[pos].item()
                translation.append(last_token)
                trg_inp[0, pos + 1] = last_token  # feed the new token back in
                trg_mask[0, pos + 1] = 1
    finally:
        # Restore whichever mode the caller had instead of forcing train mode.
        model.train(was_training)

    return bpemb_hi.decode(translation)


In [44]:
from torch import tensor
def beam_search(prediction, k=5):
    """Beam search over a matrix of per-position token scores.

    Every position is scored independently: the score of a sequence is the sum
    of the entries picked at each position, so the scores are expected to be
    additive (e.g. log-probabilities).

    Args:
        prediction: tensor of shape (seq_length, vocab_size).
        k: beam width; capped at ``vocab_size``.

    Returns:
        int64 tensor of shape (k, seq_length) holding the k best token
        sequences, best first.
    """
    prediction = prediction.unsqueeze(0)  # add a batch dimension of size 1
    batch_size, seq_length, vocab_size = prediction.shape
    k = min(k, vocab_size)  # topk cannot return more entries than exist

    # initialise the beams with the k best tokens of the first position
    log_prob, indices = prediction[:, 0, :].topk(k, sorted=True)
    indices = indices.unsqueeze(-1)  # (batch, k, 1)

    for step in range(1, seq_length):
        # score of every (beam, next token) combination: (batch, k, vocab)
        candidates = log_prob.unsqueeze(-1) + prediction[:, step, :].unsqueeze(1)
        log_prob, flat_idx = candidates.reshape(batch_size, -1).topk(k, sorted=True)

        parent = torch.div(flat_idx, vocab_size, rounding_mode='floor')  # beam each winner extends
        token = flat_idx % vocab_size                                    # token appended to it

        # Pick the surviving prefixes and append the new tokens. torch ops keep
        # the result on the input's device, so no notebook-level `device` is needed.
        prefixes = indices.gather(1, parent.unsqueeze(-1).expand(-1, -1, step))
        indices = torch.cat([prefixes, token.unsqueeze(-1)], dim=-1)

    return indices[0]

# DRIVER CODE

In [30]:
# training hyperparameters
# training hyperparameters
num_epochs = 20
lr = 3e-4
batch_size = 64

# one dataset + shuffling loader per split; every example is padded to 64 token ids
train_dataset, test_dataset = (
    CustomDataset(split, max_seq_len=64) for split in (train_split, test_split)
)
train_loader, test_loader = (
    torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    for dataset in (train_dataset, test_dataset)
)

In [31]:
# model hyperparameters
num_blocks = 3  # used for both the encoder stack and the decoder stack
d_model = 512
num_heads = 8
hidden_dim = 4*d_model  # width of the feed-forward layer inside each block
src_vocab_size = bpemb_en.vocab_size + 1 # +1 due to padding token
trg_vocab_size = bpemb_hi.vocab_size + 1 # +1 due to padding token
# NOTE(review): CustomDataset pads with id 0 and the loss ignores index 0; presumably id 0 is
# also one of the tokenizer's own ids, in which case the extra "+1" row is never used - confirm.
max_seq_len = 64

# build the model and print a layer-by-layer summary as a quick sanity check
model = transformer(num_blocks, d_model, num_heads, hidden_dim, src_vocab_size, 
                              trg_vocab_size, max_seq_len).to(device)
summary(model, [(max_seq_len, ), (max_seq_len ,)])

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         Embedding-1              [-1, 64, 512]       5,120,512
         Embedding-2              [-1, 64, 512]          32,768
           Dropout-3              [-1, 64, 512]               0
            Linear-4              [-1, 64, 512]         262,144
            Linear-5              [-1, 64, 512]         262,144
            Linear-6              [-1, 64, 512]         262,144
            Linear-7              [-1, 64, 512]         262,656
MultiHeadSelfAttention-8              [-1, 64, 512]               0
           Dropout-9              [-1, 64, 512]               0
        LayerNorm-10              [-1, 64, 512]           1,024
           Linear-11             [-1, 64, 2048]       1,050,624
             ReLU-12             [-1, 64, 2048]               0
           Linear-13              [-1, 64, 512]       1,049,088
      feedforward-14              [

In [32]:
# let us test our model on some actual input to check it doesn't break
# take the first example of the first batch and keep a batch dimension of 1
batch = next(iter(train_loader))
src, trg_inp, trg_op, src_pad_mask, trg_pad_mask = (
    field[0].unsqueeze(0).to(device) for field in batch
)

#print(f'{src}\n\n{src_pad_mask}\n\n{trg}\n\n{trg_pad_mask}')
# run the (still untrained) model once and take the most probable token per position
op = model(src, trg_inp, src_pad_mask, trg_pad_mask)
op = torch.softmax(op, dim=-1)
out = torch.max(op, dim=-1).indices
print(f'Original English Sentence {bpemb_en.decode(src.tolist())}\n\nOriginal Hindi Sentence {bpemb_hi.decode(trg_inp.tolist())}\n\nPredicted Sentence {bpemb_hi.decode(out.tolist())}')
# ?? are because of the padding tokens we can easily remove them when needed

Original English Sentence ['it is if we have handed the keys to the vault over to burglars. ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ ']

Original Hindi Sentence ['यह ऐसा है जैसे हमने खजाने की चाबी चोरों को दे दी हो। ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ ']

Predicted Sentence ['स्पेनिश', 'रि', 'ंखला', 'पुष्प', 'संग', '्मण', 'ण्ड', 'अञ्चल', 'नात', 'ष्णा', 'उपयोग', 'कोयला', 'hys', 'अभिक्रिया', 'भर', 'णी', 'ca', 'ोर्ट', 'यूक', 'शियम', '0000–00', 'गुण', 'जैविक', 'रियल', 'अवत', 'भ्रूण', 'cr', 'क्षम', 'वनडे', 'देखने', 'आश', 'ट्ट', 'ent', 'बीन', 'जन्मे', 'जैविक', '२०१५', 'क्षम', 'mat', 'प्रयास', 'प्रेस', 'eric', 'प्रयास', 'पृष्ठभूमि', 'शास', 'भ्रूण', 'city', 'मंच', 'राहुल', 'अवत', 'केल', 'apse', 'अंग्रेज़ी', 'इत', 'अंग्रेज़ी', 'ण', 'ry', 'प्रयास', 'एल', 'सफल', 'कथित', '२०००', 'हिन्दु', 'जा']


In [None]:
# exampel of translated sentence
# example of a translated sentence (greedy decoding, since no beam_search callable is passed)
text = 'the aluminium corporation of india, came into existence after the war.'
print(translate_sentence(text, model, 64))

In [33]:
# setups
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # targets equal to the padding id 0 do not contribute to the loss
scaler = torch.cuda.amp.GradScaler()  # gradient scaler for the mixed-precision training step
# NOTE: the scheduler is created and passed to train(), but train() keeps scheduler.step() commented out.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
losses = []  # mean training loss per epoch
# for inference
# sentences = ['another plant, the aluminium corporation of india, came into existence after the war.', 
#              'He is doing very good these days', 'this guy is totally mad', 'what were you saying that day?']
# fixed sample sentences translated after every epoch to eyeball progress
sentences = ["So what happened on this day?", 
             'Allow all sites to track my physical location', 
             'India is a democratic country', 
             "In 1898, Sarojini Naidu became the life - partner of Dr. Govindarajulu Naidu.", 
             'Is this a good time?',]

In [None]:
#load saved model pickle
# Restore model weights from a saved checkpoint.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine also loads on a CPU-only one.
# NOTE(review): save_checkpoint's default file name is 'my_checkpoint(1).pth' (no space) - confirm
# that the name below points at the intended file.
model.load_state_dict(torch.load("my_checkpoint (1).pth", map_location=device)['state_dict'])
# model.eval()

In [34]:
# taking the test sentences for checking how good the model is trained
def infer(sentences, model, max_seq_len, beam_search = None):
    """Print every sentence followed by its translation, numbered from 1.

    The pieces returned by ``translate_sentence`` are joined with single
    spaces; ``beam_search`` is passed through to it unchanged.
    """
    for number, sentence in enumerate(sentences, start=1):
        pieces = translate_sentence(sentence, model, max_seq_len, beam_search = beam_search)
        rendered = ' '.join(pieces)
        print(f"Example {number}:\n{sentence}\n{rendered}\n\n")

In [43]:
# translate the sample sentences, passing the beam_search function as the decoding strategy
infer(sentences, model, max_seq_len, beam_search)

Example 1:
So what happened on this day?
तो इस दिन में कोई पड़ा तो ? 


Example 2:
Allow all sites to track my physical location
सभी साइट ों को स्वचालित ट्रैक बां ट ें 


Example 3:
India is a democratic country
भारत एक लोक तांत्रिक देश है 


Example 4:
In 1898, Sarojini Naidu became the life - partner of Dr. Govindarajulu Naidu.
0000 में सर ोज िनी नाय डू डा . गोवि ंद राज ुल ू नाय डू की जीवन - संग िनी बनी ं । 


Example 5:
Is this a good time?
यह एक अच्छा समय है ? 




In [None]:
# training
for epoch in range(num_epochs):
    model.train()
    epoch_loss = train(train_loader, model, optimizer, scaler, scheduler, loss_fn, epoch)
    losses.append(epoch_loss)

    # persist model and optimizer state after every epoch
    checkpoint = dict(
        state_dict=model.state_dict(),
        optimizer=optimizer.state_dict(),
    )
    save_checkpoint(checkpoint)

    # print translations of the fixed sample sentences to eyeball progress
    infer(sentences, model, max_seq_len)

In [None]:
# to perform good translation, we can do the following - 
# get more data, first try with some shorter sentences (or to check if transformer works well, take small data, try
# overfitting it)
# train it for longer time and schedule the learning rate as given in the paper
# use in built nn.Transformer class 