In [1]:
import torch
import pandas as pd
import numpy as np
import os
import sys
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
import math
from tempfile import TemporaryDirectory
from typing import Tuple
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, IterableDataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from torch import Tensor
from torch.nn import Transformer
import math
from timeit import default_timer as timer

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Model / training hyper-parameters; they are consumed when the model, optimizer
# and DataLoaders are created further down.
EMB_SIZE = 400  # embedding width, passed to Seq2SeqTransformer as emb_size
NHEAD = 2  # number of attention heads
FFN_HID_DIM = 512  # passed to Seq2SeqTransformer as dim_feedforward
BATCH_SIZE = 16  # sentence pairs per DataLoader batch
NUM_ENCODER_LAYERS = 4
NUM_DECODER_LAYERS = 4
# Adam optimizer settings
lr = 0.0001
betas = (0.9, 0.98)
eps = 1e-9
NUM_EPOCHS = 36 # the reported result was obtained with NUM_EPOCHS set to 36
split_size = 0.1  # test_size handed to train_test_split (held-out fraction)

# Keys used to index the per-language tokenizer / vocabulary dictionaries
SRC_LANGUAGE = 'hi'
TGT_LANGUAGE = 'en'

# vocab_size = {}
# vocab_size[SRC_LANGUAGE] = 7000
# vocab_size[TGT_LANGUAGE] = 7000

In [3]:
# Mount Google Drive (Colab only); comment these two lines out when running elsewhere.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Create the Drive folder used below for the answer/model/prediction files (no-op if it exists).
!mkdir -p "/content/drive/My Drive/My Folder"

In [5]:
# Fetch the Indic NLP library and its resource files, then make the library importable.
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
# NOTE(review): Morfessor is installed without a version pin.
!pip install Morfessor
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"

import sys
# The library is used straight from the cloned checkout, so add it to the import path.
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
# Point the library at the cloned resources before loading it.
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1399, done.[K
remote: Counting objects: 100% (180/180), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 1399 (delta 135), reused 143 (delta 116), pack-reused 1219[K
Receiving objects: 100% (1399/1399), 9.57 MiB | 14.45 MiB/s, done.
Resolving deltas: 100% (745/745), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 139 (delta 2), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (139/139), 149.77 MiB | 16.21 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Collecting Morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


In [6]:
# Tag that selects the data files and names the output files for this run.
Lang = 'HG'
answer_filename = f'/content/drive/My Drive/My Folder/answer_{Lang}.txt'
model_filename = f'/content/drive/My Drive/My Folder/model_{Lang}.pt'

In [7]:
# Read the parallel training data and the source-only test sentences from Drive.
DATA_PATH = f'/content/drive/MyDrive/Machine_Translation_Data/English_{Lang}.csv'
# TEST_PATH = '/kaggle/input/cs779-mt/eng_Hindi_data_dev_X.csv'
FINAL_TEST_DATA = f'/content/drive/MyDrive/Machine_Translation_Data/English_{Lang}_val.csv'

# Both files are read with header=None, so the column names are assigned by hand.
data = pd.read_csv(DATA_PATH, header=None)
data.columns = ['hindi', 'english']

# test = pd.read_csv(TEST_PATH, header = None)
# test.columns = ['sentence']
final_test_data = pd.read_csv(FINAL_TEST_DATA, header=None)
final_test_data.columns = ['sentence']
data.head()


Unnamed: 0,hindi,english
0,"મરી, આદુ, એલચી અને હળદર જેવા સુગંધિત છોડ અને મ...","Aromatic plants and spices such as pepper, gin..."
1,ઇન્દોર જિલ્લાની તમામ બ્લડ બેંકમાંથી ઓછામાં ઓછા...,Find all blood banks having atleast 1 units of...
2,મેલબોર્નમાં તેના ગુમ થયા પછીના દિવસોમાં તેની ફ...,His Ford sedan was sighted via CCTV footage in...
3,જો ચંદ્રગુપ્તએ કર્ણાટકમાં ત્યાગ તરીકે પોતાનું ...,If the Jain tradition about Chandragupta endin...
4,જમીન પર નિશાન કરવા અને બીજને યોગ્ય ભાર કે વજન ...,Marking out the land and weighing the seed int...


In [8]:
def swap_columns(df, col1, col2):
    """Return ``df`` with the positions of columns ``col1`` and ``col2`` exchanged.

    The input frame is not modified; a re-ordered selection of it is returned.
    Raises ValueError if either name is not a column of ``df``.
    """
    order = list(df.columns)
    first = order.index(col1)
    second = order.index(col2)
    order[first], order[second] = order[second], order[first]
    return df[order]

# Put the English column first.
# NOTE(review): swap_columns swaps unconditionally, so re-running this cell flips the
# column order back again (later cells select columns by name, so they are unaffected).
data = swap_columns(data, 'hindi', 'english')
data.head()


Unnamed: 0,english,hindi
0,"Aromatic plants and spices such as pepper, gin...","મરી, આદુ, એલચી અને હળદર જેવા સુગંધિત છોડ અને મ..."
1,Find all blood banks having atleast 1 units of...,ઇન્દોર જિલ્લાની તમામ બ્લડ બેંકમાંથી ઓછામાં ઓછા...
2,His Ford sedan was sighted via CCTV footage in...,મેલબોર્નમાં તેના ગુમ થયા પછીના દિવસોમાં તેની ફ...
3,If the Jain tradition about Chandragupta endin...,જો ચંદ્રગુપ્તએ કર્ણાટકમાં ત્યાગ તરીકે પોતાનું ...
4,Marking out the land and weighing the seed int...,જમીન પર નિશાન કરવા અને બીજને યોગ્ય ભાર કે વજન ...


In [9]:
# Preview the source-side column as loaded (before the transliteration cell below).
data['hindi']

0         મરી, આદુ, એલચી અને હળદર જેવા સુગંધિત છોડ અને મ...
1         ઇન્દોર જિલ્લાની તમામ બ્લડ બેંકમાંથી ઓછામાં ઓછા...
2         મેલબોર્નમાં તેના ગુમ થયા પછીના દિવસોમાં તેની ફ...
3         જો ચંદ્રગુપ્તએ કર્ણાટકમાં ત્યાગ તરીકે પોતાનું ...
4         જમીન પર નિશાન કરવા અને બીજને યોગ્ય ભાર કે વજન ...
                                ...                        
128274            2021-22 प्रो कबड्डी लीग का आठवां सत्र है।
128275    कभी-कभी प्रयोगशाला में बाहर के कर्मचारी जांच क...
128276                                       3. आम रोगजनकों
128277    दिन 2 हरिद्वार बारकोट 220kilomItara 7 घंटे देह...
128278    खेत में नमी की कमी होने पर सिंचाई नितान्त आवश्...
Name: hindi, Length: 128279, dtype: object

In [10]:
# Preview the test sentences as loaded (before the transliteration cell below).
final_test_data['sentence']

0        માતાના લોહીમાં હિમોગ્લોબીનનું પ્રમાણ ૮થી ઓછું ...
1                                                બુધવાર છે
2        આ જથ્થાબંધ વ્યાપારના ૪.૭ % અર્થાત્ રૂપિયા ૪૮૫ ...
3        આ તમામ નદીઓ પવિત્ર છે અને આ નદીઓમાં પવિત્ર ડૂબ...
4        વાઘની સુરક્ષા માટે માનસ રાષ્ટ્રીય ઉદ્યાનમાં વા...
                               ...                        
18320    k इनटू ed आप में से कितने लोग k इनटू ed कहते हैं?
18321    दक्षिण भारत में काबिनी नदी कावेरी नदी की प्रमु...
18322    दोनों गावों में, निवासी अपने शिष्यों के साथ रह...
18323    इस परियोजना की घोषणा तत्कालीन प्रधानमंत्री मनम...
18324    ब्रुश करने की उचित विधि ही प्लाक को रोकने की स...
Name: sentence, Length: 18325, dtype: object

In [11]:
# Coerce every cell to str so the tokenizers always receive text
# (non-string cells are converted with str()).
data['english'] = data['english'].map(str)
data['hindi'] = data['hindi'].map(str)
final_test_data['sentence'] = final_test_data['sentence'].map(str)

In [12]:
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from tqdm import tqdm

# Transliterate the source column with language codes 'gu' -> 'hi' so all source
# text shares one script.
# The column is rebuilt and assigned in a single step: element-wise chained
# assignment (data['hindi'][i] = ...) triggers SettingWithCopyWarning, is slow, and
# does not update the frame at all when pandas Copy-on-Write is enabled.
data['hindi'] = [
    UnicodeIndicTransliterator.transliterate(sentence, 'gu', 'hi')
    for sentence in tqdm(data['hindi'])
]

100%|██████████| 128279/128279 [00:49<00:00, 2576.24it/s]


In [13]:
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from tqdm import tqdm

# Apply the same 'gu' -> 'hi' transliteration to the test sentences.
# Assigned as a whole column rather than with chained element assignment
# (final_test_data['sentence'][i] = ...), which warns and is a silent no-op when
# pandas Copy-on-Write is enabled.
final_test_data['sentence'] = [
    UnicodeIndicTransliterator.transliterate(sentence, 'gu', 'hi')
    for sentence in tqdm(final_test_data['sentence'])
]

100%|██████████| 18325/18325 [00:02<00:00, 6256.21it/s]


In [14]:
# Inspect the source column again after transliteration.
data['hindi']

0         मरी, आदु, एलची अने हळदर जेवा सुगंधित छोड अने म...
1         इन्दोर जिल्लानी तमाम ब्लड बेंकमांथी ओछामां ओछा...
2         मेलबोर्नमां तेना गुम थया पछीना दिवसोमां तेनी फ...
3         जो चंद्रगुप्तए कर्णाटकमां त्याग तरीके पोतानुं ...
4         जमीन पर निशान करवा अने बीजने योग्य भार के वजन ...
                                ...                        
128274            2021-22 प्रो कबड्डी लीग का आठवां सत्र है।
128275    कभी-कभी प्रयोगशाला में बाहर के कर्मचारी जांच क...
128276                                       3. आम रोगजनकों
128277    दिन 2 हरिद्वार बारकोट 220kilomItara 7 घंटे देह...
128278    खेत में नमी की कमी होने पर सिंचाई नितान्त आवश्...
Name: hindi, Length: 128279, dtype: object

In [15]:
# Inspect the test sentences again after transliteration.
final_test_data['sentence']

0        माताना लोहीमां हिमोग्लोबीननुं प्रमाण ८थी ओछुं ...
1                                                बुधवार छे
2        आ जथ्थाबंध व्यापारना ४.७ % अर्थात् रूपिया ४८५ ...
3        आ तमाम नदीओ पवित्र छे अने आ नदीओमां पवित्र डूब...
4        वाघनी सुरक्षा माटे मानस राष्ट्रीय उद्यानमां वा...
                               ...                        
18320    k इनटू ed आप में से कितने लोग k इनटू ed कहते हैं?
18321    दक्षिण भारत में काबिनी नदी कावेरी नदी की प्रमु...
18322    दोनों गावों में, निवासी अपने शिष्यों के साथ रह...
18323    इस परियोजना की घोषणा तत्कालीन प्रधानमंत्री मनम...
18324    ब्रुश करने की उचित विधि ही प्लाक को रोकने की स...
Name: sentence, Length: 18325, dtype: object

In [16]:
#train test split using sklearn
# `split_size` of the rows are held out for validation; random_state fixes the shuffle.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the
# notebook. Later cells read eval['english'] / eval['hindi'], so it is kept as-is here.
train, eval = train_test_split(data, test_size = split_size, random_state = 42)
# Restart the row labels at 0 so the columns can be indexed with 0..n-1.
train = train.reset_index(drop = True)
eval = eval.reset_index(drop = True)
print(train.shape)
print(eval.shape)
train.head()

(115451, 2)
(12828, 2)


Unnamed: 0,english,hindi
0,Winter rice crop is raised preferably in low l...,शियाळु चोखानो पाक नीचा स्तरवाळा विस्तारमां उगा...
1,"With so much going on, Humayun did not even me...","आटलुं बधुं चालतुं होवाथी, हुमायु पर्शियामां ते..."
2,When the need of rest to be given to the tired...,दिवसमां त्रण प्याला चा पीवाथी मासपेशियोमां खें...
3,"The use of ""Mughal"" and ""Moghul"" derived from ...","""मुगल"" और ""मोगल"" का उपयोग ""मंगोल"" के अरबी और फ..."
4,"Therefore , every person should stay beware of...",अतः हर व्यक्ति को इस से सावधान रहना चाहिए ।


In [17]:
#defining the iterable class for creating the iterable dataset
#it takes two series as input namely hindi and english sentence series and
#generates a tuple of source and target sentence as follows
class MyIterableDataset(IterableDataset):
    """Iterable dataset over two aligned sentence collections.

    Every item is a ``(hindi_sentence, english_sentence)`` tuple, i.e. the
    Hindi-side sentence first and the English-side sentence second.

    Args:
        english_sentences: sized, indexable collection (e.g. list or pandas Series).
        hindi_sentences: collection aligned position-by-position with
            ``english_sentences``.

    The dataset can be iterated repeatedly: each call to ``iter()`` rewinds the
    internal cursor, so one instance can serve several passes over the data.
    """

    def __init__(self, english_sentences, hindi_sentences):
        self.english_sentences = english_sentences
        self.hindi_sentences = hindi_sentences
        self.index = 0

    @staticmethod
    def _item_at(sentences, position):
        """Return the element at ``position``, positionally for pandas objects.

        Plain ``series[i]`` is a label lookup, which fails (or returns the wrong
        row) when the Series index is not 0..n-1; ``.iloc`` is used when present.
        """
        positional = getattr(sentences, 'iloc', None)
        if positional is None:
            return sentences[position]
        return positional[position]

    def __iter__(self):
        # Rewind on every new iteration; without this a second pass over the same
        # object would be empty because the cursor stays at the end.
        self.index = 0
        return self

    def __next__(self):
        if self.index >= len(self.english_sentences):
            raise StopIteration
        english_sentence = self._item_at(self.english_sentences, self.index)
        hindi_sentence = self._item_at(self.hindi_sentences, self.index)
        self.index += 1
        return hindi_sentence, english_sentence

# Example usage: wrap the train and validation splits.
# Note the argument order is (english, hindi) while items come out as (hindi, english).
train_iter = MyIterableDataset(train['english'], train['hindi'])
eval_iter = MyIterableDataset(eval['english'], eval['hindi'])


In [18]:
import spacy
# Presumably needed once if the model is not installed yet — confirm for your environment:
# !python -m spacy download en_core_web_sm
# spaCy English pipeline; engTokenize below uses its tokenizer.
eng = spacy.load("en_core_web_sm")

from indicnlp.tokenize import indic_tokenize

def engTokenize(text):
    """
    Split English text into a list of token strings using the spaCy tokenizer.
    The input is converted with str() first.
    """
    doc = eng.tokenizer(str(text))
    tokens = []
    for tok in doc:
        tokens.append(str(tok.text))
    return tokens

def hiTokenize(text):
    """
    Split source-language (Indic script) text into a list of token strings using
    indic_tokenize.trivial_tokenize. The input is converted with str() first.
    """
    pieces = indic_tokenize.trivial_tokenize(str(text))
    return list(map(str, pieces))

In [19]:
# def getTokens(data_iter, place):
#     """
#     Function to yield tokens from an iterator. Since, our iterator contains
#     tuple of sentences (source and target), `place` parameters defines for which
#     index to return the tokens for. `place=0` for source and `place=1` for target
#     """
#     for english, german in data_iter:
#         if place == 0:
#             yield engTokenize(english)
#         else:
#             yield hiTokenize(german)

In [20]:
# Upper bound on vocabulary size per language (passed as max_tokens when building vocabularies)
vocab_size = {
    SRC_LANGUAGE: 59000,
    TGT_LANGUAGE: 43000,
}

In [21]:
# Per-language lookup tables
vocab_transform = {}  # filled in when the vocabularies are built below
token_transform = {
    SRC_LANGUAGE: hiTokenize,   # raw source text -> tokens
    TGT_LANGUAGE: engTokenize,  # raw target text -> tokens
}

# function to generate the tokens for each language
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Yield the token list of every sample in ``data_iter`` for one language.

    Args:
        data_iter: iterable of ``(source_sentence, target_sentence)`` tuples.
        language: SRC_LANGUAGE or TGT_LANGUAGE; selects both the tuple position
            (0 for source, 1 for target) and the tokenizer from ``token_transform``.

    Yields:
        One list of token strings per sample.

    Raises:
        KeyError: if ``language`` is neither SRC_LANGUAGE nor TGT_LANGUAGE
            (raised when the generator is first advanced).
    """
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    # Resolve the position and tokenizer once instead of once per sample.
    position = language_index[language]
    tokenize = token_transform[language]

    for data_sample in data_iter:
        yield tokenize(data_sample[position])

# Special tokens; each one's index is its position in this list
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

# Build one vocabulary per language from the training sentences.
for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    # a new dataset object is created for each language pass
    train_iter = MyIterableDataset(train['english'], train['hindi'])
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(train_iter, ln),
        min_freq=2,
        specials=special_symbols,
        special_first=True,
        max_tokens=vocab_size[ln],
    )
    # Tokens that are not in the vocabulary are looked up as UNK_IDX.
    vocab_transform[ln].set_default_index(UNK_IDX)

In [22]:
# Size of the source vocabulary (bounded by vocab_size[SRC_LANGUAGE]).
len(vocab_transform[SRC_LANGUAGE])

59000

In [23]:
# Size of the target (English) vocabulary.
len(vocab_transform['en'])

43000

In [24]:
# Sanity check of tokenization and numericalization on the first English training
# sentence. The same (English) tokenizer feeds both lines, and the labels name the
# language that is actually being processed.
sample_tokens = token_transform[TGT_LANGUAGE](train['english'][0])
print("tokenized english sentence ", sample_tokens)
print("numericalized english sentence ", vocab_transform[TGT_LANGUAGE](sample_tokens))

tokenized hindi sentence  ['Winter', 'rice', 'crop', 'is', 'raised', 'preferably', 'in', 'low', 'lying', 'areas', 'that', 'remain', 'flooded', 'mainly', 'during', 'the', 'rainy', 'season', '.']
numericalized hindi sentence  [5489, 638, 250, 10, 1972, 4624, 9, 500, 2506, 255, 17, 776, 12849, 933, 114, 4, 3177, 465, 5]


In [25]:
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position values to token embeddings, then apply dropout.

    A table of shape (maxlen, 1, emb_size) is precomputed and stored as the
    non-trainable buffer ``pos_embedding``; even feature columns hold sines and
    odd columns hold cosines.

    Args:
        emb_size: embedding width.
        dropout: dropout probability applied after the addition.
        maxlen: number of positions precomputed.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 7000):
        super().__init__()
        # one inverse frequency per (sin, cos) column pair
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        positions = torch.arange(maxlen).unsqueeze(1)
        angles = positions * den

        table = torch.zeros(maxlen, emb_size)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # the middle axis of size 1 broadcasts over the second axis of the input
        self.register_buffer('pos_embedding', table.unsqueeze(1))

    def forward(self, token_embedding: Tensor):
        """Return dropout(token_embedding + the first ``size(0)`` position rows)."""
        seq_len = token_embedding.size(0)
        return self.dropout(token_embedding + self.pos_embedding[:seq_len])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Look up token-id embeddings and scale them by sqrt(emb_size).

    Args:
        vocab_size: number of rows in the embedding table.
        emb_size: width of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return scaled embeddings for ``tokens`` (cast to int64 before lookup)."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

# The Network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translation model built around ``torch.nn.Transformer``.

    Source and target token ids are embedded (TokenEmbedding), given positional
    information (PositionalEncoding, shared by both sides), passed through the
    Transformer, and projected to target-vocabulary logits by ``generator``.

    Args:
        num_encoder_layers / num_decoder_layers: depth of each stack.
        emb_size: embedding width, used as the Transformer ``d_model``.
        nhead: number of attention heads.
        src_vocab_size / tgt_vocab_size: sizes of the two embedding tables;
            ``tgt_vocab_size`` is also the width of the output logits.
        dim_feedforward: hidden width of the feed-forward sub-layers.
        dropout: dropout probability for the Transformer and positional encoding.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Sub-modules are created in a fixed order: it determines both the order in
        # which parameters are registered and the order random initial values are drawn.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src=src_emb,
            tgt=tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Return the encoder output ("memory") for ``src``."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Return decoder hidden states for ``tgt`` given encoder ``memory``."""
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

In [26]:
def generate_square_subsequent_mask(sz):
    """Return a (sz, sz) float mask on DEVICE for causal attention.

    Entries on and below the main diagonal are 0.0; entries above it are -inf.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=DEVICE)
    # triu with diagonal=1 keeps only the strictly-upper part and zeroes the rest
    return torch.triu(blocked, diagonal=1)


#mask creation for preventing the knowledge of presence of elements in future time steps
def create_mask(src, tgt):
    """Build the four masks passed to the model's forward for one batch.

    Args:
        src: source token ids; axis 0 is read as the sequence length.
        tgt: target-input token ids; axis 0 is read as the sequence length.

    Returns:
        (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask) where
        src_mask is an all-False square boolean matrix (nothing masked),
        tgt_mask is the causal float mask, and the padding masks are True
        wherever the (transposed) ids equal PAD_IDX.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=DEVICE)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_padding_mask = src.eq(PAD_IDX).transpose(0, 1)
    tgt_padding_mask = tgt.eq(PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [27]:
#hyper-parameter setting

# Fix the torch RNG seed before any parameters are created/initialised.
torch.manual_seed(0)

# Embedding-table and output-layer sizes are taken from the built vocabularies.
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
print(SRC_VOCAB_SIZE)
print(TGT_VOCAB_SIZE)



#creating the model with the hyperparams specified as above
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

# initializes the weight matrices of the transformer model using Xavier uniform initialization
# (only parameters with more than one dimension; 1-D parameters such as biases are skipped)
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE) #push the model to the device

# ignore_index=PAD_IDX: target positions equal to PAD_IDX are excluded from the loss
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX) #cross entropy loss

optimizer = torch.optim.Adam(transformer.parameters(), lr=lr, betas=betas, eps=eps) #adam optimizer

59000
43000




In [28]:
'''creating the collate function to be passed in the dataloader which will basically apply
this function to every entry of the batch and make the data feedable to the model
'''
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    """Compose ``transforms`` left to right into a single callable.

    The returned function feeds its argument to the first transform, the result
    to the second, and so on, returning the last result. With no transforms the
    input is returned unchanged.
    """
    def _pipeline(txt_input):
        current = txt_input
        for step in transforms:
            current = step(current)
        return current
    return _pipeline

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    """Wrap token ids with BOS/EOS and return them as a 1-D int64 tensor.

    Args:
        token_ids: vocabulary indices of one sentence (may be empty).

    Returns:
        Tensor ``[BOS_IDX, *token_ids, EOS_IDX]`` with dtype torch.long.

    The dtype is set explicitly: ``torch.tensor([])`` is float32, so building the
    pieces separately and concatenating them would mix float and integer tensors
    for a sentence that tokenizes to nothing.
    """
    return torch.tensor([BOS_IDX, *token_ids, EOS_IDX], dtype=torch.long)

# src and tgt language text transforms to convert raw strings into tensors indices
# Per-language pipeline: raw string -> tokens -> vocabulary ids -> tensor with BOS/EOS
text_transform = {
    language: sequential_transforms(
        token_transform[language],  # tokenization
        vocab_transform[language],  # numericalization
        tensor_transform,           # add BOS/EOS and create tensor
    )
    for language in (SRC_LANGUAGE, TGT_LANGUAGE)
}


# function to collate data samples into batch tensors
def collate_fn(batch):
    """Turn a list of (source, target) sentence pairs into two padded id tensors.

    Trailing newlines are stripped, each sentence goes through the matching
    ``text_transform`` pipeline, and the resulting 1-D tensors are padded with
    PAD_IDX by ``pad_sequence`` (default layout: sequence axis first).

    Returns:
        (src_batch, tgt_batch)
    """
    to_src = text_transform[SRC_LANGUAGE]
    to_tgt = text_transform[TGT_LANGUAGE]

    src_tensors = [to_src(src_sample.rstrip("\n")) for src_sample, _ in batch]
    tgt_tensors = [to_tgt(tgt_sample.rstrip("\n")) for _, tgt_sample in batch]

    return (pad_sequence(src_tensors, padding_value=PAD_IDX),
            pad_sequence(tgt_tensors, padding_value=PAD_IDX))

In [29]:
# Smoke test of the data pipeline: print the source-tensor shape of the first four batches.
train_iter = MyIterableDataset(train['english'], train['hindi'])
train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
count = 1
for src, tgt in train_dataloader:
  if count == 5:  # stop once four shapes have been printed
    break
  print(src.shape)
  count+=1

#checking the shapes of different batches

torch.Size([36, 16])
torch.Size([50, 16])
torch.Size([39, 16])
torch.Size([40, 16])


In [30]:
# Number of rows in each split
ntrain, neval = len(train), len(eval)
print(ntrain)
print(neval)

#total number of samples in the train and eval dataset

115451
12828


In [31]:
# Batches per epoch, counting a final partial batch (ceiling division)
ntrainbatches = math.ceil(ntrain / BATCH_SIZE)
nevalbatches = math.ceil(neval / BATCH_SIZE)
print("number of train batches ", ntrainbatches)
print("number of eval batches ", nevalbatches)

#total number of batches in the train and eval dataset

number of train batches  7216
number of eval batches  802


In [32]:
# Functions for training and evaluation on the whole dataset for one epoch

def train_epoch(model, optimizer):
    """Train ``model`` for one pass over the training split.

    Args:
        model: the seq2seq model; called with
            (src, tgt_input, src_mask, tgt_mask, src_padding_mask,
            tgt_padding_mask, memory_key_padding_mask).
        optimizer: optimizer stepping ``model``'s parameters.

    Returns:
        Mean per-batch loss over the batches actually processed (0 if there were none).
    """
    model.train()
    losses = 0
    num_batches = 0
    train_iter = MyIterableDataset(train['english'], train['hindi'])
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        # Teacher forcing: the decoder sees the target without its last position and
        # is trained to predict the target shifted by one (tgt_out below).
        tgt_input = tgt[:-1, :]
        tgt_out = tgt[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        optimizer.zero_grad()
        # the source padding mask is passed again as the memory key padding mask
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()
        num_batches += 1

    print("train losses ", losses)
    # Divide by the batches seen in this call rather than a batch count computed in
    # another cell, which goes stale if BATCH_SIZE or the split changes.
    return losses / max(num_batches, 1)


def evaluate(model):
    """Compute the mean per-batch loss of ``model`` on the validation split.

    The model is put in eval mode and the whole pass runs under
    ``torch.no_grad()``, so no autograd graph is built or kept for the forward passes.

    Returns:
        Mean per-batch loss over the batches actually processed (0 if there were none).
    """
    model.eval()
    losses = 0
    num_batches = 0

    eval_iter = MyIterableDataset(eval['english'], eval['hindi'])
    val_dataloader = DataLoader(eval_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    with torch.no_grad():
        for src, tgt in val_dataloader:
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            # same input/target shift as in training
            tgt_input = tgt[:-1, :]
            tgt_out = tgt[1:, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()
            num_batches += 1

    print("validation losses ", losses)
    # Divide by the batches seen in this call rather than a batch count computed in another cell.
    return losses / max(num_batches, 1)

In [33]:
# NOTE(review): `!export` runs in a separate shell process, so this line does not set
# the variable for the notebook kernel. `%env CUDA_LAUNCH_BLOCKING=1` (presumably
# needed before CUDA is first used — confirm) would be required for it to take effect.
!export CUDA_LAUNCH_BLOCKING=1 #might give error some time, just comment out if it does so

In [34]:
# Release unreferenced Python objects, then ask PyTorch to release its cached GPU memory.
import gc
gc.collect()
torch.cuda.empty_cache()

In [35]:
# Show the GPU model and current memory usage.
!nvidia-smi

Sun Oct 29 08:15:04 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    29W /  70W |   1203MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [36]:
# #training loop
# prev_val_loss = 0
# for epoch in range(1, NUM_EPOCHS+1):
#     start_time = timer()

#     train_loss = train_epoch(transformer, optimizer)

#     end_time = timer()
#     val_loss = evaluate(transformer)
#     if epoch == 1:
#         prev_val_loss = val_loss
#     else:
#         e = prev_val_loss - val_loss
#         if e < 0.001:
#             break
#         prev_val_loss = val_loss

#     print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

In [37]:
# torch.save(transformer.state_dict(),'/content/drive/My Drive/My Folder/HG_Transliteration.pt')

In [38]:
'''Functions for decoding the model output into an English sentence. Two decoding
schemes are defined here: beam search decoding and greedy decoding. Only greedy
decoding is used by translate() below, because it takes less time.'''

import heapq
import nltk
from nltk.translate.bleu_score import corpus_bleu


def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate a target sequence by taking the highest-scoring token at every step.

    Args:
        model: object exposing ``encode``, ``decode`` and ``generator``.
        src: source token ids for a single sentence, laid out as (src_len, 1).
        src_mask: attention mask for the encoder.
        max_len: maximum length of the returned sequence, start symbol included.
        start_symbol: id of the token that starts decoding.

    Returns:
        Long tensor of shape (T, 1) starting with ``start_symbol``; decoding stops
        after EOS_IDX has been appended or when ``max_len`` tokens are reached.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Inference only: no_grad avoids building an autograd graph at every step.
    with torch.no_grad():
        # the encoder output does not change between steps, so it is computed once
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)
        for _ in range(max_len - 1):
            # boolean causal mask: True marks positions that may not be attended to
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            prob = model.generator(out[:, -1])  # scores for the newest position only
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

def beam_search_decode(model, src, src_mask, max_len, start_symbol, beam_size=3):
    """Generate a target sequence with beam search.

    At every step each unfinished hypothesis is extended with its ``beam_size``
    most likely next tokens, and the ``beam_size`` hypotheses with the highest
    summed log-probability are kept. Hypotheses that already end in EOS_IDX are
    carried over unchanged. No length normalisation is applied.

    Args:
        model: object exposing ``encode``, ``decode`` and ``generator``.
        src: source token ids for a single sentence, laid out as (src_len, 1).
        src_mask: attention mask for the encoder.
        max_len: maximum length of the returned sequence, start symbol included.
        start_symbol: id of the token that starts decoding.
        beam_size: number of hypotheses kept per step.

    Returns:
        Long tensor of shape (T, 1) holding the best-scoring hypothesis.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    with torch.no_grad():  # inference only
        memory = model.encode(src, src_mask).to(DEVICE)
        start = torch.full((1, 1), start_symbol, dtype=torch.long, device=DEVICE)

        # each beam entry is (summed log-probability, token tensor of shape (T, 1))
        beams = [(0.0, start)]

        for _ in range(max_len - 1):
            # nothing left to expand once every hypothesis has produced EOS
            if all(seq[-1, 0].item() == EOS_IDX for _, seq in beams):
                break

            candidates = []
            for score, seq in beams:
                if seq[-1, 0].item() == EOS_IDX:
                    candidates.append((score, seq))
                    continue

                # plain (T, T) boolean causal mask, as in greedy_decode
                tgt_mask = (generate_square_subsequent_mask(seq.size(0))
                            .type(torch.bool)).to(DEVICE)
                out = model.decode(seq, memory, tgt_mask)
                # newest position of the single batch element -> vocabulary scores
                logits = model.generator(out[-1, 0])
                # log-probabilities, so that scores of different steps can be added
                log_probs = torch.log_softmax(logits, dim=-1)
                k = min(beam_size, log_probs.size(-1))
                top_scores, top_idxs = torch.topk(log_probs, k)

                for token_score, token in zip(top_scores.tolist(), top_idxs.tolist()):
                    next_token = torch.full((1, 1), token, dtype=seq.dtype, device=seq.device)
                    candidates.append((score + token_score,
                                       torch.cat([seq, next_token], dim=0)))

            # keep only the best beam_size hypotheses
            beams = heapq.nlargest(beam_size, candidates, key=lambda item: item[0])

    # beams is sorted best-first by heapq.nlargest
    return beams[0][1]


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    """Translate one source sentence into a target-language string.

    The sentence is tokenized/numericalized with the source ``text_transform``,
    decoded greedily for at most ``num_tokens + 5`` positions, and the resulting
    ids are mapped back to tokens and joined with spaces. The literal markers
    "<bos>" and "<eos>" are removed from the joined string.
    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)

    decoded = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX)
    token_ids = list(decoded.flatten().cpu().numpy())
    words = vocab_transform[TGT_LANGUAGE].lookup_tokens(token_ids)

    sentence = " ".join(words)
    for marker in ("<bos>", "<eos>"):
        sentence = sentence.replace(marker, "")
    return sentence

In [39]:
#example of translation
# print(translate(transformer, "ईमान लाओ और उसके रसूल के साथ होकर जिहाद करो"))

In [40]:
# Restore the trained weights. map_location remaps the stored tensors onto DEVICE, so a
# checkpoint saved on a GPU can also be loaded when only the CPU is available.
transformer.load_state_dict(
    torch.load('/content/drive/My Drive/My Folder/HG_Transliteration.pt', map_location=DEVICE)
)

<All keys matched successfully>

In [42]:
# Spot check of the loaded model: print one test sentence followed by its translation.
print(final_test_data['sentence'][25], translate(transformer, final_test_data['sentence'][25]))

काप्पिल बीचनुं नजीकनुं रेलवेस्टेशन कासरगोड , ‍‍‍१२ किलोमीटर दूर छे .  The nearest railway station of Adinath , Taiwan is at a distance of ₹ <unk> kilometres .


In [43]:
import nltk

def calculate_bleu_score(file1_path, file2_path):
    """Return the corpus-level BLEU score between two line-aligned text files.

    Args:
        file1_path: reference file, one sentence per line.
        file2_path: hypothesis file, one sentence per line.

    Each line is stripped and tokenized with ``nltk.word_tokenize``; every
    hypothesis line is scored against the single reference on the same line.
    The 'punkt' tokenizer data is downloaded on every call.
    """
    nltk.download('punkt')

    def _tokenized_lines(path):
        # one token list per line of the file
        with open(path, 'r', encoding='utf-8') as handle:
            return [nltk.word_tokenize(raw.strip()) for raw in handle]

    # corpus_bleu expects a list of reference lists per hypothesis
    truth = [[tokens] for tokens in _tokenized_lines(file1_path)]
    submission_answer = _tokenized_lines(file2_path)

    return nltk.translate.bleu_score.corpus_bleu(truth, submission_answer)

In [46]:
# Translate every source sentence of the validation split (eval['hindi']) and write
# one prediction per line.
count = 0
with open('/content/drive/My Drive/My Folder/English_pred.txt', 'w', encoding = 'utf-8') as f:
  for sentence in tqdm(eval['hindi']):
    translated = translate(transformer, sentence)
    # print(type(translated))
    count+=1  # running tally of sentences written; not read again in this cell
    f.write(translated + '\n')

100%|██████████| 12828/12828 [30:49<00:00,  6.94it/s]


In [47]:
# Write the reference English sentences of the validation split, one per line, in the
# same order as the predictions so the two files line up.
# NOTE(review): a sentence containing an embedded newline would break the line
# alignment with the prediction file — confirm the data has none.
count = 0
with open('/content/drive/My Drive/My Folder/English_true.txt', 'w', encoding = 'utf-8') as f:
  for sentence in tqdm(eval['english']):

    count+=1
    f.write(sentence + '\n')

100%|██████████| 12828/12828 [00:00<00:00, 503542.58it/s]


In [48]:
# Corpus BLEU of the validation predictions (second file) against the references (first file).
score = calculate_bleu_score('/content/drive/My Drive/My Folder/English_true.txt', '/content/drive/My Drive/My Folder/English_pred.txt')
print(score)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


0.14317899334589848
