In [None]:
# Clone the Indic NLP library and its resources, then install Morfessor and NLTK
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
!pip install -U nltk

In [None]:
# Importing all the libraries
import csv
import sys
from spacy.lang.en import English
from io import open
import string
import re
import random
import nltk
import time
import os
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.tokenize import word_tokenize
import operator
from queue import PriorityQueue
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from nltk.tokenize.treebank import TreebankWordDetokenizer


# NLTK data used by this notebook: the Punkt models back word_tokenize and
# WordNet is used by the METEOR score.
# The install cell upgrades NLTK without pinning a version; NLTK >= 3.9 loads
# the tokenizer from `punkt_tab` instead of `punkt`, so fetch both to keep
# word_tokenize working on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# True when a CUDA device is available; the rest of the notebook branches on
# this flag before moving tensors/models to the GPU.
use_cuda = torch.cuda.is_available()
# Path of the training CSV (read by the data-loading cell below).
input_path = '/kaggle/input/hineng/train/train.csv'

# Locations of the two repositories cloned by the first cell.
INDIC_NLP_LIB_HOME=r"/kaggle/working/indic_nlp_library"
INDIC_NLP_RESOURCES="/kaggle/working/indic_nlp_resources"
# The library is used straight from the clone, so its directory must be on
# sys.path before the `indicnlp` imports below can succeed.
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))

from indicnlp import common
from indicnlp import loader
from indicnlp.tokenize import indic_tokenize 
from indicnlp.normalize.indic_normalize import BaseNormalizer

# Point the library at the cloned resources, then run its loader.
common.set_resources_path(INDIC_NLP_RESOURCES)
loader.load()

In [None]:
# Reading the data from the csv file 
# Read the parallel corpus: column 1 of each row is taken as the Hindi
# sentence and column 2 as the English sentence; the first row is a header.
hindi_sent = []
english_sent = []
# Decode explicitly as UTF-8 so the Devanagari text does not depend on the
# platform's default encoding, and pass newline='' as the csv module requires
# so that newlines embedded inside quoted fields are parsed correctly.
with open(input_path, 'r', encoding='utf-8', newline='') as file:
    my_reader = csv.reader(file, delimiter=',')
    header = next(my_reader)  # consume the header row
    for row in my_reader:
        hindi_sent.append(row[1])
        english_sent.append(row[2])
# (hindi, english) tuples, in file order.
pairs = list(zip(hindi_sent, english_sent))

In [None]:
# Reserved vocabulary indices. They must agree with the initial
# word2index/index2word tables created in Language.__init__.
ukn = 0 # index used for out-of-vocabulary tokens
sos = 1 # start-of-sequence index
eos = 2 # end-of-sequence index
pad = 3 # padding index (ignored by the loss via ignore_index=pad)
# Teacher forcing ratio. NOTE(review): this name is not referenced anywhere
# else in the notebook as shown; Seq2Seq.forward has its own default of 0.5.
teacher_forcing = 0.5 # teacher forcing ratio
# Training pairs whose Hindi side has more than MAX_LENGTH tokens are dropped,
# and source index lists are padded/truncated to this length. The token count
# is taken before <sos>/<eos> are added, so a sentence at the limit is
# truncated by do_padding and loses its trailing <eos>.
MAX_LENGTH = 60 # Max length for input sentence while training
# Target index lists are padded/truncated to this length, and greedy/beam
# decoding use it as their output length bound.
MAX_LENGTH_OUT = 100 # Max length output sentence generated
MIN_FREQUENCY = 1 # minimum count before a word receives a vocabulary index
non_word_idxs = [0,1,2,3] # the reserved indices above; skipped when rebuilding text

In [None]:
# To normalize hindi sentences
def normalize_sentence(s):
    """Return `s` normalised by an Indic NLP `BaseNormalizer` for "hi".

    The normaliser is built with `remove_nuktas=False`. A fresh normaliser is
    constructed on every call.
    """
    return BaseNormalizer("hi", remove_nuktas=False).normalize(s)

# To tokenize sentence
def tokenize_sentence(s, lang):
    """Split sentence `s` into a list of tokens according to `lang.name`.

    A language named 'English' is tokenised with NLTK's `word_tokenize`; any
    other language is first passed through `normalize_sentence` and then
    split with Indic NLP's `trivial_tokenize`.
    """
    if lang.name != 'English':
        normalized = normalize_sentence(s)
        return list(indic_tokenize.trivial_tokenize(normalized))
    return list(word_tokenize(s))


# get list of indexes of each token corresponding to sequence
def indexesFromSentence(lang, s):
    """Map sentence `s` to a list of vocabulary indices for `lang`.

    Tokenisation is delegated to `tokenize_sentence`, the same routine
    `prepare_data` uses to build the vocabulary. This matters for Hindi:
    the vocabulary holds *normalised* tokens, so looking up unnormalised
    tokens (as this function previously did) turned every word altered by
    normalisation into `ukn`.

    Tokens absent from `lang.word2index` map to `ukn`.
    """
    return [lang.word2index.get(str(token), ukn) for token in tokenize_sentence(s, lang)]

# returns the indexes as above just adds the <sos> and <eos>
def variableFromSentence(lang, s):
    """Return the index list for `s` wrapped in `sos` ... `eos` markers."""
    return [sos, *indexesFromSentence(lang, s), eos]
    
# this returns a pair of list of indexes corresponding to hindi and english sentence
def variablesFromPair(hindi_lang,english_lang,pair):
    """Index both sides of a (hindi sentence, english sentence) pair.

    Returns a tuple (hindi index list, english index list); each list is
    produced by `variableFromSentence`, so it carries the sos/eos markers.
    """
    hindi_text, english_text = pair[0], pair[1]
    return (
        variableFromSentence(hindi_lang, hindi_text),
        variableFromSentence(english_lang, english_text),
    )

# This does the required padding
def do_padding(pair, max_leng):
    """Return a new list holding `pair` forced to exactly `max_leng` entries.

    Despite its name, `pair` is a single index sequence. Longer sequences are
    cut to their first `max_leng` entries; shorter ones are right-padded with
    the `pad` index. The input list is never modified (the previous version
    appended the padding to the caller's list in place).

    NOTE: truncation keeps only a prefix, so a sequence that ended in `eos`
    loses that marker when it is cut.
    """
    if len(pair) >= max_leng:
        return list(pair[:max_leng])
    return list(pair) + [pad] * (max_leng - len(pair))


# Creating out language using the data
def prepare_data(pairs):
    """Build the Hindi and English vocabularies from sentence pairs.

    Each pair is (hindi sentence, english sentence). Both sides are tokenised
    with `tokenize_sentence` and every token is fed to `Language.addWord`.
    The vocabulary sizes are printed, and the two `Language` objects are
    returned as (hindi_lang, english_lang).
    """
    hindi_lang = Language('Hindi')
    english_lang = Language('English')

    for pair in pairs:
        hindi_text, english_text = pair[0], pair[1]
        for hindi_token in tokenize_sentence(hindi_text, hindi_lang):
            hindi_lang.addWord(hindi_token)
        for english_token in tokenize_sentence(english_text, english_lang):
            english_lang.addWord(str(english_token))

    print(f'English has {english_lang.vocab} words')
    print(f'Hindi has {hindi_lang.vocab} words')

    return (hindi_lang, english_lang)

In [None]:
# Creating a Language class 
class Language:
    """Vocabulary of one language: word <-> index tables plus word counts."""

    def __init__(self, name):
        """Create a vocabulary that contains only the four reserved tokens."""
        self.name = name  # language name; tokenisation branches on it
        # Reserved tokens occupy indices 0-3.
        self.word2index = {"<sos>": 1, "<eos>": 2, "<ukn>": 0, "<pad>": 3}
        self.index2word = {index: word for word, index in self.word2index.items()}
        self.vocab = 4  # number of indexed entries; also the next free index
        # A word is indexed once its count reaches this threshold.
        self.minfrequency = MIN_FREQUENCY
        # Reserved tokens start at the threshold count.
        self.wordcount = dict.fromkeys(self.word2index, self.minfrequency)

    def addWord(self, word):
        """Count one occurrence of `word`; index it once it is frequent enough."""
        self.wordcount[word] = self.wordcount.get(word, 0) + 1
        if word in self.word2index or self.wordcount[word] < self.minfrequency:
            return
        new_index = self.vocab
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.vocab = new_index + 1

## Used in beam search
class BeamNode(object):
    """One hypothesis step in beam search, linked to the step before it."""

    def __init__(self, hiddenstate, previousNode, wordId, logProb, length):
        self.h = hiddenstate          # decoder hidden state stored with this node
        self.prevNode = previousNode  # parent node (None for the start node)
        self.wordid = wordId          # token id stored with this node
        self.logp = logProb           # accumulated log-probability of the path
        self.leng = length            # number of nodes on the path so far

    def eval(self):
        """Length-normalised score: log-probability divided by (length - 1).

        The 1e-6 term keeps the division defined for the start node, whose
        length is 1.
        """
        denominator = float(self.leng - 1 + 1e-6)
        return self.logp / denominator

    # Ordering is by path length only; it lets (score, node) tuples be
    # compared when two scores are equal.
    def __gt__(self, other):
        return other.leng < self.leng

    def __lt__(self, other):
        return other.leng > self.leng

In [None]:
# The language object for Hindi and English language
# Empty Language objects. Before the vocabularies are built, the length-filter
# cell passes `hindi_lang` to tokenize_sentence, which only reads `.name`.
# Both names are rebound to the real vocabularies by prepare_data later on.
hindi_lang = Language('Hindi')
english_lang = Language('English')

In [None]:
# Here I remove the special characters from out english sentences
# Lower-case every English sentence, split it on whitespace and the listed
# punctuation characters, and re-join the non-empty pieces with single spaces.
# The Hindi side of each pair is left untouched.
temp = pairs
pairs = [
    (
        pair[0],
        ' '.join(
            piece
            for piece in re.split(r'[\s!"#$%&\()+,-./:;<=>?@\\^_`{|}~]+', pair[1].lower())
            if piece
        ),
    )
    for pair in temp
]

In [None]:
# Here I am doing the train-dev split 
# Shuffle the pairs in place with a fixed seed, then keep the first 98% for
# training and the remaining 2% as the dev set.
random.seed(44)
random.shuffle(pairs)
train_length = int(0.98 * len(pairs))
train_split, dev_split = pairs[:train_length], pairs[train_length:]

In [None]:
# Here I am filtering the train split, and will use only those hindi sentences whose length <= MAX_LENGTH
# Keep only the training pairs whose Hindi sentence has at most MAX_LENGTH
# tokens (counted after tokenize_sentence, i.e. before <sos>/<eos> are added).
temp = train_split
train_split = [
    pair
    for pair in temp
    if len(tokenize_sentence(pair[0], hindi_lang)) <= MAX_LENGTH
]

In [None]:
# Build both vocabularies from the filtered training pairs only; dev-set words
# that never occur in training therefore map to the unknown index.
hindi_lang, english_lang = prepare_data(train_split)

In [None]:
## In this cell we are converting our sentences into indexes, doing padding and then creating the dataloader object for both train and dev set
# Index every sentence pair: each entry is
# (hindi index list, english index list), both wrapped in <sos>/<eos>.
training_pairs = [variablesFromPair(hindi_lang, english_lang, pair) for pair in train_split]
eval_pairs = [variablesFromPair(hindi_lang, english_lang, pair) for pair in dev_split]
# Reference English sentences of the dev set, used later for BLEU/METEOR.
true_sentences = [pair[1] for pair in dev_split]

# Pad/truncate sources to MAX_LENGTH and targets to MAX_LENGTH_OUT so each
# side stacks into one rectangular tensor of shape (num_pairs, length).
x_train = torch.tensor([do_padding(src, MAX_LENGTH) for src, _ in training_pairs])
y_train = torch.tensor([do_padding(trg, MAX_LENGTH_OUT) for _, trg in training_pairs])

x_eval = torch.tensor([do_padding(src, MAX_LENGTH) for src, _ in eval_pairs])
y_eval = torch.tensor([do_padding(trg, MAX_LENGTH_OUT) for _, trg in eval_pairs])

# The datasets are deliberately NOT bound to the names `train`/`dev`: a later
# cell defines a function called `train`, and re-running this cell would
# silently replace that function with a TensorDataset.
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
train_data = torch.utils.data.DataLoader(train_dataset, batch_size=32)

dev_dataset = torch.utils.data.TensorDataset(x_eval, y_eval)
dev_data = torch.utils.data.DataLoader(dev_dataset, batch_size=32)

In [None]:
class Encoder(nn.Module):
    """Embedding layer followed by a bidirectional GRU.

    Args:
        input_size: number of rows in the embedding table (callers in this
            notebook pass the Hindi vocabulary size).
        embed_size: embedding dimension.
        hidden_size: GRU hidden dimension per direction.
        n_layers: number of stacked GRU layers.
        dropout: dropout passed to `nn.GRU`.
    """

    def __init__(self, input_size, embed_size, hidden_size, n_layers=1, dropout=0.5):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.embed = nn.Embedding(input_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, n_layers, dropout=dropout, bidirectional=True)

    def forward(self, src, hidden=None):
        """Encode `src` (token indices) and return (outputs, hidden).

        The GRU emits the two directions concatenated on the last axis; they
        are summed here so `outputs` has `hidden_size` features. `hidden` is
        returned exactly as produced by the GRU.
        """
        gru_outputs, hidden = self.gru(self.embed(src), hidden)
        forward_part = gru_outputs[:, :, :self.hidden_size]
        backward_part = gru_outputs[:, :, self.hidden_size:]
        return forward_part + backward_part, hidden


class Attention(nn.Module):
    """Scores every encoder step against one decoder hidden state.

    The score of a step is `v . relu(W [hidden ; encoder_output])`, and the
    scores are soft-maxed over the encoder steps.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
        # Re-initialise v uniformly in [-1/sqrt(hidden_size), 1/sqrt(hidden_size)).
        bound = 1.0 / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-bound, bound)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights shaped (batch, 1, steps).

        `encoder_outputs` is indexed (steps, batch, features) and `hidden`
        is (batch, features), as implied by the transposes below.
        """
        steps = encoder_outputs.size(0)
        keys = encoder_outputs.transpose(0, 1)               # (batch, steps, H)
        query = hidden.repeat(steps, 1, 1).transpose(0, 1)   # (batch, steps, H)
        energy = F.relu(self.attn(torch.cat([query, keys], 2)))
        scorer = self.v.repeat(keys.size(0), 1).unsqueeze(1)  # (batch, 1, H)
        scores = torch.bmm(scorer, energy.transpose(1, 2)).squeeze(1)
        return F.softmax(scores, dim=1).unsqueeze(1)


class Decoder(nn.Module):
    """Single-step attention decoder: embedding -> attention -> GRU -> output.

    Args:
        embed_size: embedding dimension of the target tokens.
        hidden_size: GRU hidden dimension (also the context-vector size).
        output_size: number of output classes (callers pass the English
            vocabulary size).
        n_layers: number of stacked GRU layers.
        dropout: dropout applied to the embedding and passed to `nn.GRU`.
    """

    def __init__(self, embed_size, hidden_size, output_size, n_layers=1, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.output_size = output_size
        self.embed_size = embed_size

        self.embed = nn.Embedding(output_size, embed_size)
        self.dropout = nn.Dropout(dropout, inplace=True)
        self.gru = nn.GRU(hidden_size + embed_size, hidden_size, n_layers, dropout=dropout)
        self.out = nn.Linear(hidden_size * 2, output_size)
        self.attention = Attention(hidden_size)

    def forward(self, input, last_hidden, encoder_outputs):
        """Run one decoding step.

        Returns (log-probabilities over the output classes, new hidden state,
        attention weights).
        """
        # Embed the previous token and apply dropout (in place).
        embedded = self.dropout(self.embed(input).unsqueeze(0))
        # Attend over the encoder outputs using the top layer's hidden state.
        attn_weights = self.attention(last_hidden[-1], encoder_outputs)
        # Weighted sum of encoder outputs, moved back to (1, batch, hidden).
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).transpose(0, 1)
        gru_output, hidden = self.gru(torch.cat([embedded, context], 2), last_hidden)
        # Project [GRU output ; context] to the output classes.
        logits = self.out(torch.cat([gru_output.squeeze(0), context.squeeze(0)], 1))
        # log_softmax (not softmax): callers feed this to F.nll_loss.
        return F.log_softmax(logits, dim=1), hidden, attn_weights



In [None]:
# This is just the combination of the decoder, encoder and attention module class
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper providing training-time decoding and inference.

    `forward` is used for loss computation (with optional teacher forcing);
    `decode`/`infer` run the encoder and then greedy or beam search.
    """
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode `trg.size(0)` steps and return (outputs, all_attention).

        `src` and `trg` are indexed (length, batch), as implied by
        `trg.size(0)` / `src.size(1)` below. `outputs` has shape
        (trg length, batch, decoder.output_size); row 0 is never written and
        stays zero. `all_attention` holds the attention weights of each step.
        """
        max_len = trg.size(0) 
        batch_size = src.size(1)
        vocab_size = self.decoder.output_size
        # The output buffer is placed according to the global `use_cuda` flag.
        if(use_cuda):
            outputs = Variable(torch.zeros(max_len, batch_size, vocab_size)).cuda()
        else:
            outputs = Variable(torch.zeros(max_len, batch_size, vocab_size))
        encoder_output, hidden = self.encoder(src)
        # Keep only the first `decoder.n_layers` entries of the encoder's final
        # hidden state as the decoder's initial hidden state.
        hidden = hidden[:self.decoder.n_layers] 
        all_attention = []
        # First decoder input: row 0 of the target batch.
        output = Variable(trg.data[0, :])
        for t in range(1, max_len):
            output, hidden, attn_weights = self.decoder(
                output, hidden, encoder_output) 
            all_attention.append(attn_weights)
            outputs[t] = output
            # Most likely token of this step for every batch element.
            top1 = output.data.max(1)[1]
            # One draw per step decides, for the whole batch, whether the next
            # input is the reference token or the model's own prediction.
            is_teacher = random.random() < teacher_forcing_ratio
            if(use_cuda):
                output = Variable(trg.data[t] if is_teacher else top1).cuda()
            else:
                output = Variable(trg.data[t] if is_teacher else top1)
        return outputs, all_attention

    def decode(self, src, strategy='beam'):
        """Encode `src` and decode it with beam search ('beam') or greedily.

        Any `strategy` other than 'beam' selects greedy search.
        """
        encoder_output, hidden = self.encoder(src) 
        hidden = hidden[:self.decoder.n_layers] 
        if strategy == 'beam':
            return self.beam_search(hidden, encoder_output)
        else:
            return self.greedy_search(hidden, encoder_output)
        
        
    def infer(self, input_sent, strategy = 'beam'): # input should be a tensor of shape [sent_len, 1]
        """Same steps as `decode`; the body is identical."""
        encoder_output, hidden = self.encoder(input_sent)
        hidden = hidden[:self.decoder.n_layers]
        if(strategy == 'beam'):
            return self.beam_search(hidden, encoder_output)
        else:
            return self.greedy_search(hidden, encoder_output)

    def greedy_search(self, decoder_hidden, encoder_outputs):
        """Pick the most likely token at every step.

        Returns a (batch, MAX_LENGTH_OUT) tensor of token ids created with
        `torch.zeros` (no dtype or device given). Decoding always runs for
        MAX_LENGTH_OUT steps; it does not stop when `eos` is produced.
        """
        batch_size = decoder_hidden.shape[1]
        seq_len = MAX_LENGTH_OUT
        decoded_batch = torch.zeros((batch_size, seq_len))
        # The literal 1 below is the start-of-sequence index.
        if(use_cuda):
            decoder_input = Variable(torch.tensor([1]*batch_size)).cuda()  # sos
        else:
            decoder_input = Variable(torch.tensor([1]*batch_size))
        for t in range(seq_len):
            decoder_output, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.data.topk(1) 
            topi = topi.view(-1)
            decoded_batch[:, t] = topi
            # The prediction becomes the next step's input.
            decoder_input = topi.detach().view(-1)

        return decoded_batch

    def beam_search(self, decoder_hiddens, encoder_outputs):
        """Best-first beam search, run independently for every batch element.

        Returns a list with one entry per batch element; each entry is a list
        of `topk` token-id paths, and each path is a list of 1-element tensors
        starting with the `sos` input. This is much slower than greedy search.
        """
        beam_size = 5 # The beam size
        topk = 1  # number of sentences we need to infer for a particular hindi sentence
        decoded_batch = [] # This will store the decoded sentences for entire batch
        seq_len = MAX_LENGTH_OUT # The maximum length of inferred sentence
        batch_size = decoder_hiddens.shape[1]
        # This strategy works when we do one sentence at a time so loop over the batch size
        for idx in range(batch_size):  # batch_size 
            # Slice out this sentence, keeping a batch axis of size 1.
            encoder_output = encoder_outputs[:, idx, :].unsqueeze(1) 
            decoder_hidden = decoder_hiddens[:, idx, :].unsqueeze(0) 
            # Check if cuda is available
            if(use_cuda):
                decoder_input = torch.LongTensor([sos]).cuda()
            else:
                decoder_input = torch.LongTensor([sos])

            # Number of sentence to generate
            end_nodes = []
            # `end_nodes` is empty here, so the first branch can never be
            # taken and req_num is always `topk`.
            if((topk + 1) < topk - len(end_nodes)):
                req_num = topk + 1
            else:
                req_num = topk - len(end_nodes)
            node_queue = PriorityQueue() # Using a priority queue to create the queue
            # starting node -  hidden vector, previous node, word id, logp, length
            node = BeamNode(decoder_hidden, None, decoder_input, 0, 1)
            # Entries are (negated score, node): the queue pops the smallest
            # value, i.e. the node with the highest length-normalised score.
            node_queue.put((-node.eval(), node))
            size_of_queue = 1
            limit = 2000
            # start beam search
            while True:
                # give up when decoding takes too long
                if size_of_queue > limit:
                    break
                # Here we extract the best node
                score, n = node_queue.get()
                decoder_hidden = n.h
                decoder_input = n.wordid

                # A popped <eos> node (other than the start node) is a
                # finished hypothesis.
                if (n.wordid.item() == eos):
                    if (n.prevNode != None):
                        end_nodes.append((score, n))
                        # if required number is reached then break
                        if len(end_nodes) >= req_num:
                            break
                        else:
                            continue
                # decode for one step using decoder
                decoder_output, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_output)
                # Expand the node with its `beam_size` most likely next tokens.
                log_probability, indexes = torch.topk(decoder_output, beam_size)
                n_next = []
                for new_k in range(beam_size):
                    decoded_t = indexes[0][new_k].view(-1)
                    log_prob = log_probability[0][new_k].item()
                    node = BeamNode(decoder_hidden, n, decoded_t, n.logp + log_prob, n.leng + 1)
                    score = -node.eval()
                    n_next.append((score, node))

                # put them into queue
                # NOTE: the local name `nn` shadows the `torch.nn` alias
                # inside this method.
                for i in range(len(n_next)):
                    score, nn = n_next[i]
                    node_queue.put((score, nn))
                    # increase size_of_queue
                # One node was popped and len(n_next) were pushed.
                size_of_queue += len(n_next) - 1
            # Choose the best paths for back-tracking
            # If the limit was hit before any hypothesis finished, fall back to
            # the best unfinished nodes still in the queue.
            if len(end_nodes) == 0:
                end_nodes = [node_queue.get() for _ in range(topk)]
            inferred = []
            for score, nn in sorted(end_nodes, key=operator.itemgetter(0)):
                possibilities = []
                possibilities.append(nn.wordid)
                # Back tracking
                while nn.prevNode != None:
                    nn = nn.prevNode
                    possibilities.append(nn.wordid)
                # The walk collected tokens last-to-first; restore the order.
                possibilities = possibilities[::-1]
                inferred.append(possibilities)
            decoded_batch.append(inferred)

        return decoded_batch


In [None]:
## initialising the hyperparameters
epochs = 10        # number of passes over the training data
# NOTE(review): the DataLoaders built earlier hard-code batch_size=32 and this
# name is not read anywhere else in the notebook as shown.
batch_size = 32
lr = 0.0001        # learning rate given to the Adam optimizer
grad_clip = 10.0   # max gradient norm passed to clip_grad_norm_
embed_size = 256   # embedding dimension for both encoder and decoder
hidden_size = 300  # GRU hidden dimension for both encoder and decoder

In [None]:
## Initialize the model parameters
def init_weights(m):
    """Re-draw every parameter of module `m` from N(0, 0.05**2), in place.

    All parameters returned by `m.parameters()` are covered, including those
    of nested sub-modules and bias vectors.
    """
    for param in m.parameters():
        nn.init.normal_(param.data, mean=0, std=0.05)


## Function to train the model
def train(epoch, model, optimizer, train_iter, grad_clip):
    """Run one training epoch and return the mean batch loss.

    Args:
        epoch: epoch number (accepted for the caller's convenience; unused).
        model: the Seq2Seq model; called as `model(src, trg)`.
        optimizer: optimizer stepping `model`'s parameters.
        train_iter: iterable of (source batch, target batch) tensors shaped
            (batch, length); it must support `len()`.
        grad_clip: maximum gradient norm passed to `clip_grad_norm_`.

    A progress line with the average loss since the previous report is
    printed every 100 batches. The average is taken over the number of
    batches actually accumulated (the first window holds 101 batches,
    indices 0-100, and used to be divided by 100).
    """
    vocab_size = english_lang.vocab
    model.train()
    window_loss = 0.0     # loss accumulated since the last progress line
    window_batches = 0    # number of batches in that window
    overall_loss = 0.0
    strt = time.time()
    for idx, batch in enumerate(train_iter):
        # The loader yields (batch, length); the model expects (length, batch).
        src = batch[0].permute(1, 0)
        trg = batch[1].permute(1, 0)

        if use_cuda:
            src, trg = src.cuda(), trg.cuda()
        optimizer.zero_grad()
        output, _ = model(src, trg)
        # Skip position 0 (the <sos> input) and ignore padded target positions.
        loss = F.nll_loss(output[1:].view(-1, vocab_size), trg[1:].contiguous().view(-1), ignore_index=pad)
        loss.backward()
        clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        batch_loss = loss.item()
        window_loss += batch_loss
        window_batches += 1
        overall_loss += batch_loss

        if idx % 100 == 0 and idx != 0:
            print("[%d][loss:%5.2f][time:%5.2f]" % (idx, window_loss / window_batches, time.time() - strt))
            strt = time.time()
            window_loss = 0.0
            window_batches = 0
    return overall_loss / len(train_iter)

## Function to get attention weights
def get_attention_weights(model, src, trg):
    """Return the per-step attention weights for one (src, trg) pair.

    The model is switched to eval mode and run with full teacher forcing and
    gradients disabled. Both inputs are expected as [length, 1] tensors
    (per the original note on this function).
    """
    with torch.no_grad():
        model.eval()
        if use_cuda:
            src, trg = src.cuda(), trg.cuda()
        _, attention_wts = model(src, trg, teacher_forcing_ratio=1.0)
    return attention_wts
## Returns sentences from list of indexes
def indexes_to_sentences(idxs):
    """Turn a sequence of token-id tensors into a detokenised English string.

    Conversion stops at the first `eos` id. Greedy decoding always emits
    MAX_LENGTH_OUT ids and keeps generating after the end-of-sequence marker,
    so everything past it is noise that must not reach the sentence (the
    previous version kept those words). Ids listed in `non_word_idxs` are
    skipped.

    Ids are converted with `int(...)` because the greedy output tensor is
    created by `torch.zeros` without an integer dtype.
    """
    sequence = []
    for idx in idxs:
        token_id = int(idx.item())
        if token_id == eos:
            break
        if token_id not in non_word_idxs:
            sequence.append(english_lang.index2word[token_id])
    return TreebankWordDetokenizer().detokenize(sequence)

# Returns sentences from list of indexes
def indexes_to_sentences_beam(idxs):
    """Detokenise the first path of a beam-search result.

    `idxs[0]` is a list of token-id tensors; ids listed in `non_word_idxs`
    are dropped and the remaining ids are mapped to English words.
    """
    words = [
        english_lang.index2word[idx.item()]
        for idx in idxs[0]
        if idx.item() not in non_word_idxs
    ]
    return TreebankWordDetokenizer().detokenize(words)

## evaluate the model
def evaluate(model, val_iter):
    """Return the mean per-batch NLL loss of `model` over `val_iter`.

    The model is put in eval mode and decoded without teacher forcing, with
    gradients disabled. `val_iter` yields (source, target) batches shaped
    (batch, length) and must support `len()`.
    """
    vocab_size = english_lang.vocab
    total_loss = 0
    with torch.no_grad():
        model.eval()
        for batch in val_iter:
            # (batch, length) -> (length, batch), the layout the model expects.
            src = batch[0].permute(1, 0)
            trg = batch[1].permute(1, 0)
            if use_cuda:
                src, trg = src.data.cuda(), trg.data.cuda()
            output, _ = model(src, trg, teacher_forcing_ratio=0.0)
            # Position 0 is skipped and padded targets are ignored.
            loss = F.nll_loss(output[1:].view(-1, vocab_size), trg[1:].contiguous().view(-1), ignore_index=pad)
            total_loss += loss.data.item()
        return total_loss / len(val_iter)
    
## evaluate model and get bleu score on dev set    
def evaluate_for_score(true_sentences, val_iter):
    """Greedy-decode the dev set and score it against the references.

    Uses the module-level `seq2seq` model. Returns
    `(corpus BLEU-4, predicted sentences)`.

    Changes from the previous version:
    - the greedy predictions are returned; before, the second element was the
      beam-search list, which was always empty because beam decoding was
      commented out;
    - decoding runs in eval mode under `torch.no_grad()`, so dropout is off
      and no autograd graph is built across the 100 decoding steps.

    Beam search is not run here because it is far slower than greedy search.
    """
    pred_sentences_greedy = []
    seq2seq.eval()
    with torch.no_grad():
        for batch in val_iter:
            # (batch, length) -> (length, batch), the layout the model expects.
            src = batch[0].permute(1, 0)
            if use_cuda:
                src = src.cuda()
            decoded_greedy = seq2seq.decode(src, 'greedy')
            pred_sentences_greedy.extend(indexes_to_sentences(row) for row in decoded_greedy)
    print('Scores for greedy decoding')
    greedy_score = get_scores(true_sentences, pred_sentences_greedy)
    return (greedy_score, pred_sentences_greedy)
    
## the evaluation script 
def get_scores(true_sentences, pred_sentences):
    """Print macro BLEU-4, corpus BLEU-4 and METEOR; return corpus BLEU-4.

    Args:
        true_sentences: reference sentences (strings).
        pred_sentences: predicted sentences (strings), same length and order.

    Both lists are lower-cased and stripped of the listed punctuation before
    scoring. Fixes relative to the previous version:
    - the cleaned sentences are the ones that get scored (they used to be
      computed and then ignored in favour of the raw strings);
    - the caller's lists are no longer modified in place;
    - METEOR receives token lists, which current NLTK requires.

    If the two lists differ in length a message is printed and `sys.exit()`
    is called, as before.
    """
    if len(true_sentences) != len(pred_sentences):
        print(f'E: Number of sentences do not match. True: {len(true_sentences)} Pred: {len(pred_sentences)}')
        sys.exit()

    def _tokens(sentence):
        # Lower-case, split on whitespace and some punctuation from
        # string.punctuation, and drop the empty pieces.
        return list(filter(None, re.split(r'[\s!"#$%&\()+,-./:;<=>?@\\^_`{|}~]+', sentence.lower())))

    def _meteor(ref_tokens, hyp_tokens):
        # Newer NLTK releases raise TypeError for plain strings and older
        # ones expect them, so try tokens first and fall back to strings.
        try:
            return single_meteor_score(ref_tokens, hyp_tokens)
        except TypeError:
            return single_meteor_score(' '.join(ref_tokens), ' '.join(hyp_tokens))

    references = [_tokens(sentence) for sentence in true_sentences]
    hypotheses = [_tokens(sentence) for sentence in pred_sentences]
    smoothing = SmoothingFunction().method2

    scores = {}
    # Macro-averaged BLEU-4 score (mean of the sentence-level scores).
    scores['bleu_4_macro'] = 0
    for ref, hyp in zip(references, hypotheses):
        scores['bleu_4_macro'] += sentence_bleu([ref], hyp, smoothing_function=smoothing)
    scores['bleu_4_macro'] /= len(references)

    # Corpus-level BLEU-4 score.
    scores['bleu_4'] = corpus_bleu(
        [[ref] for ref in references],
        hypotheses,
        smoothing_function=smoothing
    )

    # METEOR score, averaged over sentences.
    scores['meteor'] = 0
    for ref, hyp in zip(references, hypotheses):
        scores['meteor'] += _meteor(ref, hyp)
    scores['meteor'] /= len(references)

    print(f'D: Scores: {scores}')

    return scores['bleu_4']

## this function uses the model to get the translations on the test set and forms the answer.txt file
def get_test_result(model, test_path):
    """Translate every test sentence with beam search and write `answer.txt`.

    Args:
        model: a trained Seq2Seq model (its `infer` method is used).
        test_path: CSV file with a header row; column 2 of each row is the
            Hindi sentence to translate.

    One translation per line is written to `answer.txt` in the working
    directory. Fixes relative to the previous version: the input is moved to
    the GPU only when `use_cuda` is set (it used to call `.cuda()`
    unconditionally), inference runs in eval mode under `torch.no_grad()`,
    files are opened with an explicit UTF-8 encoding, and the output file is
    closed even if writing fails.
    """
    output_path = 'answer.txt' # output path of the translated test sentences

    # Read the sentences that need to be translated.
    with open(test_path, 'r', encoding='utf-8', newline='') as file:
        my_reader = csv.reader(file, delimiter=',')
        next(my_reader)  # skip the header row
        test_sentences = [row[2] for row in my_reader]

    pred_new = []
    start = time.time()
    model.eval()
    with torch.no_grad():
        for count, sent in enumerate(test_sentences, start=1):
            # NOTE(review): inputs are padded to 100 here, whereas training
            # sources are padded to MAX_LENGTH; kept unchanged.
            indexes = do_padding(variableFromSentence(hindi_lang, sent), 100)
            src = torch.tensor(indexes).view(-1, 1)  # (length, batch=1)
            if use_cuda:
                src = src.cuda()
            # infer() returns one entry per batch element; take the only one.
            pred_new.append(indexes_to_sentences_beam(model.infer(src)[0]))
            if count % 100 == 0:
                print(f'{count} sentences translated {pred_new[-1]}, time taken: {time.time()-start}')

    # Write the translations, one per line.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(sent + "\n" for sent in pred_new)

In [None]:
### Declaring the model

# Two-layer bidirectional encoder over the Hindi vocabulary and a one-layer
# attention decoder over the English vocabulary.
encoder = Encoder(hindi_lang.vocab, embed_size, hidden_size, n_layers=2, dropout=0.5)
decoder = Decoder(embed_size, hidden_size, english_lang.vocab, n_layers=1, dropout=0.5)
# Move the model to the GPU when one is available.
if use_cuda:
    seq2seq = Seq2Seq(encoder, decoder).cuda()
else:
    seq2seq = Seq2Seq(encoder, decoder)
optimizer = optim.Adam(seq2seq.parameters(), lr=lr)

## Initializing the model
# Module.apply calls init_weights on every sub-module as well as on the model
# itself, and init_weights already walks all nested parameters, so parameters
# are re-drawn more than once; the final values are still N(0, 0.05^2) draws.
seq2seq.apply(init_weights)

In [None]:
## To train a new model run this cell otherwise skip this

max_bleu = 0  # best dev BLEU seen so far
model_name = 'CS779_seq2seq_best.pth'  # checkpoint file for the best model
start_time = time.time()
# Per-epoch history.
train_losses = []
val_losses = []
bleu_scores = []
for e in range(1, epochs+1):
    # One pass over the training data.
    train_iter = iter(train_data)
    train_loss = train(e, seq2seq, optimizer, train_iter, grad_clip)
    train_losses.append(train_loss)
    # Dev loss (no teacher forcing).
    val_iter = iter(dev_data)
    val_loss = evaluate(seq2seq, val_iter)
    val_losses.append(val_loss)
    print("[Epoch:%d] time:%5.3f val_loss:%5.3f" % (e, (time.time()-start_time), val_loss))
    start_time = time.time()
    # A fresh iterator is needed because the previous one is exhausted.
    val_iter = iter(dev_data)
    score, _ = evaluate_for_score(true_sentences, val_iter)
    bleu_scores.append(score)
    print(f'bleu score: {score}')
    # Keep only the checkpoint with the best dev BLEU. torch.save is given the
    # whole module object here, not just its state_dict.
    if(score > max_bleu):
        if(os.path.exists(model_name)):
            os.remove(model_name)
        torch.save(seq2seq, model_name)
        print('Saving Model ...')
        max_bleu = score

In [None]:
## To load the pre-traineed model run this cell
saved_model_path = '/kaggle/input/cs779-pretrained-model-2/CS779_seq2seq_best_filtered.pth'
# The checkpoint is a whole pickled module (see the torch.save call in the
# training cell), so the Seq2Seq/Encoder/Decoder/Attention classes must already
# be defined. map_location places the weights on the device this session
# actually has; without it a checkpoint saved on a GPU cannot be loaded on a
# CPU-only machine.
# NOTE(review): unpickling executes code from the file, so only load trusted
# checkpoints. Recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules; if that applies to the installed
# version, pass weights_only=False here.
seq2seq = torch.load(saved_model_path, map_location=torch.device('cuda' if use_cuda else 'cpu'))

In [None]:
## use the model to get score on dev set
# Score the current model on the dev set. `score` is the value returned by
# get_scores (corpus BLEU-4); `pred` is the second element of the returned tuple.
val_iter = iter(dev_data)
score, pred = evaluate_for_score(true_sentences, val_iter)

In [None]:
## use model on test set to get score
# Translate the test sentences; get_test_result writes them to answer.txt
# (it does not compute a score).
test_path = '/kaggle/input/testphase/testhindistatements.csv' ## path to test file
get_test_result(seq2seq, test_path)