## IMPORTING LIBRARIES

In [1]:
import requests,zipfile,io
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim
import numpy as np
import random

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## DOWNLOADING AND UNZIPPING DATA

In [3]:
def download_data(url="https://drive.google.com/u/0/uc?id=1uRKU4as2NlS9i8sdLRS1e326vQRdhvfw&export=download"):
    response=requests.get(url)
    z = zipfile.ZipFile(io.BytesIO(response.content))
    z.extractall()

## METHODS FOR GETTING CHARACTERS FOR CORPUSS AND ADDING THEIR INDICES

In [4]:
def get_corpus(data):
    eng_corpus=set()
    hin_corpus=set()
    for i in range(0,len(data)):
        eng_word=data[0][i]
        hin_word=data[1][i]
        for ch in eng_word:
            eng_corpus.add(ch)
        for ch in hin_word:
            hin_corpus.add(ch)
        # End Delimiter
        eng_corpus.add('#')
        hin_corpus.add('#')
        # Start Delimiter
#         eng_corpus.add('^')
        hin_corpus.add('^')
    return hin_corpus,eng_corpus

In [5]:
def word2index(data):
    hin_corpus,eng_corpus=get_corpus(data)
    engchar_idx={}
    hinchar_idx={}
    idx_engchar={}
    idx_hinchar={}
    i=0
    for char in eng_corpus:
        engchar_idx[char]=i
        idx_engchar[i]=char
        i+=1
    i=0
    for char in hin_corpus:
        hinchar_idx[char]=i
        idx_hinchar[i]=char
        i+=1
    return engchar_idx,hinchar_idx,idx_engchar,idx_hinchar,len(eng_corpus),len(hin_corpus)

## DATA PREPROCESSING

In [6]:
def maxlen(data):
    maxlen_eng=0
    maxlen_hin=0
    for i in range(0,len(data)):
        eng_word=data[0][i]
        hin_word=data[1][i]
        if(len(eng_word)>maxlen_eng):
            maxlen_eng=len(eng_word)
        if(len(hin_word)>maxlen_hin):
            maxlen_hin=len(hin_word)
    return maxlen_eng,maxlen_hin

In [7]:
def pre_process(data,eng_to_idx,hin_to_idx):
    eng=[]
    hin=[]
    maxlen_eng,maxlen_hin=maxlen(data)
    
    unknown= 999
    for i in range(0,len(data)):
        sz=0
        eng_word=data[0][i]
        hin_word='^'+data[1][i]
        eng_word = eng_word.ljust(maxlen_eng+1, '#')
        hin_word = hin_word.ljust(maxlen_hin+1, '#')
        idx=[]
        for char in eng_word:
            if eng_to_idx.get(char) is not None:
                idx.append(eng_to_idx[char])
            else:
                idx.append(unknown)
        eng.append(idx)
        idx=[]
        for char in hin_word:
            if hin_to_idx.get(char) is not None:
                idx.append(hin_to_idx[char])
            else:
                idx.append(unknown)
        hin.append(idx)    
    return eng,hin

## LOADING OUR CUSTOM DATASET TO DATALOADER

In [8]:
class MyDataset(Dataset):
    def __init__(self, train_x,train_y, transform=None):
        self.train_x = train_x
        self.train_y = train_y
        self.transform = transform
        
    
    def __len__(self):
        return len(self.train_x)
    
    def __getitem__(self, idx):
        if self.transform:
            sample = self.transform(sample)
        return torch.tensor(self.train_x[idx]).to(device),torch.tensor(self.train_y[idx]).to(device)

def get_data():
    download_data()
    
    train_df=pd.read_csv("aksharantar_sampled/hin/hin_train.csv",header=None)
    test_df=pd.read_csv("aksharantar_sampled/hin/hin_test.csv",header=None)
    val_df=pd.read_csv("aksharantar_sampled/hin/hin_valid.csv",header=None)
    eng_to_idx,hin_to_idx,idx_to_eng,idx_to_hin,input_len,target_len=word2index(train_df)
    
    return train_df,test_df,val_df,eng_to_idx,hin_to_idx,idx_to_eng,idx_to_hin,input_len,target_len

## Seq2Seq MODEL

In [9]:
class Encoder(nn.Module):
    def __init__(self,input_size,hidden_size,embedding_size,num_of_layers,batch_size,bidirec):
        super(Encoder,self).__init__()
        self.hidden_size=hidden_size
        self.batch_size=batch_size
        self.input_size=input_size
        self.embedding_size=embedding_size
        self.embedding=nn.Embedding(input_size,embedding_size)
        self.num_of_layers=num_of_layers
        self.bidirec=bidirec
        self.gru = nn.GRU(embedding_size,hidden_size,num_of_layers,bidirectional=bidirec)

    def forward(self,input,hidden):
        embedded=self.embedding(input).view(-1,self.batch_size, self.embedding_size)
        output,hidden=self.gru(embedded,hidden)
    
        if self.bidirec:
            hidden=hidden.resize(2,self.num_of_layers,self.batch_size,self.hidden_size)
            hidden=torch.add(hidden[0],hidden[1])/2
            
        return output,hidden

    def initHidden(self):
        if(self.bidirec):
            return torch.zeros(2*self.num_of_layers,self.batch_size,self.hidden_size,device=device)
        else:
            return torch.zeros(self.num_of_layers,self.batch_size,self.hidden_size,device=device)

class Decoder(nn.Module):
    def __init__(self, output_size,hidden_size, embedding_size, decoder_layers,batch_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_size=embedding_size
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.gru = nn.GRU(embedding_size,hidden_size, decoder_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.batch_size=batch_size

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(-1, self.batch_size, self.embedding_size)
        output, hidden = self.gru(embedded, hidden)
        output = self.softmax(self.out(output))
        return output, hidden

In [10]:
def train(train_data,encoder,decoder,loss_fun,encoder_optimizer,decoder_optimizer,encoder_layers,decoder_layers):
    total_loss=0
    teacher_forcing_ratio=0.5
    for train_x,train_y in train_data:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        train_x=train_x.T
        train_y=train_y.T
        timesteps=len(train_x)
        encoder_hidden=encoder.initHidden()
        loss=0
        encoder_output,encoder_hidden=encoder(train_x,encoder_hidden)
        
        if(decoder_layers>encoder_layers):
            i = decoder_layers
            decoder_hidden=encoder_hidden
            
            while True:
                if(i==encoder_layers):
                    break
                # Concatenate the two tensors along the first dimension
                decoder_hidden = torch.cat([decoder_hidden, encoder_hidden[-1].unsqueeze(0)], dim=0)
                i-=1
                
        elif(decoder_layers<encoder_layers):
            decoder_hidden=encoder_hidden[-decoder_layers:]
            
        else:
            decoder_hidden=encoder_hidden
        
        decoder_input = train_y[0]
        
        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        if use_teacher_forcing:
            for i in range(0,len(train_y)):
                decoder_output, decoder_hidden= decoder(decoder_input, decoder_hidden)
                loss+=loss_fun(torch.squeeze(decoder_output), train_y[i])
                decoder_input = train_y[i]  # Teacher forcing
        else:
            for i in range(0,len(train_y)):
                decoder_output,decoder_hidden=decoder(decoder_input,decoder_hidden)
                max_prob,index=decoder_output.topk(1)
                loss+=loss_fun(torch.squeeze(decoder_output), train_y[i])
                decoder_input=index
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        total_loss+=loss
    return total_loss.item()/len(train_y),encoder,decoder

In [11]:
def train_iter(input_data,input_len,target_len,epochs,batch_size,embedding_size,encoder_layers,decoder_layers,hidden_size,cell_type,bi_directional,dropout,beam_size):
    lr=0.001
    encoder=Encoder(input_len,hidden_size,embedding_size,encoder_layers,batch_size,bi_directional).to(device)
    decoder=Decoder(target_len,hidden_size,embedding_size,decoder_layers,batch_size).to(device)
    encoder_optimizer=optim.Adam(encoder.parameters(),lr)
    decoder_optimizer=optim.Adam(decoder.parameters(),lr)
    loss_fun=nn.CrossEntropyLoss(reduction="sum")
    encoder.train()
    decoder.train()
    epoch_loss=[]
    for i in range(0,epochs):
        loss,encoder,decoder=train(input_data,encoder,decoder,loss_fun,encoder_optimizer,decoder_optimizer,encoder_layers,decoder_layers)
        epoch_loss.append(loss/51200)
    
    train_predictions=eval(input_data,encoder,decoder,encoder_layers,decoder_layers)
    return train_predictions,epoch_loss,encoder,decoder,encoder_layers,decoder_layers

In [12]:
def eval(input_data,encoder,decoder,encoder_layers,decoder_layers):
    encoder.eval()
    decoder.eval()
    pred_words=list()
    for x,y in input_data:
        decoder_words=[]
        x=x.T
        y=y.T
        encoder_hidden=encoder.initHidden()
        timesteps=len(x)
        encoder_output,encoder_hidden=encoder(x,encoder_hidden)
        if(decoder_layers>encoder_layers):
            i = decoder_layers
            decoder_hidden=encoder_hidden
            while True:
                if(i==encoder_layers):
                    break
                # Concatenate the two tensors along the first dimension
                decoder_hidden = torch.cat([decoder_hidden, encoder_hidden[-1].unsqueeze(0)], dim=0)
                i-=1
        
        elif(decoder_layers<encoder_layers):
            decoder_hidden=encoder_hidden[-decoder_layers:] 
        
        else:
            decoder_hidden=encoder_hidden
        decoder_input = y[0]
        
        for i in range(0,len(y)):
            decoder_output,decoder_hidden=decoder(decoder_input,decoder_hidden)
            max_prob,index=decoder_output.topk(1)
            index=index.squeeze()
            decoder_input=index
            decoder_words.append(index.tolist())
        decoder_words=np.array(decoder_words)
        pred_words.append(decoder_words.T)
    
    
    predictions=[]
    for batch in pred_words:
        for word in batch:
            predictions.append(word)
    return predictions

In [13]:
def accuracy(predictions,train_y):
    count=0
    for i in range(0,len(predictions)):
        p=predictions[i].tolist()
        if p==train_y[i]:
            count+=1
    return count,len(predictions)

In [14]:
train_df,test_df,val_df,eng_to_idx,hin_to_idx,idx_to_eng,idx_to_hin,input_len,target_len=get_data()

train_x,train_y = pre_process(train_df,eng_to_idx,hin_to_idx)
test_x,test_y = pre_process(test_df,eng_to_idx,hin_to_idx)
val_x,val_y = pre_process(val_df,eng_to_idx,hin_to_idx)

train_dataset=MyDataset(train_x,train_y)
test_dataset=MyDataset(test_x,test_y)
val_dataset=MyDataset(val_x,val_y)


train_dataloader=DataLoader(train_dataset,batch_size=128)
test_dataloader=DataLoader(test_dataset,batch_size=128)
val_dataloader=DataLoader(val_dataset,batch_size=128)


In [15]:
train_predictions,epoch_loss,encoder,decoder,encoder_layers,decoder_layers=train_iter(train_dataloader,input_len,target_len,50,128,256,3,2,256,"GRU",True,0.2,16)



In [16]:
print(epoch_loss)

[1.5703898111979167, 0.9853421456473214, 0.8294122023809524, 0.7106489490327381, 0.6144423130580356, 0.5907033575148809, 0.5387016950334822, 0.49251226515997026, 0.4624995349702381, 0.4303104945591518, 0.41987743559337803, 0.3967113095238095, 0.3666245814732143, 0.3521223667689732, 0.3582190232049851, 0.3424424525669643, 0.34697806222098215, 0.3049237060546875, 0.2938509695870536, 0.2828720238095238, 0.30916495186941967, 0.3084758068266369, 0.2902264694940476, 0.2711185128348214, 0.2562541271391369, 0.2508927408854167, 0.2162175787062872, 0.20768310546875, 0.19995717366536458, 0.20758438836960566, 0.23259403773716517, 0.20509020124162947, 0.20172678629557292, 0.19641087123325893, 0.1860189238048735, 0.1924930681501116, 0.19381934756324404, 0.18708140055338543, 0.24034245082310268, 0.236819327218192, 0.19135992140997027, 0.1709168933686756, 0.16007250104631698, 0.1470549083891369, 0.14734915597098214, 0.144521731422061, 0.15340349469866071, 0.1512815202985491, 0.16245404924665177, 0.161

In [17]:
val_predictions=eval(val_dataloader,encoder,decoder,encoder_layers,decoder_layers)

In [18]:
test_predictions=eval(test_dataloader,encoder,decoder,encoder_layers,decoder_layers)

In [19]:
acc,total=accuracy(train_predictions,train_y)
print(acc/total*100)

75.083984375


In [20]:
acc,total=accuracy(val_predictions,val_y)
print(acc/total*100)

26.123046875


In [21]:
acc,total=accuracy(test_predictions,test_y)
print(acc/total*100)

26.025390625
