In [None]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.22.2-py2.py3-none-any.whl (203 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.3/203.3 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [None]:
import pandas as pd
import torchtext
import numpy as np
import torch
import torch.nn as nn
import random
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import wandb

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset archive is
# read from there and the trained checkpoint is copied back to it later.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the dataset archive from the mounted Drive into the working directory,
# extract it quietly (-q), then delete the local copy of the archive.
zip_path = "/content/drive/MyDrive/aksharantar_sampled.zip"
!cp "{zip_path}" .
!unzip -q aksharantar_sampled.zip
!rm aksharantar_sampled.zip 

In [None]:
def get_data(lang):
    """Return the CSV paths of one language's dataset splits.

    Args:
        lang: language code used both as the sub-directory name and as the
            file-name prefix (e.g. "hin").

    Returns:
        A tuple ``(train_csv, test_csv, val_csv)`` -- note that the test path
        comes before the validation path.
    """
    return (
        f"aksharantar_sampled/{lang}/{lang}_train.csv",
        f"aksharantar_sampled/{lang}/{lang}_test.csv",
        f"aksharantar_sampled/{lang}/{lang}_valid.csv",
    )


train_data, test_data, val_data = get_data("hin")

In [None]:
### For train
# Build the character vocabularies and maximum lengths from the training split.
input_texts = []
target_texts = []

# Read every cell as the literal text found in the file. With pandas' default
# NA handling, empty cells and words such as "nan" or "null" are parsed as NaN,
# which .astype(str) would then turn into the string "nan" -- so the empty-row
# filter below could never trigger and such words were silently corrupted.
train_df = pd.read_csv(
    train_data, header=None, names=["1", "2"], dtype=str, keep_default_na=False
).fillna("")

# Add all the input and target texts with start sequence and end sequence added to target
for input_text, target_text in zip(train_df["1"], train_df["2"]):
    if target_text == '' or input_text == '':
        continue
    input_texts.append(input_text)
    # "\t" marks the start of a target word and "\n" marks its end.
    target_texts.append("\t" + target_text + "\n")

# Sorted unique characters of each side.
english_tokens = sorted(set("".join(input_texts)))
hindi_tokens = sorted(set("".join(target_texts)))

# Characters are numbered from 1; index 0 is reserved for padding and the
# index after the last character is used for unknown characters.
eng_token_map = {ch: i + 1 for i, ch in enumerate(english_tokens)}
hin_token_map = {ch: i + 1 for i, ch in enumerate(hindi_tokens)}

eng_token_map["<UNK>"] = len(english_tokens) + 1
hin_token_map["<UNK>"] = len(hindi_tokens) + 1
eng_token_map['<PAD>'] = 0
hin_token_map['<PAD>'] = 0

# Longest source word, and longest target word including its "\t"/"\n" markers.
max_eng_len = max(len(text) for text in input_texts)
max_hin_len = max(len(text) for text in target_texts)

In [None]:
def pre_process(data):
    """Encode a transliteration CSV into zero-padded index matrices.

    Each row of the CSV is read as (source word, target word). Target words
    are wrapped in "\\t" (start) and "\\n" (end). Characters are mapped
    through the module-level ``eng_token_map`` / ``hin_token_map``; characters
    missing from a map become that map's "<UNK>" index, and unused trailing
    positions stay 0.

    Args:
        data: path (or file-like object) accepted by ``pd.read_csv``; the file
            has no header row and two columns.

    Returns:
        ``(a, b)``: float32 arrays of shape ``(n_rows, max_eng_len + 2)`` and
        ``(n_rows, max_hin_len + 2)`` holding the source and target indices.

    Raises:
        IndexError: if a word is longer than the corresponding array width.
    """
    # Read cells as literal text: with pandas' default NA handling, empty
    # cells and words like "nan"/"null" are parsed as NaN and would become the
    # string "nan", which defeats the empty-row filter below.
    df = pd.read_csv(
        data, header=None, names=["1", "2"], dtype=str, keep_default_na=False
    ).fillna("")

    input_texts = []
    target_texts = []
    for input_text, target_text in zip(df["1"], df["2"]):
        if target_text == '' or input_text == '':
            continue
        input_texts.append(input_text)
        target_texts.append("\t" + target_text + "\n")

    a = np.zeros((len(input_texts), max_eng_len + 2), dtype="float32")
    b = np.zeros((len(target_texts), max_hin_len + 2), dtype="float32")

    eng_unk = eng_token_map["<UNK>"]
    hin_unk = hin_token_map["<UNK>"]

    for i, (x, y) in enumerate(zip(input_texts, target_texts)):
        for j, ch in enumerate(x):
            a[i, j] = eng_token_map.get(ch, eng_unk)

        for j, ch in enumerate(y):
            b[i, j] = hin_token_map.get(ch, hin_unk)

    return a, b

In [None]:
# Encode the train, validation and test splits (in that order) into index matrices.
(trainx, trainy), (valx, valy), (testx, testy) = (
    pre_process(split_path) for split_path in (train_data, val_data, test_data)
)

In [None]:
# Inverse lookups (index -> token) used to turn model output back into text.
reverse_eng_map = {index: token for token, index in eng_token_map.items()}
reverse_hin_map = {index: token for token, index in hin_token_map.items()}

In [None]:

class CustomDataset(Dataset):
    """Dataset that pairs the i-th input sequence with the i-th target sequence."""

    def __init__(self, input_data, target_data):
        # Both containers are stored as given and indexed with the same position.
        self.input_data, self.target_data = input_data, target_data

    def __len__(self):
        # The number of samples is taken from the input side.
        return len(self.input_data)

    def __getitem__(self, idx):
        return self.input_data[idx], self.target_data[idx]

def custom_collate(batch):
    """Collate (input, target) NumPy pairs into two sequence-first tensors.

    The samples are stacked along axis 1, so each returned tensor has the
    sequence positions on dimension 0 and the batch on dimension 1.
    """
    sources, targets = zip(*batch)
    stacked_sources = np.stack(sources, axis=1)
    stacked_targets = np.stack(targets, axis=1)
    return torch.from_numpy(stacked_sources), torch.from_numpy(stacked_targets)


# Wrap each encoded split in a dataset (train, validation, test -- in that order).
train_dataset, val_dataset, test_dataset = (
    CustomDataset(inputs, targets)
    for inputs, targets in ((trainx, trainy), (valx, valy), (testx, testy))
)


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'


class Encoder(nn.Module):
    """Embeds source indices and runs them through an LSTM, GRU or plain RNN.

    ``cell_type`` selects the recurrent layer: "LSTM", "GRU", or anything else
    for ``nn.RNN``. Index 0 is the padding index of the embedding.
    """

    def __init__(self, input_size, embed_dim, hidden_size, num_layers, dropout,cell_type):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embed_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type

        # Pick the recurrent layer class once; all three share the same constructor arguments.
        rnn_class = nn.LSTM if cell_type == "LSTM" else nn.GRU if cell_type == "GRU" else nn.RNN
        self.rnn = rnn_class(embed_dim, hidden_size, num_layers, dropout=dropout)

    def forward(self, x):
        """Encode ``x``.

        Returns ``(hidden, cell)`` for an LSTM and ``(output, hidden)`` for
        the GRU and plain-RNN variants.
        """
        embedded = self.dropout(self.embedding(x))
        output, state = self.rnn(embedded)

        if self.cell_type == "LSTM":
            hidden, cell = state
            return hidden, cell

        return output, state



class Decoder(nn.Module):
    """Single-step decoder: embedding -> LSTM/GRU/RNN -> linear layer over the output vocabulary.

    ``cell_type`` selects the recurrent layer: "LSTM", "GRU", or anything else
    for ``nn.RNN``. Index 0 is the padding index of the embedding.
    """

    def __init__(self, output_size, embed_dim, hidden_size, num_layers, dropout,cell_type):
        super().__init__()
        self.hidden_size = hidden_size
        self.cell_type = cell_type
        self.embedding = nn.Embedding(output_size, embed_dim, padding_idx=0)

        rnn_class = nn.LSTM if cell_type == "LSTM" else nn.GRU if cell_type == "GRU" else nn.RNN
        self.rnn = rnn_class(embed_dim, hidden_size, num_layers, dropout=dropout)

        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden, cell):
        """Decode one time step.

        ``x`` holds one token index per batch element; a leading length-1
        dimension is added before embedding and removed from the scores again.

        Returns ``(scores, hidden, cell)`` for an LSTM and ``(scores, hidden)``
        otherwise; ``cell`` is ignored by the GRU and plain-RNN variants.
        """
        step_input = x.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))

        if self.cell_type == "LSTM":
            rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))
            return self.fc(rnn_out).squeeze(0), hidden, cell

        rnn_out, hidden = self.rnn(embedded, hidden)
        return self.fc(rnn_out).squeeze(0), hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    The encoder is expected to expose ``cell_type`` and to return
    ``(hidden, cell)`` when ``cell_type == "LSTM"`` and ``(outputs, hidden)``
    otherwise. The decoder is called as ``decoder(x, hidden, cell)`` and is
    expected to return ``(scores, hidden, cell)`` for an LSTM and
    ``(scores, hidden)`` otherwise; its ``fc`` layer defines the output
    vocabulary size.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Run the encoder once, then decode ``target.shape[0] - 1`` steps.

        Args:
            source: source indices, batch on dimension 1.
            target: target indices, sequence on dimension 0; ``target[0]`` is
                fed to the decoder as the first input.
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the decoder's own prediction.

        Returns:
            Tensor of shape ``(target_len, batch_size, vocab_size)`` with the
            decoder scores; row 0 is left as zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Take the vocabulary size and the device from the model/inputs rather
        # than from notebook globals, so the module is self-contained.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=source.device)

        is_lstm = self.encoder.cell_type == "LSTM"
        if is_lstm:
            hidden, cell = self.encoder(source)
        else:
            # GRU / plain RNN: only the final hidden state seeds the decoder;
            # the per-step encoder outputs are not used without attention.
            _, hidden = self.encoder(source)
            cell = None

        x = target[0]

        for t in range(1, target_len):
            if is_lstm:
                output, hidden, cell = self.decoder(x, hidden, cell)
            else:
                # Decoder.forward takes (x, hidden, cell); passing the encoder
                # outputs as an extra argument raised a TypeError here.
                output, hidden = self.decoder(x, hidden, cell)

            outputs[t] = output
            top1 = output.argmax(1)
            if random.random() < teacher_forcing_ratio:
                x = target[t]
            else:
                x = top1

        return outputs


def build_model(cell = "LSTM",nunits = 64, enc_dec_layers = 2,embed_dim = 128,dropout=0):
    """Assemble a Seq2Seq model whose encoder and decoder share one configuration.

    Args:
        cell: recurrent cell type passed to both halves ("LSTM", "GRU", or anything else for a plain RNN).
        nunits: hidden size of the recurrent layers.
        enc_dec_layers: number of stacked recurrent layers in each half.
        embed_dim: embedding dimension.
        dropout: dropout probability.

    The vocabulary sizes are taken from the module-level token maps.
    """
    shared_config = dict(
        embed_dim=embed_dim,
        hidden_size=nunits,
        num_layers=enc_dec_layers,
        dropout=dropout,
        cell_type=cell,
    )
    # The encoder is constructed before the decoder, as positional arguments are evaluated left to right.
    return Seq2Seq(
        Encoder(input_size=len(eng_token_map), **shared_config),
        Decoder(output_size=len(hin_token_map), **shared_config),
    )


In [None]:


def train(model, dataloader, criterion, optimizer, device, eos_token=None):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: callable as ``model(input_seq, target_seq)`` returning scores of
            shape ``(target_len, batch, vocab)``.
        dataloader: iterable of ``(input_seq, target_seq)`` tensor pairs with
            the sequence on dimension 0 and the batch on dimension 1.
        criterion: loss applied to the flattened scores/targets of positions 1 onward.
        optimizer: optimizer stepped once per batch.
        device: device the batches are moved to.
        eos_token: index of the end-of-sequence token; defaults to
            ``hin_token_map["\\n"]``.

    Returns:
        ``(mean_loss, char_accuracy)``: the loss averaged over batches and the
        fraction of correctly predicted characters. A character is counted at
        every target position after the start token (position 0) and before
        the first end-of-sequence token, for every sequence of every batch.
        Both values are 0.0 when there is nothing to average.
    """
    if eos_token is None:
        eos_token = hin_token_map["\n"]

    model.train()
    total_loss = 0.0
    total_chars = 0
    correct_chars = 0
    num_batches = 0

    for input_seq, target_seq in dataloader:
        input_seq = input_seq.long().to(device)
        target_seq = target_seq.long().to(device)

        optimizer.zero_grad()

        output = model(input_seq, target_seq)

        # reshape for cross-entropy loss (position 0 holds no prediction)
        output_flatten = output[1:].reshape(-1, output.shape[-1])
        trg_flatten = target_seq[1:].reshape(-1)

        loss = criterion(output_flatten, trg_flatten)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Character accuracy over the WHOLE batch. Counting after a per-sample
        # loop only ever scored the last sequence of each batch.
        predicted = output.argmax(dim=2)
        # True from the first EOS of each column onward, so ~ keeps the positions before it.
        reached_eos = (target_seq == eos_token).cumsum(dim=0) > 0
        char_mask = ~reached_eos
        char_mask[0] = False  # the start token is never predicted
        correct_chars += ((predicted == target_seq) & char_mask).sum().item()
        total_chars += char_mask.sum().item()

    mean_loss = total_loss / num_batches if num_batches else 0.0
    char_accuracy = correct_chars / total_chars if total_chars else 0.0
    return mean_loss, char_accuracy

In [None]:

def evaluate(model, dataloader, criterion, device, eos_token=None):
    """Evaluate ``model`` on ``dataloader`` without teacher forcing or gradients.

    Args:
        model: callable as ``model(input_seq, target_seq, 0)`` returning scores
            of shape ``(target_len, batch, vocab)``.
        dataloader: iterable of ``(input_seq, target_seq)`` tensor pairs with
            the sequence on dimension 0 and the batch on dimension 1.
        criterion: loss applied to the flattened scores/targets of positions 1 onward.
        device: device the batches are moved to.
        eos_token: index of the end-of-sequence token; defaults to
            ``hin_token_map["\\n"]``.

    Returns:
        ``(mean_loss, char_accuracy)``: the loss averaged over batches and the
        fraction of correctly predicted characters. A character is counted at
        every target position after the start token (position 0) and before
        the first end-of-sequence token, for every sequence of every batch.
        Both values are 0.0 when there is nothing to average.
    """
    if eos_token is None:
        eos_token = hin_token_map["\n"]

    model.eval()
    total_loss = 0.0
    total_chars = 0
    correct_chars = 0
    num_batches = 0

    with torch.no_grad():
        for input_seq, target_seq in dataloader:
            input_seq = input_seq.long().to(device)
            target_seq = target_seq.long().to(device)

            # teacher forcing ratio 0: the decoder always consumes its own predictions
            output = model(input_seq, target_seq, 0)

            # reshape for cross-entropy loss (position 0 holds no prediction)
            output_flatten = output[1:].reshape(-1, output.shape[-1])
            trg_flatten = target_seq[1:].reshape(-1)

            loss = criterion(output_flatten, trg_flatten)
            total_loss += loss.item()
            num_batches += 1

            # Character accuracy over the WHOLE batch. Counting after a
            # per-sample loop only ever scored the last sequence of each batch.
            predicted = output.argmax(dim=2)
            # True from the first EOS of each column onward, so ~ keeps the positions before it.
            reached_eos = (target_seq == eos_token).cumsum(dim=0) > 0
            char_mask = ~reached_eos
            char_mask[0] = False  # the start token is never predicted
            correct_chars += ((predicted == target_seq) & char_mask).sum().item()
            total_chars += char_mask.sum().item()

    mean_loss = total_loss / num_batches if num_batches else 0.0
    char_accuracy = correct_chars / total_chars if total_chars else 0.0
    return mean_loss, char_accuracy

In [None]:
'''
Best Model Configuration
Hidden unit-512
Embedding dim-256
Cell-LSTM
Lr-0.0001
Batch-64
Enc_dec_layer-2
Dropout-0.27
'''

'\nBest Model Configuration\nHidden unit-512\nEmbedding dim-256\nCell-LSTM\nLr-0.0001\nBatch-64\nEnc_dec_layer-2\nDropout-0.27\n'

In [None]:
# Train the best configuration and keep the weights with the lowest validation loss.
N_EPOCHS = 25
CHECKPOINT_PATH = 'best_model_without_att.pt'
best_valid_loss = float('inf')

model=build_model(cell = "LSTM",nunits = 512, enc_dec_layers = 2,embed_dim = 256,dropout=0.27)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=custom_collate)

val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=custom_collate)
model=model.to(device)
criterion = nn.CrossEntropyLoss(ignore_index=hin_token_map["<PAD>"]) # ignore padding index
optimizer = optim.Adam(model.parameters(), lr=1e-4)
for epoch in range(N_EPOCHS):
    train_loss,acc = train(model=model, dataloader=train_dataloader, optimizer=optimizer, criterion=criterion,device=device)
    valid_loss,ch = evaluate(model, val_dataloader, criterion,device)

    # Checkpoint only when validation loss improves, so the file really holds
    # the best model rather than whatever the last epoch produced.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), CHECKPOINT_PATH)

# Fallback: if no epoch ever improved (e.g. the loss was NaN throughout, or
# N_EPOCHS is 0), save the current weights so the checkpoint file always exists.
if best_valid_loss == float('inf'):
    torch.save(model.state_dict(), CHECKPOINT_PATH)

# Restore the checkpointed weights and recompute the validation metrics, so
# `model`, `valid_loss` and `ch` all describe the same (saved) model.
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))
valid_loss,ch = evaluate(model, val_dataloader, criterion,device)

In [None]:
# Copy the saved checkpoint into the mounted Google Drive folder.
%cp best_model_without_att.pt /content/drive/My\ Drive

In [None]:
# Reload the saved checkpoint and compute loss / character accuracy on the test split.
# map_location lets a checkpoint saved on a GPU be loaded on whatever device is in use now.
model.load_state_dict(torch.load('best_model_without_att.pt', map_location=device))
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=custom_collate)
test_loss,test_acc = evaluate(model, test_dataloader, criterion,device)

In [None]:
####
# For exact string match
####
# Decode every test word greedily (teacher forcing ratio 0) and compare the
# predicted string with the reference string.

# Look the special token ids up instead of hard-coding 0/1/2: in the English
# map index 1 is an ordinary character, and skipping it removed that
# character from every decoded English word.
HIN_SOS_ID = hin_token_map["\t"]
HIN_EOS_ID = hin_token_map["\n"]
ENG_PAD_ID = eng_token_map["<PAD>"]

real_hindi_list=[]
real_eng_list=[]
pred_hindi_list=[]

# A single batch holding the whole test set, so samples can be sliced by column.
test_dataloader2 = DataLoader(test_dataset, batch_size=len(testx), shuffle=False, collate_fn=custom_collate)

test_inputs, test_targets = next(iter(test_dataloader2))
test_inputs = test_inputs.long()
test_targets = test_targets.long()

total_correct=0
model.eval()
with torch.no_grad():
    # Iterate over ALL samples (batch is dimension 1); stopping one short
    # skipped the last word while still dividing by the full count below.
    for i in range(test_inputs.shape[1]):
        outputs = model(test_inputs[:, i:i+1].to(device), test_targets[:, i:i+1].to(device), 0)
        output_idx = outputs[1:].squeeze(1).argmax(1).tolist()

        # Prediction: everything up to (not including) the first end token.
        predicted_word=[]
        for token_id in output_idx:
            if token_id == HIN_EOS_ID:
                break
            predicted_word.append(reverse_hin_map[token_id])

        # Reference: drop the start token, stop at the end token.
        real_word_hindi=[]
        for token_id in test_targets[:, i].tolist():
            if token_id == HIN_EOS_ID:
                break
            if token_id == HIN_SOS_ID:
                continue
            real_word_hindi.append(reverse_hin_map[token_id])

        # Source word: every character up to the first padding position.
        real_word_eng=[]
        for token_id in test_inputs[:, i].tolist():
            if token_id == ENG_PAD_ID:
                break
            real_word_eng.append(reverse_eng_map[token_id])

        predicted_text = "".join(predicted_word)
        real_hindi_text = "".join(real_word_hindi)

        real_hindi_list.append(real_hindi_text)
        real_eng_list.append("".join(real_word_eng))
        pred_hindi_list.append(predicted_text)

        print(predicted_text, real_hindi_text)
        if predicted_text == real_hindi_text:
            total_correct+=1

print(f'Test Loss: {test_loss:.9f}')
# Report the test accuracy returned by evaluate(), not the validation accuracy `ch`.
print(f'Test Acc(charcter-level): {test_acc:.6f}')
print(f'Exact String Match: {total_correct/len(testy)}')

थर्मैक्स थरमैक्स
सिखाएगा सिखाएगा
लीर्न लर्न
ट्विटर्स ट्विटर्स
तिरुनेेवली तिरुनेलवेली
इंडेपेंडेंस इंडिपेंडेंस
स्पेशियों स्पेशियों
शुरूह शुरूः
कोल्हापुर कोल्हापुर
अझर अजहर
करार क़रार
अंका अंक
एपवीडी डब्ल्यूपीडी
हाशी हाशिए
ग्लेंडले ग्लेंडल
उधेद उधेड़
एक्ती इकठ्ठी
आईडीए आईडिया
अम्बिकापुर अम्बिकापुर
मकेरेरे माकेरेरे
सबूदाने साबूदाने
फूहद़ता फूहड़ता
सिक्वेंट सेक्वेंट
शूएब शुऐब
पनिहाती पानीहाटी
समेतति समेटती
उखरुल उखरुल
ब्रह्मलिन ब्रह्मलीन
उतराधिकारी उतराधिकारी
इकबाल इक़बाल
दयाापुरा दयालपुरा
सोहराई सोहराई
तकरीबन तक़रीबन
फर्रुखनगर फर्रूखनगर
थेंगा ठेंगा
त्योइहारों त्यौहारों
कर्णेश्वरध्म कर्णेश्वरधाम
उमनाथ उमानाथ
दांशील दानशील
साहित्योत्स्व साहित्योत्सव
शान्तिनिकेतन शांतिनिकेतन
शिकययकत्रता शिकायतकर्ता
अंदरखाने अंदरखाने
पंटर पंटर
लीडारों लीडरों
गलगंड गलगंड
कारणियाँ कार्नियां
मुर्गीपालन मुर्गीपालन
मुशहहिद मुशाहिद
मोडुल्स मॉड्यूल्स
राजौरी रजौरी
सुश्रुषा सुश्रुषा
श्रृंगार शृंगार
हॉल्ट होल्ट
लाइगिकता लैंगिकता
इजाजत इजाजत
वंक्षेत्र वनक्षेत्र
भुतल भूतल
स्वादप्रेमियों स्वादप्रेमियों
निनतीज़ नाइनटीज
फ्रे

In [None]:
# One row per test word: source spelling, reference transliteration, model output.
pred_vanilla = pd.DataFrame(
    {
        "English": real_eng_list,
        "Real_Hindi": real_hindi_list,
        "Predicted_Hindi": pred_hindi_list,
    }
)

In [None]:
# Write the predictions table to pred_vanilla.csv in the working directory.
pred_vanilla.to_csv("pred_vanilla.csv")

In [None]:
# Show the predictions table as this cell's output.
pred_vanilla

Unnamed: 0,English,Real_Hindi,Predicted_Hindi
0,thermx,थरमैक्स,थर्मैक्स
1,sikheg,सिखाएगा,सिखाएगा
2,lern,लर्न,लीर्न
3,twitters,ट्विटर्स,ट्विटर्स
4,tirunelveli,तिरुनेलवेली,तिरुनेेवली
...,...,...,...
4090,belch,बेलचा,बेलचा
4091,sflt,सफ़लता,सफलता
4092,shbn,शबाना,शबना
4093,khtootol,खातूटोला,खातूतोला


In [None]:

# Exact string match on the validation set: decode every word greedily
# (teacher forcing ratio 0) and compare it with the reference string.

# Look the special token ids up instead of hard-coding 1 and 2.
HIN_SOS_ID = hin_token_map["\t"]
HIN_EOS_ID = hin_token_map["\n"]

# A single batch holding the whole validation set, so samples can be sliced by column.
valid_dataloader2 = DataLoader(val_dataset, batch_size=len(valx), shuffle=False, collate_fn=custom_collate)

val_inputs, val_targets = next(iter(valid_dataloader2))
val_inputs = val_inputs.long()
val_targets = val_targets.long()

total_correct=0
model.eval()
with torch.no_grad():
    # Iterate over ALL samples (batch is dimension 1); stopping one short
    # skipped the last word while still dividing by the full count below.
    for i in range(val_inputs.shape[1]):
        outputs = model(val_inputs[:, i:i+1].to(device), val_targets[:, i:i+1].to(device), 0)
        output_idx = outputs[1:].squeeze(1).argmax(1).tolist()

        # Prediction: everything up to (not including) the first end token.
        predicted_word=[]
        for token_id in output_idx:
            if token_id == HIN_EOS_ID:
                break
            predicted_word.append(reverse_hin_map[token_id])

        # Reference: drop the start token, stop at the end token.
        real_word_hindi=[]
        for token_id in val_targets[:, i].tolist():
            if token_id == HIN_EOS_ID:
                break
            if token_id == HIN_SOS_ID:
                continue
            real_word_hindi.append(reverse_hin_map[token_id])

        predicted_text = "".join(predicted_word)
        real_hindi_text = "".join(real_word_hindi)

        print(predicted_text, real_hindi_text)
        if predicted_text == real_hindi_text:
            total_correct+=1


print(f"Val Loss:{valid_loss}")
print(f"Character Level Accuracy:{ch}")
print(f"Validation(Exact String Match):{total_correct/len(valx)}")

जैसवाल जयसवाल
बजाई बजाई
संघथन संघठन
हैवान हैवान
निलगिरी नीलगिरि
ड्र्टग्रामी द्रुतग्रामी
झड़पों झड़पों
नकरोंडा नकरोंदा
ईएसएल ईईएसएल
बचता बचता
वोड़का वोडका
क्रिस्टियान क्रिस्टन
गुआनजुता गुआनाजुआटा
शेर्षी शेरशी
मननाकी मन्नाकी
जिम्मा जिम्मा
युन यूं
बैग बेग
मारसी मारसी
पुस पूस
चर्मफुल चार्मफुल
श्रीपेरु श्रीपेरू
रुसियो रशियो
आईएचएफएल आईएचएफएल
गणीनाथ गणिनाथ
बंदीकुई बंदीकुई
लसलले लासाले
नमदान नामदान
डिकाइन डिकेन
रेडिज़िन रीडिजाइन
होलोबोन होलोबोन
इलाकों इलाकों
आश्चर्यजनक आश्चर्यजनक
क्रुस क्रूस
ओप्शन आप्शन
महल महल
रिकैपिटलाइजेशन रिकैपिटलाइजेशन
बोर्डो बोर्दो
मनमाना मनमाना
कागजो कागज़ों
पापा पापा
जटबाड़ा जटवाड़ा
स्थिति स्थिति
खांकर्मी खानकर्मी
उपमुख्यमंत्री उपमुख्यमंत्री
रों रोन
फैमिलाइज फेमिलीज
हकलाते हकलाते
नकसलवाादियों नक्सलवादियों
जातिलताओं जटिलताओं
चिक्कामगगुलारू चिक्कमंगलूरू
अंतरकलाह अंतर्कलह
सुर्क्षित सुरक्षित
ब्लूटिक ब्लूटिक
घिसेन घिसें
लेपिजिग लीप्ज़िग
श्रवंजी श्रवणजी
पोध्मुबु पोधुंबू
भद्रोल भद्रोल
मापे मापे
कैदोपुर कैदोपुर
ट्टुनर ट्यूनर
द्रौपदी द्रोपदी
सुपरकारिडोर सुपरकॉरिडोर
सोचती सोचती