In [40]:
import csv
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable 
import copy
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils import clip_grad_norm_
import random

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [42]:
def loadData(train_path, val_path, test_path):
    train_data = csv.reader(open(train_path,encoding='utf8'))
    val_data = csv.reader(open(val_path,encoding='utf8'))
    test_data = csv.reader(open(test_path,encoding='utf8'))
    train_words , train_translations = [], []
    val_words , val_translations = [], []
    test_words , test_translations = [], []
    
    empty, start, end ='', '^', '$'
    for pair in train_data:
        train_words.append(pair[0] + end)
        train_translations.append(start + pair[1] + end)
    for pair in val_data:
        val_words.append(pair[0] + end)
        val_translations.append(start + pair[1] + end)
    for pair in test_data:
        test_words.append(pair[0] + end)
        test_translations.append(start + pair[1] + end)
    
    train_words , train_translations = np.array(train_words), np.array(train_translations)
    val_words , val_translations = np.array(val_words), np.array(val_translations)
    test_words , test_translations = np.array(test_words), np.array(test_translations)

    inout_vocab = set()
    output_vocab = set()
    
    for w in train_words:
        for c in w:
            inout_vocab.add(c)
    for w in val_words:
        for c in w:
            inout_vocab.add(c)
    for w in test_words:
        for c in w:
            inout_vocab.add(c)
            
    for w in train_translations:
        for c in w:
            output_vocab.add(c)
    for w in val_translations:
        for c in w:
            output_vocab.add(c)
    for w in test_translations:
        for c in w:
            output_vocab.add(c)
    
    inout_vocab.remove(end)
    output_vocab.remove(start)
    output_vocab.remove(end)  
    inout_vocab, output_vocab = [empty, start, end] + list(sorted(inout_vocab)), [empty, start, end] + list(sorted(output_vocab))
            
    input_index = dict([(char, idx) for idx, char in enumerate(inout_vocab)])
    output_index =  dict([(char, idx) for idx, char in enumerate(output_vocab)])
    input_index_rev = dict([(idx, char) for char, idx in input_index.items()])
    output_index_rev = dict([(idx, char) for char, idx in output_index.items()])
    
    max_enc_len = max([len(word) for word in np.hstack((train_words, test_words, val_words))])
    max_dec_len = max([len(word) for word in np.hstack((train_translations, val_translations, test_translations))])
    max_len = max(max_enc_len, max_dec_len)
        
    result = {
        'train_words' : train_words,
        'train_translations' : train_translations,
        'val_words' : val_words,
        'val_translations' : val_translations,
        'test_words' : test_words,
        'test_translations' : test_translations,
        'max_enc_len' : max_enc_len,
        'max_dec_len' : max_dec_len,
        'max_len' : max_len,
        'input_index' : input_index,
        'output_index' : output_index,
        'input_index_rev' : input_index_rev,
        'output_index_rev' : output_index_rev
    }
    return result

In [43]:
def create_tensor(result):
    input_data = np.zeros((result['max_len'],len(result['train_words'])), dtype = 'int64')
    output_data = np.zeros((result['max_len'],len(result['train_words'])), dtype = 'int64')
    
    val_input_data = np.zeros((result['max_len'],len(result['val_words'])), dtype = 'int64')
    val_output_data = np.zeros((result['max_len'],len(result['val_words'])), dtype = 'int64')
    
    test_input_data = np.zeros((result['max_len'],len(result['test_words'])), dtype = 'int64')
    test_output_data = np.zeros((result['max_len'],len(result['test_words'])), dtype = 'int64')
    
    for idx, (w, t) in enumerate(zip(result['train_words'], result['train_translations'])):
        for i, char in enumerate(w):
            input_data[i, idx] = result['input_index'][char]
        for i, char in enumerate(t):
            output_data[i, idx] = result['output_index'][char]
        
    for idx, (w, t) in enumerate(zip(result['val_words'], result['val_translations'])):
        for i, char in enumerate(w):
            val_input_data[i, idx] = result['input_index'][char]
        for i, char in enumerate(t):
            val_output_data[i, idx] = result['output_index'][char]
    
    for idx, (w, t) in enumerate(zip(result['test_words'], result['test_translations'])):
        for i, char in enumerate(w):
            test_input_data[i, idx] = result['input_index'][char]
        for i, char in enumerate(t):
            test_output_data[i, idx] = result['output_index'][char]
    
    input_data, output_data = torch.tensor(input_data,dtype = torch.int64), torch.tensor(output_data, dtype = torch.int64)
    val_input_data, val_output_data = torch.tensor(val_input_data,dtype = torch.int64), torch.tensor(val_output_data, dtype = torch.int64)
    test_input_data, test_output_data = torch.tensor(test_input_data,dtype = torch.int64), torch.tensor(test_output_data, dtype = torch.int64)
    
    tensors = {
        'input_data' : input_data,
        'output_data' : output_data,
        'val_input_data' : val_input_data,
        'val_output_data' : val_output_data, 
        'test_input_data' : test_input_data,
        'test_output_data' : test_output_data
    }
    return tensors

In [44]:
language = 'hin'
# dataset_path = r'C:\Users\gragh\OneDrive\Desktop\Codes\CS6910 DL\Assignment 3\DataSet\aksharantar_sampled'
dataset_path = '/kaggle/input/dl-ass3/aksharantar_sampled'

train_path = os.path.join(dataset_path, language, language + '_train.csv')
val_path = os.path.join(dataset_path, language, language + '_valid.csv')
test_path = os.path.join(dataset_path, language, language + '_test.csv')
result = loadData(train_path, val_path, test_path)
tensors = create_tensor(result)

print('Input data : ', result['train_words'])
print('Output data : ', result['train_translations'])
print('Number of samples : ', len(result['train_words']))

print('Input data : ', result['val_words'])
print('Output data : ', result['val_translations'])
print('Number of val samples : ', len(result['val_words']))

print('Input data : ', result['test_words'])
print('Output data : ', result['test_translations'])
print('Number of test samples : ', len(result['test_words']))

print('Max incoder length : ', result['max_enc_len'])
print('Max decoder length : ', result['max_dec_len'])

print('Input index length', len(result['input_index']))
print('Output index length', len(result['output_index']))
print('Input index', result['input_index'])
print('Output index', result['output_index'])
print('Input index Rev', result['input_index_rev'])
print('Output index Rev', result['output_index_rev'])

print('Input Data', tensors['input_data'].shape)
print('Output Data', tensors['output_data'].shape)
print('Input Data Val', tensors['val_input_data'].shape)
print('Output Data Val', tensors['val_output_data'].shape)
print('Input Data Test', tensors['test_input_data'].shape)
print('Output Data Test', tensors['test_output_data'].shape)

# print(tensors['input_data'][:,0])
# print(tensors['output_data'][:,0])

Input data :  ['shastragaar$' 'bindhya$' 'kirankant$' ... 'asahmaton$' 'sulgaayin$'
 'anchuthengu$']
Output data :  ['^शस्त्रागार$' '^बिन्द्या$' '^किरणकांत$' ... '^असहमतों$' '^सुलगायीं$'
 '^अंचुतेंगु$']
Number of samples :  51200
Input data :  ['jaisawal$' 'bajai$' 'sanghthan$' ... 'ekamreshwar$' 'bluetooth$'
 'govindram$']
Output data :  ['^जयसवाल$' '^बजाई$' '^संघठन$' ... '^एकाम्रेश्वर$' '^ब्ल्यूटूथ$'
 '^गोविंद्राम$']
Number of val samples :  4096
Input data :  ['thermax$' 'sikhaaega$' 'learn$' ... 'khaatootolaa$' 'shivastava$'
 'preranapuree$']
Output data :  ['^थरमैक्स$' '^सिखाएगा$' '^लर्न$' ... '^खातूटोला$' '^शिवास्तव$'
 '^प्रेरणापुरी$']
Number of test samples :  4096
Max incoder length :  27
Max decoder length :  22
Input index length 29
Output index length 68
Input index {'': 0, '^': 1, '$': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 2

In [62]:
class Encoder(nn.Module): 
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        drop_par = dropout
        self.dropout = nn.Dropout(drop_par)
        self.num_layers, self.hidden_size, = num_layers, hidden_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)

    def forward(self, x):
        drop_par = self.embedding(x)
        outputs, (hidden, cell) = self.rnn(self.dropout(drop_par))
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout):
        super(Decoder, self).__init__()
        drop_par, hidden_layer_size = dropout, hidden_size
        self.dropout,self.num_layers, self.hidden_size = nn.Dropout(drop_par),  num_layers, hidden_layer_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout= drop_par)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        x  = x.unsqueeze(0) 
        embedding = self.embedding(x)
        outputs, (hidden, cell) = self.rnn(self.dropout(embedding), (hidden, cell))
        predictions = self.fc(outputs).squeeze(0)
        predictions = F.log_softmax(predictions, dim = 1)
        return predictions, hidden, cell
    
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, output_index_len):
        super(Seq2Seq, self).__init__()  
        self.decoder, self.encoder = decoder, encoder
        self.output_index_len = output_index_len

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size, target_len, target_vocab_size = source.shape[1], target.shape[0], self.output_index_len
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        x = target[0]
        hidden, cell = self.encoder(source)

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t], best_guess = output, output.argmax(1)
            x = best_guess if random.random() >= teacher_force_ratio else target[t]
        return outputs

In [63]:
# def evaluate(model, val_loader, criterion):
#     model.eval()
#     total_loss = 0
#     correct_predictions = 0
#     total_predictions = 0
    
#     with torch.no_grad():
#         for source, target in val_loader:
#             source = source.to(device)
#             target = target.to(device)
#             output = model(source, target, 0)
#             output_dim = output.shape[-1]
#             output = output[1:].view(-1, output_dim)
#             target = target[1:].view(-1)
#             loss = criterion(output, target)
#             torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
#             total_loss += loss.item()
#             predicted = output.argmax(dim=1)
#             correct_predictions += (predicted == target).sum().item()
#             total_predictions += target.numel()

#     val_loss = total_loss / len(val_loader)
#     val_accuracy = correct_predictions / total_predictions

#     return val_loss, val_accuracy

# def Train(model, train_loader, val_loader, criterion, optimizer, num_epochs):
#     for epoch in range(num_epochs):
#         model.train()
#         total_loss = 0
#         correct_predictions = 0
#         total_predictions = 0

#         for source, target in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
#             source = source.to(device)
#             target = target.to(device)

#             optimizer.zero_grad()
#             output = model(source, target)
#             output_dim = output.shape[-1]

#             output = output[1:].view(-1, output_dim)
#             target = target[1:].view(-1)

#             loss = criterion(output, target)

#             loss.backward()
#             optimizer.step()

#             total_loss += loss.item()

#             # Calculate accuracy
#             predicted = output.argmax(dim=1)
#             correct_predictions += (predicted == target).sum().item()
#             total_predictions += target.numel()

#         train_loss = total_loss / len(train_loader)
#         train_accuracy = correct_predictions / total_predictions
#         val_loss, val_accuracy = evaluate(model, val_loader, criterion)
#         print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Val Loss: {val_loss:.4f} Val Accuracy: {val_accuracy:.4f}")    

#     return model

In [70]:
language = 'hin'
# dataset_path = r'C:\Users\gragh\OneDrive\Desktop\Codes\CS6910 DL\Assignment 3\DataSet\aksharantar_sampled'
dataset_path = '/kaggle/input/dl-ass3/aksharantar_sampled'

train_path = os.path.join(dataset_path, language, language + '_train.csv')
val_path = os.path.join(dataset_path, language, language + '_valid.csv')
test_path = os.path.join(dataset_path, language, language + '_test.csv')
result = loadData(train_path, val_path, test_path)
tensors = create_tensor(result)

params = {
    "input_size": len(result['input_index']),
    "output_size": len(result['output_index']),
    "embedding_size": 256,
    "hidden_size": 512,
    "num_layers": 2,
    "cell_type": "LSTM",
    "dropout": 0.5,
    "learning_rate": 0.001,
    "batch_size": 64,
    "num_epochs": 10
}

encoder = Encoder(params['input_size'], params['embedding_size'], params['hidden_size'], params['num_layers'], params['dropout']).to(device)
decoder = Decoder(params['output_size'], params['embedding_size'], params['hidden_size'], params['output_size'], params['num_layers'], params['dropout']).to(device)

model = Seq2Seq(encoder, decoder, len(result['output_index'])).to(device)
# print(model)

# Print total number of parameters in the model
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total Trainable Parameters: {total_params}')

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])

train_ds_x, train_ds_y = torch.split(tensors['input_data'], 32, dim = 1), torch.split(tensors['output_data'], 32, dim = 1)
val_ds_x, val_ds_y = torch.split(tensors['val_input_data'], 32, dim=1), torch.split(tensors['val_output_data'], 32, dim=1)

# correct_prediction = 0

for epoch in range(params['num_epochs']):
  total_words = 0
  correct_pred = 0
  total_loss = 0
  model.train()
  with tqdm(total=len(train_ds_x), desc='Training') as pbar:
    for i, (x, y) in enumerate(zip(train_ds_x, train_ds_y)):
      target, inp_data = y.to(device), x.to(device)
      output = model(inp_data, target)
      pad_mask = (target != result['output_index'][''])  # Replace PAD_INDEX with your actual padding character index
      non_pad_targets = target[pad_mask]  # Select non-padding elements
      non_pad_outputs = output[pad_mask].reshape(-1, output.shape[2])  # Select corresponding outputs
      loss = criterion(non_pad_outputs, non_pad_targets)
      optimizer.zero_grad()
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
      optimizer.step()
      total_words += non_pad_targets.size(0)
      correct_pred += torch.sum(torch.argmax(non_pad_outputs, dim=1) == non_pad_targets).item()
      total_loss += loss.item()
      pbar.update(1)

  accuracy = correct_pred / total_words
  avg_loss = total_loss / len(train_ds_x)  # Average loss per batch

  model.eval()
  with torch.no_grad():
    val_total_loss = 0
    val_total_words = 0
    val_correct_pred = 0
    with tqdm(total=len(val_ds_x), desc = 'Validation') as pbar:
      for x_val, y_val in zip(val_ds_x, val_ds_y):
        target_val, inp_data_val = y_val.to(device), x_val.to(device)
        output_val = model(inp_data_val, target_val)
        pad_mask = (target_val != result['output_index'][''])
        non_pad_targets = target_val[pad_mask]
        non_pad_outputs = output_val[pad_mask].reshape(-1, output_val.shape[2])
        val_loss = criterion(non_pad_outputs, non_pad_targets)
        val_total_loss += val_loss.item()
        val_total_words += non_pad_targets.size(0)
        val_correct_pred += torch.sum(torch.argmax(non_pad_outputs, dim=1) == non_pad_targets).item()
        pbar.update(1)

    val_accuracy = val_correct_pred / val_total_words
    val_avg_loss = val_total_loss / len(val_ds_x)
    print(f"Epoch {epoch+1} Train Accuracy: {accuracy*100:.4f}, Train Loss: {avg_loss:.4f} Validation Accuracy: {val_accuracy*100:.4f}, Validation Loss: {val_avg_loss:.4f}")


Total Trainable Parameters: 7416132


Training: 100%|██████████| 1600/1600 [00:58<00:00, 27.54it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 78.38it/s]


Epoch 1 Train Accuracy: 34.3797, Train Loss: 2.4545 Validation Accuracy: 57.7149, Validation  Loss: 1.5050


Training: 100%|██████████| 1600/1600 [00:58<00:00, 27.47it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 78.60it/s]


Epoch 2 Train Accuracy: 58.4598, Train Loss: 1.4433 Validation Accuracy: 65.6898, Validation  Loss: 1.2402


Training: 100%|██████████| 1600/1600 [00:58<00:00, 27.37it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 79.26it/s]


Epoch 3 Train Accuracy: 64.5703, Train Loss: 1.2346 Validation Accuracy: 68.0348, Validation  Loss: 1.1656


Training: 100%|██████████| 1600/1600 [00:58<00:00, 27.52it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 68.93it/s]


Epoch 4 Train Accuracy: 67.4981, Train Loss: 1.1421 Validation Accuracy: 69.2716, Validation  Loss: 1.1195


Training: 100%|██████████| 1600/1600 [00:57<00:00, 27.63it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 77.32it/s]


Epoch 5 Train Accuracy: 69.3933, Train Loss: 1.0812 Validation Accuracy: 70.3799, Validation  Loss: 1.0945


Training: 100%|██████████| 1600/1600 [00:58<00:00, 27.49it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 78.98it/s]


Epoch 6 Train Accuracy: 70.7724, Train Loss: 1.0375 Validation Accuracy: 71.0626, Validation  Loss: 1.0788


Training: 100%|██████████| 1600/1600 [00:58<00:00, 27.55it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 77.47it/s]


Epoch 7 Train Accuracy: 72.0694, Train Loss: 0.9984 Validation Accuracy: 71.2911, Validation  Loss: 1.0781


Training: 100%|██████████| 1600/1600 [00:58<00:00, 27.44it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 75.72it/s]


Epoch 8 Train Accuracy: 72.9594, Train Loss: 0.9691 Validation Accuracy: 71.6795, Validation  Loss: 1.0694


Training: 100%|██████████| 1600/1600 [00:58<00:00, 27.36it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 76.75it/s]


Epoch 9 Train Accuracy: 73.9322, Train Loss: 0.9392 Validation Accuracy: 72.2622, Validation  Loss: 1.0471


Training: 100%|██████████| 1600/1600 [00:58<00:00, 27.55it/s]
Validation: 100%|██████████| 128/128 [00:01<00:00, 76.73it/s]

Epoch 10 Train Accuracy: 74.3781, Train Loss: 0.9246 Validation Accuracy: 72.1480, Validation  Loss: 1.0601





In [71]:
def predict(model, word, input_char_index, output_char_index, reverse_target_char_index):
    data, word_t = np.zeros((len(input_char_index),1), dtype= int), ''
    t_z = 0
    for t, char in enumerate(word):
        data[t, 0] = input_char_index[char]
    t_z = t+1   
    data[t_z :,0] = input_char_index["$"]
    data = torch.tensor(data,dtype = torch.int64).to(device)
    with torch.no_grad():
        hidden, cell = model.encoder(data)
    out_t = output_char_index['^']    
    out_chr_reshape = np.array(out_t).reshape(1,)    
    x = torch.tensor(out_chr_reshape).to(device)

    for t in range(1, len(output_char_index)):
        output, hidden, cell = model.decoder(x, hidden, cell)
        ch = reverse_target_char_index[output.argmax(1).item()]
        if ch != '$':
            word_t = word_t+ch
        else:
            break
    return word_t

In [87]:
words = ['harsh', 'iit', 'madras', 'nirav', 'nidhi', 'nishchal', 'nishant', 'neymar', 'neha', 'raghav', 'rahul', 'rohit', 'hahahahaha', 'ohohohoh']
for w in result['train_words'][:10]:
    output_sequence = predict(model, w[:-1], result['input_index'], result['output_index'], result['output_index_rev'])
    print(w[:-1],"->",output_sequence)
for w in result['val_words'][:10]:
    output_sequence = predict(model, w[:-1], result['input_index'], result['output_index'], result['output_index_rev'])
    print(w[:-1],"->",output_sequence)

shastragaar -> शसस्््ररगग
bindhya -> बिंधध््
kirankant -> किरनकककं्
yagyopaveet -> यजगञयोपपिितत
ratania -> रताानिि
vaganyache -> वागण््येेे
deshbharamadhye -> देशभभररमम्््े
sughadpan -> सुघाडपप
mohiwal -> मोहिववल
sarvasangrah -> सररवससंगग््््
jaisawal -> जैसवालल
bajai -> बजाज
sanghthan -> संंघठन
haiwaan -> हैववव
nilgiri -> निललििररररर
drutgrami -> द्रटटररररररररर
jhadapon -> झडडपोंंं
nakronda -> नककरररंडड
eesl -> ईससएल
bachta -> बचचतत
