# Read the data

In [28]:
!pip install lightning wandb

[0m

In [29]:
# SECURITY: never hardcode the W&B API key in the notebook. Supply it through the
# environment instead (for example a Kaggle secret exported as WANDB_API_KEY).
# The shell line that used to be here only assigned the variable inside a throw-away
# subshell, so it had no effect on the kernel; the key it exposed should be revoked.

In [30]:
import pandas as pd
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import lightning as pl
from pytorch_lightning.loggers import WandbLogger
import random
import wandb

In [31]:
!WANDB_API_KEY=8c780297be240a84f5c8b7d669cb158839b2637a wandb login

[34m[1mwandb[0m: Currently logged in as: [33mcs20b075[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [35]:
# Directory holding the `hin` CSV splits of the sampled Aksharantar dataset on Kaggle,
# followed by the train / validation / test file inside it.
path = "/kaggle/input/aksharantar/aksharantar_sampled/hin"
train_path = f"{path}/hin_train.csv"
valid_path = f"{path}/hin_valid.csv"
test_path = f"{path}/hin_test.csv"

In [36]:
def get_data(path):
    """Read a header-less CSV and return its first two columns.

    Args:
        path: Location of the CSV file; handed to ``pd.read_csv`` with ``header=None``.

    Returns:
        Tuple ``(first_column, second_column)`` of NumPy arrays with one entry per row.
    """
    table = pd.read_csv(path, header=None).values
    return table[:, 0], table[:, 1]

In [37]:
# Each call returns an (input column, output column) pair of arrays read from the CSV;
# these pairs are what AksharantarDataset / CustomDataModule consume further down.
train_dataset = get_data(train_path)
val_dataset = get_data(valid_path)

In [38]:
# Character tables are built once per language and reused: convert_word_to_tensor runs
# for every sample a DataLoader fetches, so rebuilding the dict on each call is wasted work.
_VOCAB_CACHE = {}


def _build_vocab(lang):
    """Return the token -> index table for ``lang`` (special tokens plus characters)."""
    vocab = {'SOS': 0, 'EOS': 1, 'PAD': 2}
    if lang == 'eng':
        # Lowercase ASCII letters a-z map to indices 3..28.
        vocab.update({chr(i): i-94 for i in range(97, 123)})
    elif lang == 'hin':
        # Code points 2304..2431 map to indices 4..131.
        vocab.update({chr(i): i-2300 for i in range(2304, 2432)})
    return vocab


def convert_word_to_tensor(word, lang, max_len=24):
    """Encode ``word`` as a 1-D tensor of vocabulary indices.

    The encoding is ``SOS``, one index per character, ``EOS``, then ``PAD`` up to
    ``max_len`` entries.

    Args:
        word: String to encode, iterated character by character.
        lang: ``'eng'`` or ``'hin'``; any other value yields a table that only contains
            the three special tokens.
        max_len: Minimum length of the result. Shorter encodings are right-padded with
            ``PAD``; longer ones are returned as they are (no truncation).

    Returns:
        ``torch.Tensor`` of integer indices with at least ``max_len`` elements.

    Raises:
        KeyError: If ``word`` contains a character missing from the table of ``lang``.
    """
    vocab = _VOCAB_CACHE.get(lang)
    if vocab is None:
        vocab = _VOCAB_CACHE[lang] = _build_vocab(lang)

    indices = [vocab['SOS']]
    indices.extend(vocab[char] for char in word)
    indices.append(vocab['EOS'])
    # A non-positive repeat count produces an empty list, so long words are left untouched.
    indices.extend([vocab['PAD']] * (max_len - len(indices)))

    return torch.tensor(indices)

In [39]:
class AksharantarDataset(Dataset):
    """Map-style dataset of (source word, target word) pairs.

    Pairs in which either word has 21 or more characters are discarded when the
    dataset is constructed.
    """

    def __init__(self, dataset):
        """Store the word arrays and filter out over-long pairs.

        Args:
            dataset: Indexable ``(inputs, outputs)`` pair of NumPy arrays of words.
        """
        super().__init__()
        self.dataset = dataset
        sources, targets = dataset[0], dataset[1]
        # Boolean masks marking the words that are short enough on each side.
        short_source = np.array([len(word) < 21 for word in sources])
        short_target = np.array([len(word) < 21 for word in targets])
        keep = short_source & short_target
        self.input = sources[keep]
        self.output = targets[keep]
        self.len = len(self.input)

    def __getitem__(self, index):
        """Return the encoded (source, target) tensors for the pair at ``index``."""
        source_word = self.input[index]
        target_word = self.output[index]
        return convert_word_to_tensor(source_word, 'eng'), convert_word_to_tensor(target_word, 'hin')

    def __len__(self):
        """Number of pairs that survived the length filter."""
        return self.len

In [40]:
class CustomDataModule(pl.LightningDataModule):
    """Lightning data module serving the training and validation word pairs.

    Args:
        dataset: ``(inputs, outputs)`` arrays used for training.
        val_dataset: ``(inputs, outputs)`` arrays used for validation.
        batch_size: Number of word pairs per batch for both dataloaders.
    """

    def __init__(self, dataset, val_dataset, batch_size=32):
        super().__init__()
        # Use the constructor argument. The previous code read the module-level
        # ``train_dataset`` here, silently ignoring whatever the caller passed in.
        self.dataset = dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size

    def train_dataloader(self):
        """Build the DataLoader over the (length-filtered) training pairs."""
        dataset = AksharantarDataset(self.dataset)
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=2)

    def val_dataloader(self):
        """Build the DataLoader over the (length-filtered) validation pairs."""
        dataset = AksharantarDataset(self.val_dataset)
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=2)

In [41]:
# NOTE: despite its name, ``train_loader`` is a CustomDataModule (not a DataLoader); it
# exposes both train_dataloader() and val_dataloader(), each with batch size 32.
train_loader = CustomDataModule(train_dataset, val_dataset, 32)
# val_loader = CustomDataModule(val_dataset, 32)

# Encoder model

In [42]:
class Encoder(pl.LightningModule):
    """Recurrent encoder: an embedding followed by a stack of single-layer RNN cells.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        hidden_size: Embedding width and hidden width of every recurrent cell.
        cell_type: ``'LSTM'`` or ``'GRU'``; any other value selects a plain ``nn.RNN``.
        num_layers: Total number of stacked recurrent cells (at least 1).
        dropout: Dropout probability applied to the output of every cell except the last.
        bidirectional: Whether each cell processes the sequence in both directions.
    """

    def __init__(self, input_size, hidden_size, cell_type, num_layers=1, dropout=0, bidirectional=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.cell_type = cell_type
        if cell_type == 'LSTM':
            self.rnn = nn.LSTM
        elif cell_type == 'GRU':
            self.rnn = nn.GRU
        else:
            self.rnn = nn.RNN
        self.direction = 2 if bidirectional else 1
        self.first_cell = self.rnn(hidden_size, hidden_size, bidirectional=bidirectional)
        # Build a *separate* cell per extra layer. ``[cell] * n`` would put the same
        # module object in the list n times, i.e. all extra layers would share weights.
        self.rnns = nn.ModuleList(
            [self.rnn(hidden_size*self.direction, hidden_size, bidirectional=bidirectional)
             for _ in range(num_layers-1)]
        )
        # Previously the ``dropout`` argument was accepted but never used.
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers

    def forward(self, input, hidden):
        """Encode a sequence of token indices.

        Args:
            input: Integer tensor of token indices. The cells are created without
                ``batch_first``, so a 2-D input is read as ``(seq_len, batch)`` and a
                1-D input as a single unbatched sequence.
            hidden: Initial state for the first cell (a ``(h, c)`` tuple for LSTM).

        Returns:
            ``(output, hidden)`` of the last cell. Each cell after the first starts from
            the final state of the cell below it.
        """
        output = self.embedding(input)
        output, hidden = self.first_cell(output, hidden)
        for rnn in self.rnns:
            output, hidden = rnn(self.dropout(output), hidden)
        return output, hidden

    def init_hidden(self):
        """Return a zero initial state for one unbatched sequence, on the module's device.

        Returns:
            A ``(direction, hidden_size)`` tensor, or a pair of such tensors for LSTM.
        """
        if self.cell_type == 'LSTM':
            return (torch.zeros(self.direction, self.hidden_size, device=self.device),
                    torch.zeros(self.direction, self.hidden_size, device=self.device))
        return torch.zeros(self.direction, self.hidden_size, device=self.device)

# Decoder

In [43]:
class Decoder(pl.LightningModule):
    """Recurrent decoder: embedding, ReLU, stacked RNN cells, linear layer, log-softmax.

    Args:
        output_size: Target vocabulary size (embedding rows and number of output classes).
        hidden_size: Embedding width and hidden width of every recurrent cell.
        cell_type: ``'LSTM'`` or ``'GRU'``; any other value selects a plain ``nn.RNN``.
            Note that the ``cell_type`` attribute stores the chosen ``nn`` class itself.
        num_layers: Total number of stacked recurrent cells (at least 1).
        bidirectional: Whether each cell is bidirectional.
        dropout: Dropout probability applied to the output of every cell except the last.
    """

    def __init__(self, output_size, hidden_size, cell_type, num_layers=1, bidirectional=False, dropout=0):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        if cell_type == 'LSTM':
            self.cell_type = nn.LSTM
        elif cell_type == 'GRU':
            self.cell_type = nn.GRU
        else:
            self.cell_type = nn.RNN
        self.first_cell = self.cell_type(hidden_size, hidden_size, bidirectional=bidirectional, batch_first=True)
        self.direction = 2 if bidirectional else 1
        # Build a *separate* cell per extra layer. ``[cell] * n`` would put the same
        # module object in the list n times, i.e. all extra layers would share weights.
        self.rnns = nn.ModuleList(
            [self.cell_type(hidden_size*self.direction, hidden_size, bidirectional=bidirectional, batch_first=True)
             for _ in range(num_layers-1)]
        )
        # Previously the ``dropout`` argument was accepted but never used.
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_size*self.direction, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)
        self.num_layers = num_layers

    def forward(self, input, hidden):
        """Run the decoder on ``input`` starting from ``hidden``.

        Args:
            input: Integer tensor of target token indices; the cells use
                ``batch_first=True``, so a 2-D input is read as ``(batch, seq_len)``.
            hidden: Initial state for the first cell (a ``(h, c)`` tuple for LSTM).

        Returns:
            ``(log_probs, hidden)``: log-softmax scores over the target vocabulary and
            the final state of the last cell. If the leading dimension of the scores
            is 1 it is squeezed away, so a batch of one loses its batch dimension.
        """
        output = self.embedding(input)
        output = nn.functional.relu(output)
        output, hidden = self.first_cell(output, hidden)
        for rnn in self.rnns:
            output, hidden = rnn(self.dropout(output), hidden)
        # Project once (the projection used to be computed twice, one result discarded).
        output = self.softmax(self.out(output))
        if output.shape[0] == 1:
            output = output.squeeze(0)
        return output, hidden

# Seq2seq model

In [57]:
class Seq2seq(pl.LightningModule):
    """Encoder-decoder transliteration model.

    The encoder's final state initialises the decoder. Training randomly alternates,
    per batch, between teacher forcing (feeding the ground-truth previous token) and
    free running (feeding the decoder's own greedy prediction); validation always
    runs free.

    Args:
        encoder: Module exposing ``hidden_size``, ``direction`` and ``cell_type``
            attributes and a ``forward(input, hidden)`` returning ``(output, state)``.
        decoder: Module exposing ``direction`` and a ``forward(input, hidden)``
            returning ``(log_probs, state)``.
        teacher_forcing_ratio: Probability that a training batch uses teacher forcing.
    """

    def __init__(self, encoder, decoder, teacher_forcing_ratio=0.5):
        super().__init__()
        # Registered sub-modules follow this module's device automatically, so no
        # explicit ``.to(self.device)`` calls are needed here or in the step methods.
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def _encode(self, input):
        """Encode a whole batch at once and return the flattened final encoder state.

        Args:
            input: ``(batch, seq_len)`` tensor of source token indices.

        Returns:
            ``(hidden, cell)`` where each is ``(batch, direction * hidden_size)``;
            ``cell`` is ``None`` unless the encoder is an LSTM.
        """
        batch_size = input.shape[0]
        zeros = torch.zeros(self.encoder.direction, batch_size, self.encoder.hidden_size, device=self.device)
        is_lstm = self.encoder.cell_type == 'LSTM'
        initial_state = (zeros, zeros.clone()) if is_lstm else zeros
        # The encoder cells are sequence-first, hence the transpose to (seq_len, batch).
        _, state = self.encoder(input.transpose(0, 1), initial_state)

        def flatten(tensor):
            # (direction, batch, hidden) -> (batch, direction * hidden)
            return tensor.transpose(0, 1).reshape(batch_size, -1)

        if is_lstm:
            return flatten(state[0]), flatten(state[1])
        return flatten(state), None

    def _initial_decoder_state(self, hidden, cell, batch_size):
        """Reshape the flattened encoder state into the decoder's initial hidden state."""
        def unflatten(tensor):
            # (batch, direction * hidden) -> (direction, batch, hidden)
            return tensor.reshape(batch_size, self.decoder.direction, -1).transpose(0, 1).contiguous()

        if cell is not None:
            return unflatten(hidden), unflatten(cell)
        return unflatten(hidden)

    def _run_decoder(self, batch, teacher_forcing):
        """Decode the targets of ``batch`` and accumulate loss and word-level accuracy.

        Args:
            batch: ``(input, target)`` tensors of token indices, batched or unbatched.
            teacher_forcing: Feed the ground-truth token (True) or the decoder's own
                greedy prediction (False) as the next decoder input.

        Returns:
            ``(loss, correct_words, batch_size, target_length)`` where ``loss`` is the
            NLL summed over every sample and time step, and ``correct_words`` counts the
            samples whose prediction matched the target at every position (padding
            positions included).
        """
        input, target = batch
        if input.dim() <= 1:
            input = input.unsqueeze(0)
            target = target.unsqueeze(0)
        input = input.to(self.device)
        target = target.to(self.device)
        batch_size = input.shape[0]
        target_length = target.shape[1]

        hidden, cell = self._encode(input)
        decoder_hidden = self._initial_decoder_state(hidden, cell, batch_size)
        decoder_input = target[:, 0].unsqueeze(1)

        loss = 0
        correct = torch.ones(batch_size, dtype=torch.bool, device=self.device)
        for j in range(target_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            # Normalise to (batch, vocab) whether or not the decoder squeezed a
            # singleton dimension away.
            log_probs = decoder_output.reshape(batch_size, -1)
            step_target = target[:, j]
            # Summed NLL over the batch; same value as adding per-sample losses one by one.
            loss = loss + nn.functional.nll_loss(log_probs, step_target, reduction='sum')
            # Both operands are (batch,), so this is an element-wise comparison. Comparing
            # a (batch, 1) prediction to a (batch,) target would broadcast to (batch, batch).
            predicted = log_probs.argmax(dim=-1)
            correct &= predicted == step_target
            decoder_input = (step_target if teacher_forcing else predicted).unsqueeze(1)
        correct_words = correct.sum()
        return loss, correct_words, batch_size, target_length

    def forward(self, input):
        """Greedily transliterate ``input``.

        Args:
            input: ``(batch, seq_len)`` tensor of source token indices, or a 1-D tensor
                for a single word.

        Returns:
            Integer tensor of predicted target indices, ``(batch, seq_len + 1)`` (or
            ``(seq_len + 1,)`` for a single word). Position 0 is always the SOS index 0
            and one token is decoded per input position.
        """
        batched = input.dim() > 1
        if not batched:
            input = input.unsqueeze(0)
        batch_size = input.shape[0]
        input_length = input.shape[1]

        hidden, cell = self._encode(input)
        decoder_hidden = self._initial_decoder_state(hidden, cell, batch_size)

        # Every sequence starts with SOS (index 0). One column is collected per step;
        # the old ``[[]] * batch_size`` shared a single list between all rows.
        decoder_input = torch.zeros(batch_size, 1, dtype=torch.long, device=self.device)
        steps = [decoder_input]
        for _ in range(input_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
            decoder_input = decoder_output.reshape(batch_size, -1).argmax(dim=-1, keepdim=True)
            steps.append(decoder_input)
        output_sequence = torch.cat(steps, dim=1)
        if not batched:
            output_sequence = output_sequence.squeeze(0)
        return output_sequence

    def training_step(self, batch, batch_idx):
        """Compute the training loss; logs ``train_loss`` (per token) and ``train_acc``.

        Returns:
            The summed (unnormalised) NLL loss used for back-propagation.
        """
        teacher_forcing = random.random() < self.teacher_forcing_ratio
        loss, correct_words, batch_size, target_length = self._run_decoder(batch, teacher_forcing)
        reported_loss = loss / (batch_size * target_length)
        self.log('train_loss', reported_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_acc', correct_words/batch_size, on_step=True, on_epoch=True, logger=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the free-running validation loss; logs ``val_loss`` and ``val_acc``.

        Returns:
            The summed (unnormalised) NLL loss of the batch.
        """
        loss, correct_words, batch_size, target_length = self._run_decoder(batch, teacher_forcing=False)
        reported_loss = loss / (batch_size * target_length)
        self.log('val_loss', reported_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log('val_acc', correct_words/batch_size, on_step=True, on_epoch=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 0.001."""
        return torch.optim.Adam(self.parameters(), lr=0.001)

In [58]:
# Baseline model: 2-layer bidirectional LSTM encoder and decoder with hidden size 100.
# The first argument of each is the embedding-table size (30 source / 150 target rows);
# it has to exceed the largest index convert_word_to_tensor can emit, which is 28 for
# 'eng' and 131 for 'hin'.
encoder = Encoder(30, 100, cell_type="LSTM", num_layers=2, bidirectional=True, dropout=0.1)
decoder = Decoder(150, 100, cell_type="LSTM", num_layers=2, bidirectional=True)
model = Seq2seq(encoder, decoder)

In [None]:
# Train the baseline for 10 epochs on 2 GPUs. ``train_loader`` is a CustomDataModule, so
# this single object provides both the training and the validation dataloaders.
trainer = pl.Trainer(accelerator='gpu', devices=2, max_epochs=10)
trainer.fit(model, train_loader)

INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: 
  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 406 K 
1 | decoder | Decoder | 448 K 
------------------------------------
854 K     Trainable params
0         Non-trainable params
854 K     Total params
3.418     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  "num_workers>0, persistent_workers=False, and strategy=ddp_spawn"


Training: 0it [00:00, ?it/s]



In [None]:
def convert_tensor_to_word(tensor, lang):
    """Decode a sequence of vocabulary indices back into text.

    Args:
        tensor: Iterable of scalar tensors (e.g. a 1-D index tensor).
        lang: ``'eng'`` or ``'hin'``; any other value only knows the special tokens.

    Returns:
        The concatenated string. Special indices 0, 1 and 2 are rendered literally as
        ``'SOS'``, ``'EOS'`` and ``'PAD'``.

    Raises:
        KeyError: If an index has no entry in the table of ``lang``.
    """
    symbols = {0: 'SOS', 1: 'EOS', 2: 'PAD'}
    if lang == 'eng':
        symbols.update((code - 94, chr(code)) for code in range(97, 123))
    elif lang == 'hin':
        symbols.update((code - 2300, chr(code)) for code in range(2304, 2432))

    return ''.join(symbols[index.item()] for index in tensor)

In [None]:
# Bayesian hyperparameter sweep over model size, depth, directionality and cell type.
sweep_config = {
    'method': 'bayes',
    'metric': {
        # validation_step logs 'val_loss' with on_step=True and on_epoch=True, so the
        # logger receives 'val_loss_step' and 'val_loss_epoch' rather than 'val_loss'.
        # The sweep must optimise a key that actually reaches W&B.
        'name': 'val_loss_epoch',
        'goal': 'minimize'
    },
    'parameters': {
        'hidden_size': {
            'values': [16, 32, 64],
        },
        'encoder_num_layers': {
            'values': [1, 2, 3],
        },
        'decoder_num_layers': {
            'values': [1, 2, 3],
        },
        'bidirectional': {
            'values': [True, False],
        },
        'cell_type': {
            'values': ['LSTM', 'GRU'],
        },
    }
}

In [None]:
def sweep_fn():
    """Train one model with the hyperparameters selected by the active W&B sweep.

    Reads ``hidden_size``, ``cell_type``, ``encoder_num_layers``, ``decoder_num_layers``
    and ``bidirectional`` from ``wandb.config``, builds the encoder/decoder pair and fits
    it for 5 epochs on the module-level ``train_loader``.
    """
    # Take the logger from the same distribution as ``pl`` (``lightning``); classes from
    # the separate ``pytorch_lightning`` package should not be mixed with a
    # ``lightning`` Trainer.
    from lightning.pytorch.loggers import WandbLogger

    wandb.init()
    config = wandb.config
    encoder = Encoder(30, config.hidden_size, config.cell_type, num_layers=config.encoder_num_layers, bidirectional=config.bidirectional)
    decoder = Decoder(150, config.hidden_size, config.cell_type, num_layers=config.decoder_num_layers, bidirectional=config.bidirectional)
    model = Seq2seq(encoder, decoder)
    logger = WandbLogger(project='CS6910 Assignment 3', entity='cs20b075')
    trainer = pl.Trainer(accelerator='gpu', devices=2, max_epochs=5, precision=16, logger=logger)
    trainer.fit(model, train_loader)

In [None]:
# Authenticate without embedding the secret in the notebook: called without ``key``,
# wandb.login() picks up the WANDB_API_KEY environment variable or previously stored
# credentials, and otherwise asks for the key interactively.
wandb.login()

In [None]:
# Register the sweep defined by ``sweep_config`` in the "CS6910 Assignment 3" project,
# then run an agent in this process that calls ``sweep_fn`` with ``count=10``.
sweep_id = wandb.sweep(sweep=sweep_config, project="CS6910 Assignment 3")
wandb.agent(sweep_id=sweep_id, function=sweep_fn, count=10)

In [None]:
# Mark the current W&B run as finished once the sweep agent has returned.
wandb.finish()