# Read the data

In [2]:
!pip install lightning wandb

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import lightning as pl
from pytorch_lightning.loggers import WandbLogger
import random

In [4]:
path = "aksharantar_sampled/hin"
train_path = path + "/hin_train.csv"
valid_path = path + "/hin_valid.csv"
test_path = path + "/hin_test.csv"

In [5]:
def get_data(path):
    dataset = pd.read_csv(path, header=None)
    dataset = dataset.values
    input = dataset[:, 0]
    output = dataset[:, 1]
    return input, output

In [6]:
train_dataset = get_data(train_path)

In [7]:
def convert_word_to_tensor(word, lang):
    lang_to_int = {'SOS': 0, 'EOS': 1, 'PAD': 2}
    if lang == 'eng':
        lang_to_int.update({chr(i): i-94 for i in range(97, 123)})
    elif lang == 'hin':
        lang_to_int.update({chr(i): i-2300 for i in range(2304, 2432)})
    
    a = [lang_to_int['SOS']]

    for i in word:
        a.append(lang_to_int[i])

    a.append(lang_to_int['EOS'])
    if len(a) < 12:
        a.extend([lang_to_int['PAD']]*(12-len(a)))
    
    return torch.tensor(a)

In [8]:
class AksharantarDataset(Dataset):
    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        self.input = dataset[0]
        self.output = dataset[1]
        mask = np.array([len(elem) < 10 for elem in self.input]) & np.array([len(elem) < 10 for elem in self.output])
        self.input = self.input[mask]
        self.output = self.output[mask]
        self.len = len(self.input)
    
    def __getitem__(self, index):
        return convert_word_to_tensor(self.input[index], 'eng'), convert_word_to_tensor(self.output[index], 'hin')
    
    def __len__(self):
        return self.len

In [9]:
class CustomDataModule(pl.LightningDataModule):
    def __init__(self, dataset, batch_size=32):
        super().__init__()
        self.dataset = train_dataset
        self.batch_size = batch_size

    def train_dataloader(self):
        dataset = AksharantarDataset(self.dataset)
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=2)

In [10]:
train_loader = CustomDataModule(train_dataset, 32)

# Encoder model

In [28]:
class Encoder(pl.LightningModule):
    def __init__(self, input_size, hidden_size, cell_type, num_layers=1, dropout=0, bidirectional=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        
        if cell_type == 'LSTM':
            self.rnn = nn.LSTM
        elif cell_type == 'GRU':
            self.rnn = nn.GRU
        else:
            self.rnn = nn.RNN
        self.direction = 2 if bidirectional else 1
        self.first_cell = self.rnn(hidden_size, hidden_size, dropout=dropout, bidirectional=bidirectional)
        self.rnns = [self.rnn(hidden_size*self.direction, hidden_size, dropout=dropout, bidirectional=bidirectional)]*(num_layers-1)
        self.num_layers = num_layers

    def forward(self, input, hidden):
        embedded = self.embedding(input)
        # embedded = embedded.view(1, 1, -1)
        output = embedded
        output, hidden = self.first_cell(output, hidden)
        for i in range(self.num_layers-1):
            output, hidden = self.rnns[i](output, hidden)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(self.direction, self.hidden_size, device=self.device)

# Decoder

In [29]:
class Decoder(pl.LightningModule):
    def __init__(self, output_size, hidden_size, cell_type, num_layers=1, bidirectional=False, dropout=0):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        if cell_type == 'LSTM':
            self.cell_type = nn.LSTM
        elif cell_type == 'GRU':
            self.cell_type = nn.GRU
        else:
            self.cell_type = nn.RNN
        self.first_cell = self.cell_type(hidden_size, hidden_size, bidirectional=bidirectional, dropout=dropout)
        self.direction = 2 if bidirectional else 1
        self.rnns= [self.cell_type(hidden_size*self.direction, hidden_size, bidirectional=bidirectional, dropout=dropout)]*(num_layers-1)
        self.out = nn.Linear(hidden_size*self.direction, output_size)
        self.softmax = nn.LogSoftmax(dim=0)
        self.num_layers = num_layers

    def forward(self, input, hidden):
        output = self.embedding(input)
        output = nn.functional.relu(output)
        output, hidden = self.first_cell(output, hidden)
        for i in range(self.num_layers-1):
            output, hidden = self.rnns[i](output, hidden)
        linear_output = self.out(output[0])
        output = self.softmax(self.out(output[0]))
        return output, hidden

# Seq2seq model

In [30]:
class Seq2seq(pl.LightningModule):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input):
        batched = True if len(input.shape) > 1 else False
        if not batched:
            input = input.unsqueeze(0)
        batch_size = input.shape[0]
        input_length = input.shape[1]

        encoder_hidden = self.encoder.init_hidden()
        encoder_hidden_outputs = torch.zeros(batch_size, self.encoder.hidden_size*self.encoder.direction, device=self.device)
        for i in range(batch_size):
            _, encoder_hidden = self.encoder(input[i], encoder_hidden)
            encoder_hidden_outputs[i] = encoder_hidden.flatten()
        output_sequence = [[]]*batch_size
        for i in range(batch_size):
            decoder_hidden = encoder_hidden_outputs[i].view(self.decoder.direction, -1)
            decoder_input = torch.tensor([0], device=self.device)
            output_sequence[i].append(decoder_input)
            for j in range(input_length):
                decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                decoder_input = torch.tensor([decoder_output.argmax().item()]).to(self.device)
                output_sequence[i].append(decoder_input)
        output_sequence = torch.tensor(output_sequence, device=self.device)
        if not batched:
            output_sequence = output_sequence.squeeze(0)
        return output_sequence
        
    def training_step(self, batch, batch_idx):
        input, target = batch
        
        batched = True if len(input.shape) > 1 else False
        if not batched:
            input = input.unsqueeze(0)
            target = target.unsqueeze(0)
        batch_size = input.shape[0]
        input_length = input.shape[1]
        target_length = target.shape[1]

        encoder_hidden = self.encoder.init_hidden()
        encoder_hidden_outputs = torch.zeros(batch_size, self.encoder.hidden_size*self.encoder.direction, device=self.device)
        for i in range(batch_size):
            _, encoder_hidden = self.encoder(input[i], encoder_hidden)
            encoder_hidden_outputs[i] = encoder_hidden.flatten()
        loss = 0
        if random.random() < 0.5: 
            for i in range(batch_size):
                decoder_hidden = encoder_hidden_outputs[i].view(self.decoder.direction, -1)
                for j in range(target_length):
                    decoder_output, decoder_hidden = self.decoder(target[i, j].unsqueeze(0), decoder_hidden)
                    loss += nn.functional.nll_loss(decoder_output, target[i, j])
        else:
            for i in range(batch_size):
                decoder_hidden = encoder_hidden_outputs[i].view(self.decoder.direction, -1)
                decoder_input = target[i, 0].unsqueeze(0)
                for j in range(target_length):
                    decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
                    loss += nn.functional.nll_loss(decoder_output, target[i, j])
                    decoder_input = torch.tensor([decoder_output.argmax().item()]).to(self.device)
        reported_loss = loss / (batch_size * target_length)
        self.log('train_loss', reported_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

In [36]:
encoder = Encoder(30, 100, cell_type="GRU", num_layers=2, bidirectional=True, dropout=0.1)
decoder = Decoder(150, 100, cell_type="GRU", num_layers=2, bidirectional=True)
model = Seq2seq(encoder, decoder)



In [37]:
trainer = pl.Trainer(max_epochs=10)
trainer.fit(model, train_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 124 K 
1 | decoder | Decoder | 166 K 
------------------------------------
290 K     Trainable params
0         Non-trainable params
290 K     Total params
1.162     Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:   0%|          | 5/1017 [00:05<19:07,  1.13s/it, v_num=120, train_loss_step=4.630]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
torch.save(model.state_dict(), 'model.ckpt')

In [34]:
def convert_tensor_to_word(tensor, lang):
    int_to_lang = {0: 'SOS', 1: 'EOS', 2: 'PAD'}
    if lang == 'eng':
        int_to_lang.update({i-94: chr(i) for i in range(97, 123)})
    elif lang == 'hin':
        int_to_lang.update({i-2300: chr(i) for i in range(2304, 2432)})
    
    word = ''
    for i in tensor:
        word += int_to_lang[i.item()]
    return word

In [35]:
convert_tensor_to_word(model(convert_word_to_tensor('ghar', 'eng')), 'hin')

'SOSSOSPADPADPADPADPADPADPADPADPADPADPAD'

In [None]:
sweep_config = {
    'method': 'bayes',
    'metric': {
        'name': 'val_loss',
        'goal': 'minimize'
    },
    'parameters': {
        'hidden_size': {
            'values': [16, 32, 64],
        },
        'encoder_num_layers': {
            'values': [1, 2, 3],
        },
        'decoder_num_layers': {
            'values': [1, 2, 3],
        },
        'dropout': {
            'values': [0.1, 0.2, 0.3],
        },
        'bidirectional': {
            'values': [True, False],
        },
        'cell_type': {
            'values': ['LSTM', 'GRU'],
        },
    }
}

In [None]:
def sweep_fn():
    