In [1]:
!pip install lightning wandb

Collecting lightning
  Downloading lightning-2.5.1.post0-py3-none-any.whl.metadata (39 kB)
Collecting packaging<25.0,>=20.0 (from lightning)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cusolver_cu12-11

In [3]:
import numpy as np
import pandas as pd
import torch
import os
from torch import nn
import sys
from torch.utils.data import Dataset, DataLoader
import wandb
import regex as re
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import wandb
import lightning as pl
from pytorch_lightning import LightningModule
from pytorch_lightning.loggers import WandbLogger
from torch.nn.functional import pad

In [4]:
train_path = "/kaggle/input/dakshina/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv"
valid_path = "/kaggle/input/dakshina/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv"
test_path = "/kaggle/input/dakshina/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.test.tsv"

train_df = pd.read_csv(train_path, sep="\t", header=None, names=["native", "latin", 'n_annot'], encoding='utf-8')
valid_df = pd.read_csv(valid_path, sep="\t", header=None, names=["native", "latin", 'n_annot'], encoding='utf-8')
test_df = pd.read_csv(test_path, sep="\t", header=None, names=["native", "latin", 'n_annot'], encoding='utf-8')

train_df.head()

Unnamed: 0,native,latin,n_annot
0,ஃபியட்,fiat,2
1,ஃபியட்,phiyat,1
2,ஃபியட்,piyat,1
3,ஃபிரான்ஸ்,firaans,1
4,ஃபிரான்ஸ்,france,2


In [5]:
train_df = train_df[~train_df['latin'].isna()]
valid_df = valid_df[~valid_df['latin'].isna()]
test_df = test_df[~test_df['latin'].isna()]

In [6]:
class NativeTokenizer():
    def __init__(self, train_path, valid_path, test_path, special_tokens={'START': '<start>','END':'<end>', 'PAD':'<pad>'}):
        
        self.train_df = pd.read_csv(train_path, sep="\t", header=None, names=["native", "latin", 'n_annot'], encoding='utf-8')
        self.valid_df = pd.read_csv(valid_path, sep="\t", header=None, names=["native", "latin", 'n_annot'], encoding='utf-8')
        self.test_df = pd.read_csv(test_path, sep="\t", header=None, names=["native", "latin", 'n_annot'], encoding='utf-8')
        self.special_tokens = special_tokens
        # Build vocabulary
        self._build_vocab(add_special_tokens=True)
        
        # Id to token mapping
        self.id_to_latin = {i: char for i, char in enumerate(self.latin_vocab)}
        self.id_to_native = {i: char for i, char in enumerate(self.native_vocab)}

        self.latin_vocab_size = len(self.latin_vocab)
        self.nat_vocab_size = len(self.native_vocab)

    # Build vocabulary
    def _build_vocab(self, add_special_tokens=True):
        self.nat_set = set()
        self.latin_set = set()
        for lat, nat in zip(self.train_df['latin'], self.train_df['native']):
            nat_chars = re.findall(r'\X' , nat)
            try:
                lat_chars = list(lat)
            except:
                print(f"Invalid latin string: {lat}, skipping....")
            
            for char in nat_chars:
                self.nat_set.add(char)
            for char in lat_chars:
               self.latin_set.add(char.lower())
            
        self.nat_set = sorted(list(self.nat_set))
        self.latin_set = sorted(list(self.latin_set))
        
        if add_special_tokens:
            self.nat_set = list(self.special_tokens.values()) + self.nat_set
            self.latin_set = [self.special_tokens['PAD']] + self.latin_set   

        self.latin_vocab = {char: i for i, char in enumerate(self.latin_set)}
        self.native_vocab = {char: i for i, char in enumerate(self.nat_set)}

    def tokenize(self, text, lang='latin'):
        if type(text) != str:
            print("Invalid text:", text)
            print("Language must be a string, but got", type(text))
        if lang == 'latin':
            return [self.latin_vocab[char] for char in text]
        elif lang == 'native':
            return [self.native_vocab['<start>']] + [self.native_vocab[char] for char in re.findall('\X', text)] + [self.native_vocab['<end>']]
        else:
            raise ValueError("Language must be either 'latin' or 'native'.")




In [7]:
tokenizer = NativeTokenizer(train_path, valid_path, test_path)
print(f"Latin vocab size: {tokenizer.latin_vocab_size}")
print(f"Native vocab size: {tokenizer.nat_vocab_size}")

Invalid latin string: nan, skipping....
Invalid latin string: nan, skipping....
Invalid latin string: nan, skipping....
Latin vocab size: 27
Native vocab size: 253


In [8]:
class LatNatDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        entry = self.df.iloc[idx]
        latin_word = entry['latin']
        native_word = entry['native']
               
        # Tokenize and convert to IDs
        #latin_ids = [self.tokenizer.latin_vocab[i] for i in latin_word]
        #native_ids = [self.tokenizer.native_vocab[i] for i in re.findall(r'\X' , native_word)]
        latin_ids = self.tokenizer.tokenize(latin_word, lang='latin')
        native_ids = self.tokenizer.tokenize(native_word, lang='native')


        return (torch.tensor(latin_ids),
            torch.tensor(native_ids))

    def collate_fn(self, batch):
        x,y = zip(*batch)
        x_len = [len(seq) for seq in x]
        y_len = [len(seq) for seq in y]

        padded_x = pad_sequence(x, batch_first=True, padding_value=self.tokenizer.latin_vocab['<pad>'])
        padded_y = pad_sequence(y, batch_first=True, padding_value=self.tokenizer.native_vocab['<pad>'])
        
        x_len, perm_idx = torch.tensor(x_len).sort(0, descending=True)
        padded_x = padded_x[perm_idx]

        y_len = torch.tensor(y_len).sort(0, descending=True)
        padded_y = padded_y[perm_idx]

        return padded_x, x_len, padded_y, y_len



In [9]:
train_dataset = LatNatDataset(train_df, tokenizer)
valid_dataset = LatNatDataset(valid_df, tokenizer)
test_dataset = LatNatDataset(test_df, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=train_dataset.collate_fn, num_workers=2)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=valid_dataset.collate_fn , num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=test_dataset.collate_fn, num_workers=2)

In [10]:
class Encoder(torch.nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, cell, num_layers, dropout, activation=None):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size, )
        if cell =='rnn':
            self.rnn = torch.nn.RNN(input_size=embedding_size, hidden_size=hidden_size, batch_first=True, num_layers=num_layers, dropout=dropout, nonlinearity=activation)
        elif cell == 'LSTM':
            self.rnn = torch.nn.GRU(input_size=embedding_size, hidden_size=hidden_size, batch_first=True, num_layers=num_layers,dropout=dropout)
        elif cell == 'GRU':
            self.rnn = torch.nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, batch_first=True, num_layers=num_layers, dropout=dropout)
    
    def forward(self, seq, seq_len):
        embedding = self.embedding(input=seq)
        packed = pack_padded_sequence(input=embedding, lengths=seq_len.cpu(), batch_first=True, enforce_sorted=True)
        output, hidden = self.rnn(packed)
        output, _ = pad_packed_sequence(output, batch_first=True)
        return output, hidden

In [11]:
class Decoder(torch.nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, cell, num_layers, dropout, activation=None):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_size)
        if cell == 'rnn':
            self.rnn = torch.nn.RNN(input_size=embedding_size, hidden_size=hidden_size, batch_first=True, num_layers=num_layers, nonlinearity=activation, dropout=dropout)
        elif cell == 'LSTM':
            self.rnn = torch.nn.GRU(input_size=embedding_size, hidden_size=hidden_size, batch_first=True, num_layers=num_layers, dropout=dropout)
        elif cell == 'GRU':
            self.rnn = torch.nn.GRU(input_size=embedding_size, hidden_size=hidden_size, batch_first=True, num_layers=num_layers, dropout=dropout)
        self.out = torch.nn.Linear(hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=2)  

    def forward(self, input_step, hidden):
        # input_step: (batch_size, 1) [a single timestep]
        embedded = self.embedding(input_step)  # (batch_size, 1, hidden_size)

        rnn_output, hidden = self.rnn(embedded, hidden)  # output: (batch_size, 1, hidden_size)
        output = self.out(rnn_output)  # (batch_size, 1, output_size)
        return output, hidden


In [13]:
class RNN_light(pl.LightningModule):
    def __init__(self, input_sizes, embedding_size, hidden_size, cell, layers, dropout, activation, beam_size, optim, special_tokens, lr):
        super().__init__()
        self.optim = optim
        self.save_hyperparameters()
        self.beam_size = beam_size
        if layers == 1:
            print("Dropout is not applied for 1 layer")
            dropout = 0 
        self.encoder = Encoder(input_sizes[0], embedding_size, hidden_size, cell=cell, num_layers=layers, dropout=dropout, activation=activation)
        self.decoder = Decoder(input_sizes[1], embedding_size, hidden_size, cell=cell, num_layers=layers, dropout=dropout, activation=activation)
        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=special_tokens['<pad>'], reduction='sum')
        self.special_tokens = special_tokens   
        self.beam_size = beam_size 
    def forward(self, input_tensor=[], input_lengths=[], decoder_input=[], decoder_hidden= [], encoder=False):
        if encoder:
            _, decoder_hidden = self.encoder(input_tensor, input_lengths)
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden.contiguous())

        else:
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden.contiguous())
        return decoder_output, decoder_hidden

    
    def training_step(self, batch, batch_idx):
        input_tensor, input_lengths, target_tensor, target_lengths = batch
        decoder_input = target_tensor[:, :-1].detach().clone()
        decoder_target = target_tensor[:, 1:].detach().clone()
        loss = 0
        #_, decoder_hidden = self.encoder(input_tensor, input_lengths)


        for i in range(target_tensor.shape[1]-1):
            if i ==0:
                # first step
                #decoder_output, decoder_hidden = self.decoder(decoder_input[:, i].unsqueeze(1), decoder_hidden)
                decoder_output, decoder_hidden = self(input_tensor = input_tensor, input_lengths=input_lengths, decoder_input = decoder_input[:, i].unsqueeze(1), encoder=True)
   
                loss += self.loss_fn(decoder_output.squeeze(1), decoder_target[:, i])
                preds = decoder_output.argmax(dim=2).cpu().numpy()
            else:
                # rest of the steps
                #decoder_output, decoder_hidden = self.decoder(decoder_input[:, i].unsqueeze(1), decoder_hidden)
                decoder_output, decoder_hidden = self(decoder_input=decoder_input[:, i].unsqueeze(1), decoder_hidden=decoder_hidden)
 
                loss += self.loss_fn(decoder_output.squeeze(1), decoder_target[:, i])
                preds = np.hstack((preds, decoder_output.argmax(dim=2).cpu().numpy()))
        
        # masking pad tokens and end tokens for accuracy calculation
        non_pad = (decoder_target[:, :-1] != self.special_tokens['<pad>']).sum()
        mask = ~torch.isin(decoder_target[:,:-1], torch.tensor(list(self.special_tokens.values()), device = input_tensor.device))

        masked_preds = torch.tensor(preds[:, :-1], device = input_tensor.device).masked_fill(~mask, self.special_tokens['<pad>'])
        masked_targets = decoder_target[:, :-1].masked_fill(~mask, self.special_tokens['<pad>'])
        exact_matches = (masked_preds == masked_targets).all(dim=1)
        accuracy = exact_matches.float().mean()
        #words = "".join([tokenizer.id_to_native[i] for i in preds[0]])
        #truth = "".join([tokenizer.id_to_native[i] for i in decoder_target[0:1, :-1].tolist()[0]])
        #print(f"Pred: {words}, Truth: {truth}")
        #rint(loss/non_pad)
        self.log("train loss", loss/non_pad, on_step = False, on_epoch = True)
        self.log("train accuracy", accuracy, on_step = False, on_epoch = True)

        return loss/non_pad

    def validation_step(self, batch, batch_idx):
        # input_tensor, input_lengths, target_tensor, target_lengths = batch
        # decoder_input = target_tensor[:, :-1].detach().clone()
        # decoder_target_batch = target_tensor[:, 1:].detach().clone()
        # loss = 0
        # _, decoder_hidden_batch = self.encoder(input_tensor, input_lengths)


        # best_seqs = []
        # for i in range(len(input_tensor)):
        #     beams = [(torch.tensor([self.special_tokens['<start>']], device = input_tensor.device), 0.0)]  # Start with start token
        #     decoder_hidden = decoder_hidden_batch[:, i, :].unsqueeze(1)  
        #     decoder_target = decoder_target_batch[i].unsqueeze(0)
        #     for j in range(decoder_target.shape[1]-1):
        #         all_candidates = []
        #         for seq, score in beams:
        #             if seq[-1].item() == self.special_tokens['<end>']:
        #                 all_candidates.append((seq, score))
        #                 continue
                    
                    
        #             input_token = seq[-1].unsqueeze(0).unsqueeze(0)  # shape (1,1)
        #             decoder_output, decoder_hidden = self(decoder_input=input_token, decoder_hidden=decoder_hidden[:,0,:].unsqueeze(1))
        #             loss += self.loss_fn(decoder_output.squeeze(1), decoder_target[0, j].unsqueeze(0))

        #             log_probs = torch.log_softmax(decoder_output, dim=-1).squeeze(0).squeeze(0)

        #             # Get top-k tokens and their log probabilities
        #             topk_log_probs, topk_tokens = torch.topk(log_probs, self.beam_size)

        #             for k in range(self.beam_size):
        #                 next_token = topk_tokens[k].unsqueeze(0)
        #                 new_seq = torch.cat([seq, next_token])
        #                 new_score = score + topk_log_probs[k].item()
        #                 all_candidates.append((new_seq, new_score))

        #         # Select top beam_width sequences
        #         beams = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:self.beam_size]
                
        #         # Optional: break early if all beams ended with end_token
        #         if all(seq[-1].item() == self.special_tokens['<end>'] for seq, _ in beams):
        #             break

        #     # Return best sequence (highest score)
        #     best_seq = beams[0][0]
        #     best_seqs.append(best_seq)
        # non_pad = (decoder_target[:, :-1] != self.special_tokens['<pad>']).sum()
        # max_len = decoder_target.shape[1]      # Convert best_seqs to tensor
        # preds = [pad(seq, (0, max_len - len(seq)), value=self.special_tokens['<pad>']) for seq in best_seqs]
        # preds = torch.stack(preds)
        # #preds = pad_sequence(best_seqs, batch_first=True, padding_value=self.special_tokens['<pad>']).to(input_tensor.device)
 
        # mask = ~torch.isin(decoder_target[:,:-1], torch.tensor(list(self.special_tokens.values()), device = input_tensor.device))
        # masked_preds = preds[:, :-1].masked_fill(~mask, self.special_tokens['<pad>'])
        # masked_targets = decoder_target[:, :-1].masked_fill(~mask, self.special_tokens['<pad>'])
        # exact_matches = (masked_preds == masked_targets).all(dim=1)
        # accuracy = exact_matches.float().mean()
        # words = "".join([tokenizer.id_to_native[i.item()] for i in masked_preds[0]])
        # truth = "".join([tokenizer.id_to_native[i] for i in decoder_target[0:1, :-1].tolist()[0]])
        # print(f"Pred: {words}, Truth: {truth}")
        input_tensor, input_lengths, target_tensor, target_lengths = batch
        decoder_input = target_tensor[:, :-1].detach().clone()
        decoder_target = target_tensor[:, 1:].detach().clone()
        loss = 0
        for i in range(target_tensor.shape[1]-1):
            if i ==0:
                # first step
                #decoder_output, decoder_hidden = self.decoder(decoder_input[:, i].unsqueeze(1), decoder_hidden)
                
                decoder_input = torch.tensor([[self.special_tokens['<start>']]* input_tensor.shape[0]], device=input_tensor.device).reshape(-1, 1)
                #print(decoder_input.shape)
                
                decoder_output, decoder_hidden = self(input_tensor = input_tensor, input_lengths=input_lengths, decoder_input = decoder_input[:,], encoder=True)
        
                loss += self.loss_fn(decoder_output.squeeze(1), decoder_target[:, i])
                preds = decoder_output.argmax(dim=2).cpu().numpy()
                decoder_input =decoder_output.argmax(dim=2)
            else:
                # rest of the steps
                #decoder_output, decoder_hidden = self.decoder(decoder_input[:, i].unsqueeze(1), decoder_hidden)
                decoder_output, decoder_hidden = self(decoder_input=decoder_input[:, ], decoder_hidden=decoder_hidden)
                decoder_input =decoder_output.argmax(dim=2)

                loss += self.loss_fn(decoder_output.squeeze(1), decoder_target[:, i])
                preds = np.hstack((preds, decoder_output.argmax(dim=2).cpu().numpy()))
        
        # masking pad tokens and end tokens for accuracy calculation
        non_pad = (decoder_target[:, :-1] != self.special_tokens['<pad>']).sum()
        mask = ~torch.isin(decoder_target[:,:-1], torch.tensor(list(self.special_tokens.values()), device = input_tensor.device))
        
        masked_preds = torch.tensor(preds[:, :-1], device = input_tensor.device).masked_fill(~mask, self.special_tokens['<pad>'])
        masked_targets = decoder_target[:, :-1].masked_fill(~mask, self.special_tokens['<pad>'])
        exact_matches = (masked_preds == masked_targets).all(dim=1)
        accuracy = exact_matches.float().mean()
        #words = "".join([tokenizer.id_to_native[i] for i in preds[0]])
        #truth = "".join([tokenizer.id_to_native[i] for i in decoder_target[0:1, :-1].tolist()[0]])
        #print(f"Pred: {words}, Truth: {truth}")
        #print(loss/non_pad)

        self.log("val loss", loss/non_pad, on_step = False, on_epoch = True)
        self.log("val accuracy", accuracy, on_step = False, on_epoch = True)

        return loss/non_pad

    def test_step(self, batch, batch_idx):
        # input_tensor, input_lengths, target_tensor, target_lengths = batch
        # decoder_input = target_tensor[:, :-1].detach().clone()
        # decoder_target_batch = target_tensor[:, 1:].detach().clone()
        # loss = 0
        # _, decoder_hidden_batch = self.encoder(input_tensor, input_lengths)
        

        # best_seqs = []
        # for i in range(len(input_tensor)):
        #     beams = [(torch.tensor([self.special_tokens['<start>']], device = input_tensor.device), 0.0)]  # Start with start token
        #     decoder_hidden = decoder_hidden_batch[i].unsqueeze(0)
        #     decoder_target = decoder_target_batch[i].unsqueeze(0)
        #     for j in range(decoder_target.shape[1]-1):
        #         all_candidates = []
        #         for seq, score in beams:
        #             if seq[-1].item() == self.special_tokens['<end>']:
        #                 all_candidates.append((seq, score))
        #                 continue
                    
                    
        #             input_token = seq[-1].unsqueeze(0).unsqueeze(0)  # shape (1,1)
        #             decoder_output, decoder_hidden = self(decoder_input=input_token, decoder_hidden=decoder_hidden[:,0,:].unsqueeze(1))
        #             loss += self.loss_fn(decoder_output.squeeze(1), decoder_target[0, j].unsqueeze(0))

        #             log_probs = torch.log_softmax(decoder_output, dim=-1).squeeze(0).squeeze(0)

        #             # Get top-k tokens and their log probabilities
        #             topk_log_probs, topk_tokens = torch.topk(log_probs, self.beam_size)

        #             for k in range(self.beam_size):
        #                 next_token = topk_tokens[k].unsqueeze(0)
        #                 new_seq = torch.cat([seq, next_token])
        #                 new_score = score + topk_log_probs[k].item()
        #                 all_candidates.append((new_seq, new_score))

        #         # Select top beam_width sequences
        #         beams = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:self.beam_size]

        #         # Optional: break early if all beams ended with end_token
        #         if all(seq[-1].item() == self.special_tokens['<end>'] for seq, _ in beams):
        #             break

        #     # Return best sequence (highest score)
        #     best_seq = beams[0][0]
        #     best_seqs.append(best_seq)
        
        #         # Convert best_seqs to tensor

        # preds = pad_sequence(best_seqs, batch_first=True, padding_value=self.special_tokens['<pad>']).to(input_tensor.device)
        # mask = ~torch.isin(decoder_target_batch[:,:-1], torch.tensor(list(self.special_tokens.values(), device = input_tensor.device)))
        # masked_preds = preds[:, :-1].masked_fill(~mask, -1)
        # masked_targets = decoder_target_batch[:, :-1].masked_fill(~mask, -1)
        # exact_matches = (masked_preds == masked_targets).all(dim=1)
        # accuracy = exact_matches.float().mean()
 
        # self.log("test loss", loss, on_step = False, on_epoch = True)
        # self.log("test accuracy", accuracy, on_step = False, on_epoch = True)

        # return loss
        input_tensor, input_lengths, target_tensor, target_lengths = batch
        decoder_input = target_tensor[:, :-1].detach().clone()
        decoder_target = target_tensor[:, 1:].detach().clone()
        loss = 0
        for i in range(target_tensor.shape[1]-1):
            if i ==0:
                # first step
                #decoder_output, decoder_hidden = self.decoder(decoder_input[:, i].unsqueeze(1), decoder_hidden)
                
                decoder_input = torch.tensor([[self.special_tokens['<start>']]* input_tensor.shape[0]], device=input_tensor.device).reshape(-1, 1)
                
                decoder_output, decoder_hidden = self(input_tensor = input_tensor, input_lengths=input_lengths, decoder_input = decoder_input[:,], encoder=True)
        
                loss += self.loss_fn(decoder_output.squeeze(1), decoder_target[:, i])
                preds = decoder_output.argmax(dim=2).cpu().numpy()
                decoder_input =decoder_output.argmax(dim=2)
            else:
                # rest of the steps
                #decoder_output, decoder_hidden = self.decoder(decoder_input[:, i].unsqueeze(1), decoder_hidden)
                decoder_output, decoder_hidden = self(decoder_input=decoder_input[:, ], decoder_hidden=decoder_hidden)
                decoder_input =decoder_output.argmax(dim=2)

                loss += self.loss_fn(decoder_output.squeeze(1), decoder_target[:, i])
                preds = np.hstack((preds, decoder_output.argmax(dim=2).cpu().numpy()))
        
        # masking pad tokens and end tokens for accuracy calculation
        non_pad = (decoder_target[:, :-1] != self.special_tokens['<pad>']).sum()
        mask = ~torch.isin(decoder_target[:,:-1], torch.tensor(list(self.special_tokens.values()), device = input_tensor.device))
        
        masked_preds = torch.tensor(preds[:, :-1], device = input_tensor.device).masked_fill(~mask, self.special_tokens['<pad>'])
        masked_targets = decoder_target[:, :-1].masked_fill(~mask, self.special_tokens['<pad>'])
        exact_matches = (masked_preds == masked_targets).all(dim=1)
        accuracy = exact_matches.float().mean()
        #words = "".join([tokenizer.id_to_native[i] for i in preds[0]])
        #truth = "".join([tokenizer.id_to_native[i] for i in decoder_target[0:1, :-1].tolist()[0]])
        #print(f"Pred: {words}, Truth: {truth}")
        #print(loss/non_pad)

        self.log("test loss", loss/non_pad, on_step = False, on_epoch = True)
        self.log("test accuracy", accuracy, on_step = False, on_epoch = True)

        return loss/non_pad

    def predict_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        return logits

    def configure_optimizers(self):
        if self.optim == 'sgd':
            optimizer = torch.optim.SGD(self.parameters(), lr=self.hparams.lr, momentum=0.9)
        elif self.optim == 'adam':
            optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        return optimizer
    


In [14]:
INPUT_SIZE = tokenizer.latin_vocab_size
OUTPUT_SIZE = tokenizer.nat_vocab_size
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 256
MAX_TARGET_LEN = 28  # Set this to the maximum length of your target sequences
SOS_token = tokenizer.native_vocab['<start>']
PAD_TOKEN = tokenizer.native_vocab['<pad>']
EOS_token = tokenizer.native_vocab['<end>']

In [None]:
os.environ['WANDB_API_KEY'] = "761e2f0f9986fd2e6ee9f21ef44a2665e0bc8618"
wandb.login(key=os.getenv("WANDB_API_KEY"))
special_tokens = {key: val for key, val in tokenizer.native_vocab.items() if key in ['<start>', '<end>', '<pad>']}

model = RNN_light(
    input_sizes=(tokenizer.latin_vocab_size, tokenizer.nat_vocab_size),
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    cell='LSTM',
    layers=3,
    dropout=0.2,
    activation='tanh',
    beam_size=3,
    optim='adam',
    special_tokens=special_tokens,
    lr=0.001
)
logger= WandbLogger(project= 'dl_rnn', name = "teeee1") #,resume="never")
trainer = pl.Trainer(max_epochs=5,  accelerator="auto",logger=logger, profiler='simple',  precision="16-mixed",)
trainer.fit(model, train_dataloader,  valid_dataloader)
trainer.test(model, dataloaders=test_dataloader)
#trainer.save_checkpoint("trained_model.ckpt")

## Sweeps

In [22]:
sweep_config = {
    'method': 'bayes',
    'metric': {
        'name': 'val_acc',
        'goal': 'maximize',
    },
    "early_terminate": {"type": "hyperband", "min_iter": 3},
    'parameters': {
        'lr': {
            'min': 1e-4,
            'max': 1e-2
        },
        'batch_size': {
            'values': [32, 64]
        },
        'embedding_size': {
            'values': [32, 64, 128, 256, 512]
        },
        'hidden_size': {
            'values': [32, 64, 128, 256, 512]
        },
        'cell': {
            'values': ['rnn', 'LSTM', 'GRU']
            
        },
        'activation': {'values': ['relu', 'tanh']},
        'layers': {'values': [1,2,3,4]},
        'optim': {'values': ['sgd', 'adam']},
        'dropout': {
            'min': 0.0,
            'max': 0.5
        },
        'epochs': {'values': [5, 10]}
    
    }
}


In [23]:
def trainCNN(config=None):
    with wandb.init(config=config) as run:
        config = wandb.config

        run.name = f"cell_{config.cell}_emb_{config.embedding_size}_hidden_{config.hidden_size}_D_{config.dropout:.2f}_layers_{config.layers}"
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        try:
            train_dataset = LatNatDataset(train_df, tokenizer)
            valid_dataset = LatNatDataset(valid_df, tokenizer)
            test_dataset = LatNatDataset(test_df, tokenizer)
            
            train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=train_dataset.collate_fn, num_workers=2)
            valid_dataloader = DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=valid_dataset.collate_fn , num_workers=2)
            test_dataloader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=test_dataset.collate_fn, num_workers=2)
            special_tokens = {key: val for key, val in tokenizer.native_vocab.items() if key in ['<start>', '<end>', '<pad>']}

            model = RNN_light(
                input_sizes=(tokenizer.latin_vocab_size, tokenizer.nat_vocab_size),
                embedding_size=config.embedding_size,
                hidden_size=config.hidden_size,
                cell=config.cell,
                layers=config.layers,
                dropout=config.dropout,
                activation=config.activation,
                beam_size=3,
                optim=config.optim,
                special_tokens=special_tokens,
                lr=config.lr
            )
            logger = WandbLogger(
                project=project_name, name=run.name, experiment=run, log_model=False
            )
            trainer = pl.Trainer(
                devices=1,
                accelerator="auto",
                precision="16-mixed",
                gradient_clip_val=1.0,
                max_epochs=config.epochs,
                logger=logger,
                profiler=None,
            )

            trainer.fit(model, train_dataloader, val_dataloader)
        finally:
            del trainer
            del model
            gc.collect()
            torch.cuda.empty_cache()

In [24]:
project_name = "DLa3_RNN"
sweep_id = wandb.sweep(sweep_config, project=project_name)
wandb.agent(sweep_id, function=trainCNN, count=20)



Create sweep with ID: 7jvv8kd1
Sweep URL: https://wandb.ai/deeplearn24/DLa3_RNN/sweeps/7jvv8kd1


[34m[1mwandb[0m: Agent Starting Run: 3rs7cxt0 with config:
[34m[1mwandb[0m: 	activation: tanh
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell: rnn
[34m[1mwandb[0m: 	dropout: 0.4305944059612543
[34m[1mwandb[0m: 	embedding_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	layers: 1
[34m[1mwandb[0m: 	lr: 0.0012016756221636478
[34m[1mwandb[0m: 	optim: sgd


[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mteeee1[0m at: [34mhttps://wandb.ai/deeplearn24/dl_rnn/runs/qj1stpwz[0m


Exception in thread Thread-10 (_run_job):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "/tmp/ipykernel_35/3447160926.py", line 2, in trainCNN
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_init.py", line 1544, in init
    wandb._sentry.reraise(e)
  File "/usr/local/lib/python3.11/dist-packages/wandb/analytics/sentry.py", line 156, in reraise
    raise exc.with_traceback(sys.exc_info()[2])
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_init.py", line 1530, in init
    return wi.init(run_settings, run_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_init.py", line 799, in init
    wandb.run.finish()
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 387, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File 

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7dd42c3d5010>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

Exception in thread Thread-11 (_run_job):
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
  File "/tmp/ipykernel_35/3447160926.py", line 2, in trainCNN
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_init.py", line 1544, in init
    wandb._sentry.reraise(e)
  File "/usr/local/lib/python3.11/dist-packages/wandb/analytics/sentry.py", line 156, in reraise
    raise exc.with_traceback(sys.exc_info()[2])
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_init.py", line 1530, in init
    return wi.init(run_settings, run_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_init.py", line 799, in init
    wandb.run.finish()
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_run.py", line 387, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File 

In [25]:
wandb.finish()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7dd42c3d5010>> (for pre_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

BrokenPipeError: [Errno 32] Broken pipe

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7dd42c3d5010>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe