In [1]:
import torch
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


# Packages

In [2]:
import os
import torch
import pandas as pd
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


# Data Preprocessing

In [3]:
class TransliterationDataset(Dataset):
    """Transliteration word pairs encoded as character-index tensors.

    The file is a headerless TSV whose columns are named, in order,
    ``devanagari``, ``latin`` and (optionally) ``frequency``. Each item wraps
    the character indices of a word in ``<SOS>`` ... ``<EOS>``.
    """

    def __init__(self, file_path, latin_vocab=None, devanagari_vocab=None, max_len=50):
        """Load the lexicon, filter it by length and build/attach vocabularies.

        Args:
            file_path: Path to the tab-separated lexicon (2 or 3 columns).
            latin_vocab: Optional char -> index mapping to reuse; built from
                this file when ``None``.
            devanagari_vocab: Optional char -> index mapping to reuse; built
                from this file when ``None``.
            max_len: Maximum number of characters allowed in either word.

        Raises:
            ValueError: If the file does not have 2 or 3 columns.
        """
        # keep_default_na=False: otherwise pandas parses real words such as
        # "nan" or "null" as missing values and they are silently filtered out.
        self.data = pd.read_csv(file_path, sep='\t', header=None, keep_default_na=False)

        n_columns = self.data.shape[1]
        if n_columns == 2:
            self.data.columns = ['devanagari', 'latin']
        elif n_columns == 3:
            self.data.columns = ['devanagari', 'latin', 'frequency']
        else:
            # Fail early with context instead of a KeyError on 'latin' later.
            raise ValueError(
                f"Unexpected number of columns ({n_columns}) in {file_path!r}; "
                f"expected 2 or 3. First row: {self.data.iloc[0].tolist()}"
            )

        # Guarantee string cells so the .str accessor and character iteration work.
        for column in ('devanagari', 'latin'):
            self.data[column] = self.data[column].astype(str)

        # Keep rows where both words are non-empty and at most max_len characters.
        latin_ok = self.data['latin'].str.len().between(1, max_len)
        devanagari_ok = self.data['devanagari'].str.len().between(1, max_len)
        self.data = self.data[latin_ok & devanagari_ok].reset_index(drop=True)

        # Special tokens occupy the first indices of every vocabulary.
        self.PAD_TOKEN = '<PAD>'
        self.SOS_TOKEN = '<SOS>'
        self.EOS_TOKEN = '<EOS>'

        self.PAD_IDX = 0
        self.SOS_IDX = 1
        self.EOS_IDX = 2

        # Build vocabularies from this file unless the caller supplied them.
        if latin_vocab is None:
            self.latin_vocab = self._vocab_with_specials(self.data['latin'])
        else:
            self.latin_vocab = latin_vocab

        if devanagari_vocab is None:
            self.devanagari_vocab = self._vocab_with_specials(self.data['devanagari'])
        else:
            self.devanagari_vocab = devanagari_vocab

        # Reverse mappings (index -> character) for decoding predictions.
        self.latin_idx2char = {idx: char for char, idx in self.latin_vocab.items()}
        self.devanagari_idx2char = {idx: char for char, idx in self.devanagari_vocab.items()}

        print(f"Latin vocabulary size: {len(self.latin_vocab)}")
        print(f"Devanagari vocabulary size: {len(self.devanagari_vocab)}")
        print(f"Special tokens: PAD={self.PAD_TOKEN} (idx={self.PAD_IDX}), "
              f"SOS={self.SOS_TOKEN} (idx={self.SOS_IDX}), "
              f"EOS={self.EOS_TOKEN} (idx={self.EOS_IDX})")

    def _create_vocab(self, series):
        """Map every distinct character in ``series`` to a 0-based index (sorted order)."""
        chars = set()
        for s in series:
            chars.update(s)
        return {char: idx for idx, char in enumerate(sorted(chars))}

    def _vocab_with_specials(self, series):
        """Build a vocabulary whose character indices start after the special tokens."""
        vocab = {
            self.PAD_TOKEN: self.PAD_IDX,
            self.SOS_TOKEN: self.SOS_IDX,
            self.EOS_TOKEN: self.EOS_IDX,
        }
        offset = len(vocab)
        for char, idx in self._create_vocab(series).items():
            vocab[char] = idx + offset
        return vocab

    def _encode(self, word, vocab):
        """Return ``[SOS] + indices(word) + [EOS]``.

        Raises:
            KeyError: If ``word`` contains a character missing from ``vocab``.
        """
        try:
            body = [vocab[char] for char in word]
        except KeyError as err:
            raise KeyError(
                f"Character {err.args[0]!r} in word {word!r} is not in the vocabulary"
            ) from err
        return [self.SOS_IDX] + body + [self.EOS_IDX]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return tensors, lengths (including SOS/EOS) and raw text for row ``idx``."""
        # Index the column first: avoids building a full row Series per item.
        latin_word = self.data['latin'].iloc[idx]
        devanagari_word = self.data['devanagari'].iloc[idx]

        latin_indices = self._encode(latin_word, self.latin_vocab)
        devanagari_indices = self._encode(devanagari_word, self.devanagari_vocab)

        return {
            'latin': torch.tensor(latin_indices, dtype=torch.long),
            'devanagari': torch.tensor(devanagari_indices, dtype=torch.long),
            'latin_len': len(latin_indices),
            'devanagari_len': len(devanagari_indices),
            'latin_text': latin_word,
            'devanagari_text': devanagari_word
        }

# Data Loading

In [5]:
def collate_fn(batch):
    """Merge dataset items into one batch, right-padding index tensors with 0.

    Args:
        batch: List of item dicts with the keys 'latin', 'devanagari',
            'latin_len', 'devanagari_len', 'latin_text', 'devanagari_text'.

    Returns:
        Dict with the same keys: padded ``[batch, max_len]`` tensors, length
        tensors, and lists of the raw strings.
    """
    def gather(key):
        # Pull one field out of every item, preserving batch order.
        return [sample[key] for sample in batch]

    return {
        'latin': pad_sequence(gather('latin'), batch_first=True, padding_value=0),
        'devanagari': pad_sequence(gather('devanagari'), batch_first=True, padding_value=0),
        'latin_len': torch.tensor(gather('latin_len')),
        'devanagari_len': torch.tensor(gather('devanagari_len')),
        'latin_text': gather('latin_text'),
        'devanagari_text': gather('devanagari_text'),
    }

def get_dataloaders(train_path, val_path, test_path, batch_size=32, max_len=50):
    """Build train/val/test loaders that all share the training vocabularies.

    Args:
        train_path, val_path, test_path: Lexicon files for each split.
        batch_size: Batch size used by all three loaders.
        max_len: Forwarded to every ``TransliterationDataset``.

    Returns:
        Dict with the three loaders plus the training vocabularies and their
        index -> character reverse mappings.
    """
    # Vocabularies are derived from the training split only.
    train_set = TransliterationDataset(train_path, max_len=max_len)
    shared_vocabs = {
        'latin_vocab': train_set.latin_vocab,
        'devanagari_vocab': train_set.devanagari_vocab,
    }
    val_set = TransliterationDataset(val_path, max_len=max_len, **shared_vocabs)
    test_set = TransliterationDataset(test_path, max_len=max_len, **shared_vocabs)

    def make_loader(dataset, shuffle):
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

    loaders = {
        'train_loader': make_loader(train_set, True),   # only training data is shuffled
        'val_loader': make_loader(val_set, False),
        'test_loader': make_loader(test_set, False),
    }
    loaders.update(shared_vocabs)
    loaders['latin_idx2char'] = train_set.latin_idx2char
    loaders['devanagari_idx2char'] = train_set.devanagari_idx2char
    return loaders

# Encoder

In [6]:
class Encoder(nn.Module):
    """Embeds source indices and runs them through a (multi-layer) RNN/LSTM/GRU."""

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, cell_type='GRU', dropout=0.0):
        """
        Args:
            input_dim: Size of the source vocabulary.
            emb_dim: Embedding width.
            hidden_dim: Recurrent hidden size.
            n_layers: Number of stacked recurrent layers.
            cell_type: One of 'RNN', 'LSTM', 'GRU'.
            dropout: Dropout on embeddings, and between layers when n_layers > 1.

        Raises:
            ValueError: For an unsupported ``cell_type``.
        """
        super().__init__()

        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        self.cell_type = cell_type

        self.embedding = nn.Embedding(input_dim, emb_dim)

        recurrent_classes = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        if cell_type not in recurrent_classes:
            raise ValueError(f"Unsupported RNN cell type: {cell_type}")
        # Inter-layer dropout is only passed when there is more than one layer.
        layer_dropout = dropout if n_layers > 1 else 0
        self.rnn = recurrent_classes[cell_type](
            emb_dim, hidden_dim, n_layers, dropout=layer_dropout, batch_first=True
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        """Encode a padded batch.

        Args:
            src: Index tensor [batch_size, seq_len].
            src_len: Lengths of each sequence in ``src``.

        Returns:
            (outputs, state): padded per-step outputs and the final recurrent
            state; for LSTM ``state`` is the ``(hidden, cell)`` pair.
        """
        embedded = self.dropout(self.embedding(src))  # [batch_size, seq_len, emb_dim]

        # Packing lets the RNN skip padded positions; lengths must live on CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, src_len.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, state = self.rnn(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        return outputs, state

# Decoder

In [7]:
class Decoder(nn.Module):
    """Single-step decoder: embeds one token per sequence and predicts the next."""

    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, cell_type='GRU', dropout=0.0):
        """
        Args:
            output_dim: Size of the target vocabulary.
            emb_dim: Embedding width.
            hidden_dim: Recurrent hidden size.
            n_layers: Number of stacked recurrent layers.
            cell_type: One of 'RNN', 'LSTM', 'GRU'.
            dropout: Dropout on embeddings, and between layers when n_layers > 1.

        Raises:
            ValueError: For an unsupported ``cell_type``.
        """
        super().__init__()

        self.output_dim, self.emb_dim = output_dim, emb_dim
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        self.cell_type = cell_type

        self.embedding = nn.Embedding(output_dim, emb_dim)

        recurrent_classes = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        if cell_type not in recurrent_classes:
            raise ValueError(f"Unsupported RNN cell type: {cell_type}")
        # Inter-layer dropout is only passed when there is more than one layer.
        layer_dropout = dropout if n_layers > 1 else 0
        self.rnn = recurrent_classes[cell_type](
            emb_dim, hidden_dim, n_layers, dropout=layer_dropout, batch_first=True
        )

        # Projects the recurrent output to one score per target character.
        self.fc_out = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: Token indices [batch_size].
            hidden: Recurrent state from the previous step (a ``(hidden, cell)``
                pair for LSTM).

        Returns:
            (prediction, state): scores [batch_size, output_dim] and the updated
            recurrent state in the same form as ``hidden``.
        """
        step_tokens = input.unsqueeze(1)  # add a length-1 sequence dimension
        embedded = self.dropout(self.embedding(step_tokens))  # [batch_size, 1, emb_dim]

        # nn.LSTM returns its state as a (hidden, cell) tuple, so a single
        # call covers every cell type.
        output, state = self.rnn(embedded, hidden)

        prediction = self.fc_out(output.squeeze(1))  # [batch_size, output_dim]
        return prediction, state

# Encoder-Decoder Adapter

In [8]:
class EncoderDecoderAdapter(nn.Module):
    """
    Bridges an encoder and a decoder that have different layer counts.

    Every decoder layer gets its own linear map from the concatenation of all
    encoder layers' final states to one hidden_dim-sized state.
    """
    def __init__(self, encoder_layers, decoder_layers, hidden_dim, cell_type):
        super().__init__()
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers
        self.hidden_dim = hidden_dim
        self.cell_type = cell_type

        def build_adapters():
            # One projection per decoder layer.
            return nn.ModuleList([
                nn.Linear(encoder_layers * hidden_dim, hidden_dim)
                for _ in range(decoder_layers)
            ])

        self.hidden_adapters = build_adapters()

        # The LSTM cell state gets its own, separate set of projections.
        if cell_type == 'LSTM':
            self.cell_adapters = build_adapters()

    @staticmethod
    def _project(state, adapters):
        """Map [encoder_layers, batch, hidden] to [decoder_layers, batch, hidden]."""
        batch_size = state.size(1)
        # Flatten to [batch_size, encoder_layers * hidden_dim]
        flat = state.permute(1, 0, 2).contiguous().view(batch_size, -1)
        return torch.stack([adapter(flat) for adapter in adapters], dim=0)

    def forward(self, encoder_hidden):
        """Convert the encoder's final state into the decoder's initial state.

        Args:
            encoder_hidden: Final encoder state; a ``(hidden, cell)`` pair when
                ``cell_type`` is 'LSTM', otherwise a single tensor.

        Returns:
            The state in the same form, with the layer dimension equal to
            ``decoder_layers``.
        """
        if self.cell_type == 'LSTM':
            hidden, cell = encoder_hidden
            return (
                self._project(hidden, self.hidden_adapters),
                self._project(cell, self.cell_adapters),
            )

        # RNN or GRU: a single hidden tensor
        return self._project(encoder_hidden, self.hidden_adapters)

# Seq2Seq Model

In [9]:
import random
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device, teacher_forcing_ratio=0.5):
        """
        Args:
            encoder: Module exposing ``hidden_dim``, ``cell_type``, ``n_layers``.
            decoder: Module exposing the same attributes plus ``output_dim``.
            device: Device the output tensor is moved to.
            teacher_forcing_ratio: Default probability of feeding the gold
                token (instead of the prediction) at each decoding step.
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.teacher_forcing_ratio = teacher_forcing_ratio

        # The encoder state is handed straight to the decoder, so both sides
        # must agree on width and cell type.
        assert encoder.hidden_dim == decoder.hidden_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.cell_type == decoder.cell_type, \
            "Encoder and decoder must use the same cell type!"

        # A learned adapter is only needed when the layer counts differ.
        adapter = None
        if encoder.n_layers != decoder.n_layers:
            print(f"Creating adapter from {encoder.n_layers} encoder layers to {decoder.n_layers} decoder layers")
            adapter = EncoderDecoderAdapter(
                encoder_layers=encoder.n_layers,
                decoder_layers=decoder.n_layers,
                hidden_dim=encoder.hidden_dim,
                cell_type=encoder.cell_type
            )
        self.adapter = adapter

    def forward(self, src, src_len, trg, teacher_forcing_ratio=None):
        """Decode ``trg.shape[1]`` steps and return the per-step scores.

        Args:
            src: Source indices [batch_size, src_len].
            src_len: Source lengths.
            trg: Target indices [batch_size, trg_len]; column 0 is the first
                decoder input.
            teacher_forcing_ratio: Overrides the instance default when given.

        Returns:
            Tensor [batch_size, trg_len, output_dim]; position 0 stays all-zero.
        """
        n_examples, n_steps = src.shape[0], trg.shape[1]
        outputs = torch.zeros(n_examples, n_steps, self.decoder.output_dim).to(self.device)

        # For LSTM the state is a (hidden, cell) pair; the adapter and decoder
        # both accept it in that form, so no per-cell-type branching is needed.
        _, decoder_state = self.encoder(src, src_len)
        if self.adapter is not None:
            decoder_state = self.adapter(decoder_state)

        ratio = self.teacher_forcing_ratio if teacher_forcing_ratio is None else teacher_forcing_ratio

        step_input = trg[:, 0]  # <SOS> column, shape [batch_size]
        for t in range(1, n_steps):
            scores, decoder_state = self.decoder(step_input, decoder_state)
            outputs[:, t, :] = scores

            # One coin flip per step decides gold token vs. model prediction.
            use_gold = random.random() < ratio
            step_input = trg[:, t] if use_gold else scores.argmax(1)

        return outputs

# Training & Evaluation (1 epoch)

In [10]:
import torch.optim as optim
import torch.nn.functional as F
import time
import math

def train(model, dataloader, optimizer, criterion, clip, device):
    """Run one training epoch.

    Args:
        model: Callable as ``model(src, src_len, trg)`` returning scores
            [batch_size, trg_len, output_dim].
        dataloader: Iterable of batch dicts with 'latin', 'devanagari' and
            'latin_len' tensors; must support ``len()``.
        optimizer: Optimizer over ``model.parameters()``.
        criterion: Loss taking (scores [N, output_dim], targets [N]).
        clip: Max gradient norm passed to ``clip_grad_norm_``.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0

    for batch in dataloader:
        src, trg, src_len = (
            batch[key].to(device) for key in ('latin', 'devanagari', 'latin_len')
        )

        optimizer.zero_grad()
        scores = model(src, src_len, trg)

        # Position 0 corresponds to <SOS> and is never predicted, so drop it
        # from both scores and targets before flattening.
        vocab_size = scores.shape[-1]
        flat_scores = scores[:, 1:].reshape(-1, vocab_size)
        flat_targets = trg[:, 1:].reshape(-1)
        loss = criterion(flat_scores, flat_targets)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Compute the mean per-batch loss without updating the model.

    The model is put in eval mode and called with a teacher-forcing ratio of
    0, so every decoder step is fed the model's own previous prediction.

    Args:
        model: Callable as ``model(src, src_len, trg, teacher_forcing_ratio)``.
        dataloader: Iterable of batch dicts with 'latin', 'devanagari' and
            'latin_len' tensors; must support ``len()``.
        criterion: Loss taking (scores [N, output_dim], targets [N]).
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            src, trg, src_len = (
                batch[key].to(device) for key in ('latin', 'devanagari', 'latin_len')
            )

            scores = model(src, src_len, trg, 0)  # 0 => no teacher forcing

            # Skip position 0 (<SOS>) in both scores and targets.
            vocab_size = scores.shape[-1]
            batch_loss = criterion(
                scores[:, 1:].reshape(-1, vocab_size),
                trg[:, 1:].reshape(-1),
            )
            running_loss += batch_loss.item()

    return running_loss / len(dataloader)

# Inference

In [11]:
def translate_with_beam(model, src, src_len, devanagari_idx2char, device, sos_idx=1, eos_idx=2, pad_idx=0, beam_size=5, max_len=50):
    """
    Translate a batch of source sequences using beam search.

    Each example is decoded independently. Beams are ranked by
    length-normalised log-probability; decoding stops when every beam has
    produced <EOS> or after ``max_len - 1`` steps.

    Args:
        model: Object exposing ``encoder``, ``decoder``, ``adapter`` and ``eval()``.
        src: Source indices [batch_size, src_len].
        src_len: Source lengths [batch_size].
        devanagari_idx2char: Mapping from target index to character.
        device: Device on which decoder input tensors are created.
        sos_idx, eos_idx, pad_idx: Special token indices.
        beam_size: Number of beams kept per step (>= 1). Capped at the number
            of scores the decoder returns.
        max_len: Maximum decoded length including <SOS>.

    Returns:
        List of decoded strings, one per example (special tokens removed).

    Raises:
        ValueError: If ``beam_size`` is smaller than 1.
    """
    if beam_size < 1:
        raise ValueError(f"beam_size must be >= 1, got {beam_size}")

    model.eval()
    batch_size = src.shape[0]
    translations = []

    with torch.no_grad():
        for i in range(batch_size):
            # Slice (not index) so the batch dimension of size 1 is kept.
            src_seq = src[i:i+1]
            src_len_seq = src_len[i:i+1]

            # For LSTM the state is a (hidden, cell) pair; adapter and decoder
            # accept it in that form.
            _, hidden_states = model.encoder(src_seq, src_len_seq)
            if model.adapter is not None:
                hidden_states = model.adapter(hidden_states)

            # Each beam: (token sequence, summed log-prob, decoder state, completed flag)
            beams = [([sos_idx], 0.0, hidden_states, False)]

            for _ in range(max_len - 1):  # -1 because <SOS> is already in the sequence
                new_beams = []
                all_beams_completed = True

                for seq, score, states, completed in beams:
                    if completed:
                        # Finished beams are carried over unchanged.
                        new_beams.append((seq, score, states, completed))
                        continue

                    all_beams_completed = False

                    last_token = torch.tensor([seq[-1]], device=device)
                    output, new_states = model.decoder(last_token, states)
                    log_probs = torch.nn.functional.log_softmax(output, dim=1)

                    # topk raises if k exceeds the vocabulary size, so cap it.
                    k = min(beam_size, log_probs.size(1))
                    topk_scores, topk_tokens = log_probs.topk(k)

                    # One host transfer per expansion instead of 2*k .item() calls.
                    candidates = zip(topk_tokens[0].tolist(), topk_scores[0].tolist())
                    for next_token, token_score in candidates:
                        new_beams.append((
                            seq + [next_token],
                            score + token_score,
                            new_states,
                            next_token == eos_idx,
                        ))

                # Normalise by length so longer hypotheses are not penalised
                # merely for accumulating more (negative) log-probabilities.
                beams = sorted(new_beams, key=lambda beam: beam[1] / len(beam[0]), reverse=True)[:beam_size]

                if all_beams_completed:
                    break

            best_seq = beams[0][0]

            # Convert to characters, stopping at <EOS> and skipping <SOS>/<PAD>.
            chars = []
            for token in best_seq:
                if token == eos_idx:
                    break
                if token != sos_idx and token != pad_idx:
                    chars.append(devanagari_idx2char[token])

            translations.append(''.join(chars))

    return translations

In [12]:
def translate(model, src, src_len, devanagari_idx2char, device, sos_idx=1, eos_idx=2, pad_idx=0, max_len=50):
    """
    Translate a batch of source sequences using greedy decoding.

    Args:
        model: Object exposing ``encoder``, ``decoder``, ``adapter`` and ``eval()``.
        src: Source indices [batch_size, src_len].
        src_len: Source lengths [batch_size].
        devanagari_idx2char: Mapping from target index to character.
        device: Device on which decoding tensors are created.
        sos_idx, eos_idx, pad_idx: Special token indices.
        max_len: Maximum decoded length including <SOS>.

    Returns:
        List of decoded strings, one per example (special tokens removed).
    """
    model.eval()
    n_examples = src.shape[0]

    with torch.no_grad():
        # For LSTM the state is a (hidden, cell) pair; adapter and decoder
        # accept it in that form.
        _, decoder_state = model.encoder(src, src_len)
        if model.adapter is not None:
            decoder_state = model.adapter(decoder_state)

        # Every sequence starts from <SOS>; unwritten slots stay <PAD>.
        step_input = torch.tensor([sos_idx] * n_examples, device=device)
        decoded = torch.full((n_examples, max_len), pad_idx, dtype=torch.long, device=device)
        decoded[:, 0] = step_input

        finished = torch.zeros(n_examples, dtype=torch.bool, device=device)

        for t in range(1, max_len):
            scores, decoder_state = model.decoder(step_input, decoder_state)

            # The greedy choice is both recorded and fed back as the next input.
            step_input = scores.argmax(1)
            decoded[:, t] = step_input

            finished = finished | (step_input == eos_idx)
            if finished.all():
                break

    # Turn each row into text: stop at the first <EOS>, skip <SOS>/<PAD>.
    translations = []
    for row in decoded.tolist():
        chars = []
        for token in row:
            if token == eos_idx:
                break
            if token not in (sos_idx, pad_idx):
                chars.append(devanagari_idx2char[token])
        translations.append(''.join(chars))

    return translations

# Calculate Accuracy

In [13]:
def calculate_accuracy(model, dataloader, devanagari_idx2char, device, beam_size=1, sos_idx=1, eos_idx=2, pad_idx=0):
    """Exact-match word accuracy over a dataloader.

    Args:
        model: Seq2Seq-style model passed through to the decoding function.
        dataloader: Iterable of batch dicts with 'latin', 'latin_len' tensors
            and a 'devanagari_text' list of reference strings.
        devanagari_idx2char: Mapping from target index to character.
        device: Device the batch tensors are moved to.
        beam_size: ``<= 1`` uses greedy decoding (``translate``); larger
            values use ``translate_with_beam`` with that beam width.
        sos_idx, eos_idx, pad_idx: Special token indices.

    Returns:
        (accuracy, predictions, targets). ``accuracy`` is 0.0 when the
        dataloader yields no examples.
    """
    model.eval()

    correct = 0
    predictions = []
    targets = []

    with torch.no_grad():
        for batch in dataloader:
            src = batch['latin'].to(device)
            src_len = batch['latin_len'].to(device)
            trg_texts = batch['devanagari_text']

            if beam_size <= 1:
                translations = translate(
                    model, src, src_len, devanagari_idx2char, device,
                    sos_idx=sos_idx, eos_idx=eos_idx, pad_idx=pad_idx
                )
            else:
                translations = translate_with_beam(
                    model, src, src_len, devanagari_idx2char, device,
                    sos_idx=sos_idx, eos_idx=eos_idx, pad_idx=pad_idx,
                    beam_size=beam_size
                )

            for pred, gold in zip(translations, trg_texts):
                predictions.append(pred)
                targets.append(gold)
                if pred == gold:
                    correct += 1

    # An empty dataloader would otherwise raise ZeroDivisionError.
    total = len(predictions)
    accuracy = correct / total if total else 0.0
    return accuracy, predictions, targets

# Train with wandb

In [30]:
import wandb

def train_model(config=None, run_name=None):
    """Train a Seq2Seq transliteration model and evaluate it on the test split.

    Side effects: saves the best checkpoint (lowest validation loss) to
    'best-model.pt', writes per-word test predictions to './test_vanilla.csv',
    prints progress, and logs per-epoch metrics to wandb when a run is active.

    Args:
        config: Hyperparameter mapping. ``None`` selects the built-in
            defaults. When a wandb run is active, ``wandb.config`` is used
            (as in sweeps); otherwise the mapping passed in is used as-is.
        run_name: Optional name assigned to the active wandb run.

    Returns:
        The trained model with the best checkpoint's weights loaded.
    """
    if config is None:
        # Defaults (best model found so far)
        config = {
            'emb_dim': 256,
            'hidden_dim': 512,
            'encoder_layers': 2,
            'decoder_layers': 2,
            'cell_type': 'LSTM',
            'dropout': 0.2,
            'lr': 0.001,
            'batch_size': 128,
            'epochs': 10,
            'clip': 1.0,
            'teacher_forcing_ratio': 0.3,
            'beam_size': 3  # beam width for the final test evaluation (1 = greedy)
        }
    elif wandb.run is not None:
        # Inside a wandb run (e.g. a sweep) the run's config is authoritative.
        config = wandb.config
        if run_name is not None:
            wandb.run.name = run_name
    # Otherwise: no active wandb run, so use the mapping the caller passed.

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load data
    lexicon_dir = '/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons'
    data = get_dataloaders(
        f'{lexicon_dir}/hi.translit.sampled.train.tsv',
        f'{lexicon_dir}/hi.translit.sampled.dev.tsv',
        f'{lexicon_dir}/hi.translit.sampled.test.tsv',
        batch_size=config['batch_size']
    )

    train_loader = data['train_loader']
    val_loader = data['val_loader']
    test_loader = data['test_loader']

    latin_vocab = data['latin_vocab']
    devanagari_vocab = data['devanagari_vocab']
    devanagari_idx2char = data['devanagari_idx2char']

    # Take the special-token indices from the dataset so they stay consistent
    # with the vocabularies; fall back to the conventional values if absent.
    train_dataset = train_loader.dataset
    sos_idx = getattr(train_dataset, 'SOS_IDX', 1)
    eos_idx = getattr(train_dataset, 'EOS_IDX', 2)
    pad_idx = getattr(train_dataset, 'PAD_IDX', 0)

    # Create model
    encoder = Encoder(
        input_dim=len(latin_vocab),
        emb_dim=config['emb_dim'],
        hidden_dim=config['hidden_dim'],
        n_layers=config['encoder_layers'],
        cell_type=config['cell_type'],
        dropout=config['dropout']
    )

    decoder = Decoder(
        output_dim=len(devanagari_vocab),
        emb_dim=config['emb_dim'],
        hidden_dim=config['hidden_dim'],
        n_layers=config['decoder_layers'],
        cell_type=config['cell_type'],
        dropout=config['dropout']
    )

    model = Seq2Seq(
        encoder=encoder,
        decoder=decoder,
        device=device,
        teacher_forcing_ratio=config['teacher_forcing_ratio']
    )

    model = model.to(device)

    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'The model has {num_params:,} trainable parameters')

    optimizer = optim.Adam(model.parameters(), lr=config['lr'])
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)  # padding does not contribute to the loss

    # Training loop
    best_val_loss = float('inf')

    for epoch in range(config['epochs']):
        start_time = time.time()

        train_loss = train(model, train_loader, optimizer, criterion, config['clip'], device)
        val_loss = evaluate(model, val_loader, criterion, device)

        end_time = time.time()
        epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

        # Checkpoint whenever validation loss improves.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best-model.pt')

        # divmod on floats yields a float quotient; cast so it prints "0m", not "0.0m".
        print(f'Epoch: {epoch+1:02} | Time: {int(epoch_mins)}m {epoch_secs:.2f}s')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\t Val. Loss: {val_loss:.3f}')

        val_accuracy, _, _ = calculate_accuracy(
            model, val_loader, devanagari_idx2char, device,
            beam_size=1,  # greedy decoding during training for speed
            sos_idx=sos_idx, eos_idx=eos_idx, pad_idx=pad_idx
        )
        print(f'\t Val. Accuracy: {val_accuracy:.3f}')

        if wandb.run is not None:
            wandb.log({
                "train_loss": train_loss,
                "val_loss": val_loss,
                "val_accuracy": val_accuracy,
                "epoch": epoch
            })

    # Restore the best checkpoint; map_location keeps the load valid on
    # whichever device this process is using.
    model.load_state_dict(torch.load('best-model.pt', map_location=device))

    # Final evaluation uses the configured beam width.
    test_accuracy, predictions, targets = calculate_accuracy(
        model, test_loader, devanagari_idx2char, device,
        beam_size=config['beam_size'],
        sos_idx=sos_idx, eos_idx=eos_idx, pad_idx=pad_idx
    )
    print(f'Test Accuracy (beam size={config["beam_size"]}): {test_accuracy:.3f}')

    # if wandb.run is not None:
    #     wandb.log({"test_accuracy": test_accuracy})

    # Collect the source words in loader order (the test loader is built
    # unshuffled, so the order matches `predictions`).
    latin_texts = []
    for batch in test_loader:
        latin_texts.extend(batch['latin_text'])

    # `targets` holds the reference words in the same order as `predictions`.
    results = pd.DataFrame({
        'latin': latin_texts[:len(predictions)],
        'true': targets,
        'predicted': predictions,
        'accuracy': [1 if pred == target else 0 for pred, target in zip(predictions, targets)]
    })

    results.to_csv('./test_vanilla.csv', index=False)

    return model  #, test_accuracy

# Sweep Configuration

In [16]:
# wandb sweep definition: Bayesian search maximising validation accuracy.
# Every hyperparameter is a discrete choice from its list of values.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
    'parameters': {
        name: {'values': choices}
        for name, choices in (
            ('emb_dim', [16, 32, 64, 128, 256]),
            ('hidden_dim', [32, 64, 128, 256, 512]),
            ('encoder_layers', [1, 2, 3]),
            ('decoder_layers', [1, 2, 3]),
            ('cell_type', ['RNN', 'GRU', 'LSTM']),
            ('dropout', [0.1, 0.2, 0.3]),
            ('lr', [0.0001, 0.001, 0.01]),
            ('batch_size', [32, 64, 128]),
            ('clip', [0.1, 1.0, 5.0]),
            ('teacher_forcing_ratio', [0.3, 0.5, 0.7]),
            ('beam_size', [1, 3, 5]),  # 1 is equivalent to greedy search
            ('epochs', [2]),
        )
    },
}

def run_sweep():
    """Run one sweep trial; intended as the ``function`` passed to ``wandb.agent``.

    Starts a W&B run, reads the hyperparameters from ``wandb.config``,
    encodes them into a human-readable run name and forwards both the
    config and the name to ``train_model``.
    """
    wandb.init()
    cfg = wandb.config

    # The name spells out every swept hyperparameter so that runs can be
    # told apart at a glance.
    trial_name = (
        f"{cfg.cell_type}_"
        f"enc_{cfg.encoder_layers}_dec_{cfg.decoder_layers}_"
        f"hdim_{cfg.hidden_dim}_"
        f"emb_{cfg.emb_dim}_"
        f"bs_{cfg.batch_size}_"
        f"drop_{cfg.dropout}_"
        f"beam_{cfg.beam_size}_"
        f"lr_{cfg.lr}_"
        f"clip_{cfg.clip}_"
        f"tf_{cfg.teacher_forcing_ratio}_"
        f"epoch_{cfg.epochs}"
    )

    train_model(config=cfg, run_name=trial_name)


# Local Execution

In [31]:
# Single training run outside of a sweep: no W&B config or run name is given
# (presumably train_model then falls back to its built-in defaults - confirm).
# The returned model is kept in `model`.
model = train_model(config=None, run_name=None)

  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 7,414,594 trainable parameters
Epoch: 01 | Time: 0.0m 17.83s
	Train Loss: 1.703
	 Val. Loss: 1.132
	 Val. Accuracy: 0.207
Epoch: 02 | Time: 0.0m 17.92s
	Train Loss: 0.831
	 Val. Loss: 0.964
	 Val. Accuracy: 0.304
Epoch: 03 | Time: 0.0m 17.88s
	Train Loss: 0.632
	 Val. Loss: 0.913
	 Val. Accuracy: 0.345
Epoch: 04 | Time: 0.0m 18.00s
	Train Loss: 0.498
	 Val. Loss: 0.935
	 Val. Accuracy: 0.351
Epoch: 05 | Time: 0.0m 18.01s
	Train Loss: 0.411
	 Val. Loss: 0.948
	 Val. Accuracy: 0.353
Epoch: 06 | Time: 0.0m 17.98s
	Train Loss: 0.342
	 Val. Loss: 0.935
	 Val. Accuracy: 0.374
Epoch: 07 | Time: 0.0m

# Wandb Execution

In [None]:

# Authenticate with W&B, register the sweep defined by `sweep_config`, then
# let a local agent execute 22 trials, each one through `run_sweep`.
wandb.login()
sweep_id = wandb.sweep(
    sweep_config,
    project="DA6401-A3",
    entity="cs24m033-iit-madras",
)
wandb.agent(sweep_id, function=run_sweep, count=22)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: r9ulhvpq
Sweep URL: https://wandb.ai/cs24m033-iit-madras/DA6401-A3/sweeps/r9ulhvpq


[34m[1mwandb[0m: Agent Starting Run: 5eem3lkq with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 2 encoder layers to 3 decoder layers
The model has 12,664,642 trainable parameters
Epoch: 01 | Time: 0.0m 21.17s
	Train Loss: 2.009
	 Val. Loss: 1.261
	 Val. Accuracy: 0.137
Epoch: 02 | Time: 0.0m 21.03s
	Train Loss: 0.977
	 Val. Loss: 1.013
	 Val. Accuracy: 0.265
Epoch: 03 | Time: 0.0m 21.12s
	Train Loss: 0.733
	 Val. Loss: 0.937
	 Val. Accuracy: 0.327
Epoch: 04 | Time: 0.0m 20.92s
	Train Loss: 0.585
	 Val. Loss: 0.944
	 Val. Accuracy: 0.333
Epoch: 05 | Time: 0.0m 20.82s
	Train Loss: 0.478
	 Val. Loss: 0.946
	 Val. Accuracy: 0.354
Epoch: 06 | Time: 0.0m 21.12s
	Train Loss: 0.392
	 Va

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▄▃▃▂▂▂▁▁▁▁▁▁▁▁
val_accuracy,▁▅▇▇█▇████▇▇██▇
val_loss,█▃▁▁▁▁▂▄▄▅▆▆▇██

0,1
epoch,14.0
train_loss,0.14834
val_accuracy,0.34901
val_loss,1.25853


[34m[1mwandb[0m: Agent Starting Run: j012sth9 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	clip: 1
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 3 encoder layers to 2 decoder layers
The model has 8,725,314 trainable parameters
Epoch: 01 | Time: 0.0m 28.22s
	Train Loss: 2.310
	 Val. Loss: 1.660
	 Val. Accuracy: 0.044
Epoch: 02 | Time: 0.0m 27.83s
	Train Loss: 1.430
	 Val. Loss: 1.325
	 Val. Accuracy: 0.128
Epoch: 03 | Time: 0.0m 28.23s
	Train Loss: 1.155
	 Val. Loss: 1.184
	 Val. Accuracy: 0.186
Epoch: 04 | Time: 0.0m 28.11s
	Train Loss: 1.002
	 Val. Loss: 1.113
	 Val. Accuracy: 0.238
Epoch: 05 | Time: 0.0m 27.89s
	Train Loss: 0.897
	 Val. Loss: 1.030
	 Val. Accuracy: 0.252
Epoch: 06 | Time: 0.0m 27.85s
	Train Loss: 0.814
	 Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
val_accuracy,▁▃▄▅▆▆▇▇▇▇█████
val_loss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.4123
val_accuracy,0.35521
val_loss,0.99511


[34m[1mwandb[0m: Agent Starting Run: x2tc3279 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 3 encoder layers to 2 decoder layers
The model has 12,663,618 trainable parameters
Epoch: 01 | Time: 0.0m 20.54s
	Train Loss: 2.830
	 Val. Loss: 2.214
	 Val. Accuracy: 0.003
Epoch: 02 | Time: 0.0m 20.59s
	Train Loss: 1.881
	 Val. Loss: 1.665
	 Val. Accuracy: 0.050
Epoch: 03 | Time: 0.0m 20.47s
	Train Loss: 1.479
	 Val. Loss: 1.418
	 Val. Accuracy: 0.098
Epoch: 04 | Time: 0.0m 20.56s
	Train Loss: 1.257
	 Val. Loss: 1.273
	 Val. Accuracy: 0.150
Epoch: 05 | Time: 0.0m 20.40s
	Train Loss: 1.109
	 Val. Loss: 1.194
	 Val. Accuracy: 0.179
Epoch: 06 | Time: 0.0m 20.32s
	Train Loss: 0.998
	 Va

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
val_accuracy,▁▂▃▄▅▆▆▇▇▇█████
val_loss,█▅▄▃▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.52935
val_accuracy,0.33341
val_loss,0.95237


[34m[1mwandb[0m: Agent Starting Run: bhytsej2 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 2 encoder layers to 3 decoder layers
The model has 12,664,642 trainable parameters
Epoch: 01 | Time: 0.0m 21.05s
	Train Loss: 2.540
	 Val. Loss: 2.226
	 Val. Accuracy: 0.011
Epoch: 02 | Time: 0.0m 21.00s
	Train Loss: 1.842
	 Val. Loss: 1.833
	 Val. Accuracy: 0.036
Epoch: 03 | Time: 0.0m 21.04s
	Train Loss: 1.607
	 Val. Loss: 1.739
	 Val. Accuracy: 0.066
Epoch: 04 | Time: 0.0m 21.18s
	Train Loss: 1.492
	 Val. Loss: 1.696
	 Val. Accuracy: 0.067
Epoch: 05 | Time: 0.0m 20.90s
	Train Loss: 1.389
	 Val. Loss: 1.636
	 Val. Accuracy: 0.092
Epoch: 06 | Time: 0.0m 21.17s
	Train Loss: 1.341
	 Va

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁
val_accuracy,▁▂▄▄▅▆▆▆▇▇▇▇▇██
val_loss,█▄▃▃▂▂▂▂▁▁▁▁▂▁▁

0,1
epoch,14.0
train_loss,1.03084
val_accuracy,0.14273
val_loss,1.54917


[34m[1mwandb[0m: Agent Starting Run: fvbubt5w with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	clip: 1
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 3 encoder layers to 2 decoder layers
The model has 2,408,770 trainable parameters
Epoch: 01 | Time: 0.0m 25.60s
	Train Loss: 2.144
	 Val. Loss: 2.031
	 Val. Accuracy: 0.024
Epoch: 02 | Time: 0.0m 25.72s
	Train Loss: 2.027
	 Val. Loss: 2.117
	 Val. Accuracy: 0.013
Epoch: 03 | Time: 0.0m 25.35s
	Train Loss: 1.941
	 Val. Loss: 2.157
	 Val. Accuracy: 0.008
Epoch: 04 | Time: 0.0m 25.49s
	Train Loss: 1.899
	 Val. Loss: 2.021
	 Val. Accuracy: 0.021
Epoch: 05 | Time: 0.0m 25.59s
	Train Loss: 1.850
	 Val. Loss: 2.102
	 Val. Accuracy: 0.008
Epoch: 06 | Time: 0.0m 25.62s
	Train Loss: 1.954
	 Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▆▄▄▃▅▄▃▁▁▁▃▂▂▁
val_accuracy,▆▂▁▅▁▂▆▇██▂▆▃▆▃
val_loss,▅▇█▅▇▅▃▁▂▁▄▃▁▂▅

0,1
epoch,14.0
train_loss,1.75421
val_accuracy,0.01308
val_loss,2.01657


[34m[1mwandb[0m: Agent Starting Run: 3r1wuoud with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 1
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 1 encoder layers to 3 decoder layers
The model has 2,541,378 trainable parameters
Epoch: 01 | Time: 0.0m 29.35s
	Train Loss: 1.852
	 Val. Loss: 1.511
	 Val. Accuracy: 0.156
Epoch: 02 | Time: 0.0m 28.88s
	Train Loss: 0.907
	 Val. Loss: 1.305
	 Val. Accuracy: 0.269
Epoch: 03 | Time: 0.0m 28.97s
	Train Loss: 0.695
	 Val. Loss: 1.249
	 Val. Accuracy: 0.293
Epoch: 04 | Time: 0.0m 29.25s
	Train Loss: 0.598
	 Val. Loss: 1.228
	 Val. Accuracy: 0.320
Epoch: 05 | Time: 0.0m 29.07s
	Train Loss: 0.520
	 Val. Loss: 1.227
	 Val. Accuracy: 0.333
Epoch: 06 | Time: 0.0m 29.26s
	Train Loss: 0.464
	 Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁
val_accuracy,▁▅▆▇▇████▇██▇█▇
val_loss,█▃▂▁▁▁▂▂▂▄▄▅▆▆▆

0,1
epoch,14.0
train_loss,0.21693
val_accuracy,0.33479
val_loss,1.43372


[34m[1mwandb[0m: Agent Starting Run: 020hx6zp with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 5
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	clip: 0.1
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	lr: 0.01
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 2,409,794 trainable parameters
Epoch: 01 | Time: 0.0m 47.05s
	Train Loss: 1.933
	 Val. Loss: 1.678
	 Val. Accuracy: 0.035
Epoch: 02 | Time: 0.0m 46.74s
	Train Loss: 1.695
	 Val. Loss: 1.650
	 Val. Accuracy: 0.046
Epoch: 03 | Time: 0.0m 46.99s
	Train Loss: 1.676
	 Val. Loss: 1.659
	 Val. Accuracy: 0.046
Epoch: 04 | Time: 0.0m 47.03s
	Train Loss: 1.672
	 Val. Loss: 1.679
	 Val. Accuracy: 0.045
Epoch: 05 | Time: 0.0m 47.00s
	Train Loss: 1.699
	 Val. Loss: 1.631
	 Val. Accuracy: 0.049
Epoch: 06 | Time: 0.0m 46.99s
	Train Loss: 1.725
	 Val. Loss: 1.696
	 Val. Accuracy: 0.040
Epoch: 07 | Time: 0.0m

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,▇▂▁▁▂▂▃▃▄▅▆▆██▇
val_accuracy,▅█▇▇█▆▇▅▃▄▃▄▁▄▄
val_loss,▂▁▂▂▁▂▂▃▃▄▄▇█▄▄

0,1
epoch,14.0
train_loss,1.92694
val_accuracy,0.02822
val_loss,1.77143


[34m[1mwandb[0m: Agent Starting Run: zfa7thum with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 2 encoder layers to 3 decoder layers
The model has 12,664,642 trainable parameters
Epoch: 01 | Time: 0.0m 21.20s
	Train Loss: 2.833
	 Val. Loss: 2.207
	 Val. Accuracy: 0.003
Epoch: 02 | Time: 0.0m 21.02s
	Train Loss: 1.941
	 Val. Loss: 1.690
	 Val. Accuracy: 0.029
Epoch: 03 | Time: 0.0m 20.87s
	Train Loss: 1.565
	 Val. Loss: 1.450
	 Val. Accuracy: 0.072
Epoch: 04 | Time: 0.0m 21.14s
	Train Loss: 1.328
	 Val. Loss: 1.314
	 Val. Accuracy: 0.118
Epoch: 05 | Time: 0.0m 21.14s
	Train Loss: 1.181
	 Val. Loss: 1.221
	 Val. Accuracy: 0.161
Epoch: 06 | Time: 0.0m 20.97s
	Train Loss: 1.064
	 Va

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁
val_accuracy,▁▂▃▄▄▅▆▆▇▇▇████
val_loss,█▅▄▃▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.58868
val_accuracy,0.3201
val_loss,0.96767


[34m[1mwandb[0m: Agent Starting Run: 1dwams57 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 5,575,490 trainable parameters
Epoch: 01 | Time: 0.0m 16.52s
	Train Loss: 2.728
	 Val. Loss: 2.075
	 Val. Accuracy: 0.004
Epoch: 02 | Time: 0.0m 16.56s
	Train Loss: 1.772
	 Val. Loss: 1.559
	 Val. Accuracy: 0.064
Epoch: 03 | Time: 0.0m 16.74s
	Train Loss: 1.378
	 Val. Loss: 1.314
	 Val. Accuracy: 0.127
Epoch: 04 | Time: 0.0m 16.53s
	Train Loss: 1.163
	 Val. Loss: 1.187
	 Val. Accuracy: 0.189
Epoch: 05 | Time: 0.0m 16.44s
	Train Loss: 1.037
	 Val. Loss: 1.110
	 Val. Accuracy: 0.226
Epoch: 06 | Time: 0.0m 16.57s
	Train Loss: 0.949
	 Val. Loss: 1.060
	 Val. Accuracy: 0.255
Epoch: 07 | Time: 0.0m

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
val_accuracy,▁▂▃▅▅▆▆▇▇▇▇████
val_loss,█▅▃▃▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.56051
val_accuracy,0.36508
val_loss,0.91271


[34m[1mwandb[0m: Agent Starting Run: hf5yw71j with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 3 encoder layers to 1 decoder layers
The model has 8,988,482 trainable parameters
Epoch: 01 | Time: 0.0m 17.88s
	Train Loss: 2.906
	 Val. Loss: 2.407
	 Val. Accuracy: 0.002
Epoch: 02 | Time: 0.0m 17.84s
	Train Loss: 1.936
	 Val. Loss: 1.844
	 Val. Accuracy: 0.050
Epoch: 03 | Time: 0.0m 17.96s
	Train Loss: 1.497
	 Val. Loss: 1.572
	 Val. Accuracy: 0.104
Epoch: 04 | Time: 0.0m 17.76s
	Train Loss: 1.259
	 Val. Loss: 1.419
	 Val. Accuracy: 0.156
Epoch: 05 | Time: 0.0m 17.90s
	Train Loss: 1.112
	 Val. Loss: 1.312
	 Val. Accuracy: 0.194
Epoch: 06 | Time: 0.0m 17.82s
	Train Loss: 0.995
	 Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
val_accuracy,▁▂▃▄▅▆▆▇▇▇▇████
val_loss,█▅▄▃▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.55493
val_accuracy,0.32515
val_loss,1.06176


[34m[1mwandb[0m: Agent Starting Run: yx73vzjq with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 2,423,618 trainable parameters
Epoch: 01 | Time: 0.0m 21.20s
	Train Loss: 2.577
	 Val. Loss: 2.110
	 Val. Accuracy: 0.013
Epoch: 02 | Time: 0.0m 21.41s
	Train Loss: 1.607
	 Val. Loss: 1.589
	 Val. Accuracy: 0.088
Epoch: 03 | Time: 0.0m 21.38s
	Train Loss: 1.227
	 Val. Loss: 1.374
	 Val. Accuracy: 0.148
Epoch: 04 | Time: 0.0m 21.05s
	Train Loss: 1.032
	 Val. Loss: 1.265
	 Val. Accuracy: 0.212
Epoch: 05 | Time: 0.0m 21.26s
	Train Loss: 0.903
	 Val. Loss: 1.189
	 Val. Accuracy: 0.248
Epoch: 06 | Time: 0.0m 21.43s
	Train Loss: 0.819
	 Val. Loss: 1.126
	 Val. Accuracy: 0.273
Epoch: 07 | Time: 0.0m

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▂▂▂▂▂▁▁▁▁▁▁
val_accuracy,▁▂▄▅▆▆▆▇▇▇▇████
val_loss,█▅▃▃▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.47107
val_accuracy,0.37219
val_loss,1.00331


[34m[1mwandb[0m: Agent Starting Run: qokesln7 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 3 encoder layers to 1 decoder layers
The model has 8,988,482 trainable parameters
Epoch: 01 | Time: 0.0m 17.84s
	Train Loss: 2.920
	 Val. Loss: 2.442
	 Val. Accuracy: 0.002
Epoch: 02 | Time: 0.0m 18.04s
	Train Loss: 1.944
	 Val. Loss: 1.817
	 Val. Accuracy: 0.038
Epoch: 03 | Time: 0.0m 17.82s
	Train Loss: 1.497
	 Val. Loss: 1.565
	 Val. Accuracy: 0.098
Epoch: 04 | Time: 0.0m 17.83s
	Train Loss: 1.264
	 Val. Loss: 1.415
	 Val. Accuracy: 0.148
Epoch: 05 | Time: 0.0m 18.12s
	Train Loss: 1.113
	 Val. Loss: 1.334
	 Val. Accuracy: 0.181
Epoch: 06 | Time: 0.0m 17.86s
	Train Loss: 1.018
	 Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
val_accuracy,▁▂▃▄▅▅▆▇▇▇▇████
val_loss,█▅▄▃▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.55953
val_accuracy,0.32584
val_loss,1.05455


[34m[1mwandb[0m: Agent Starting Run: wlh65bd8 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 3,212,098 trainable parameters
Epoch: 01 | Time: 0.0m 14.75s
	Train Loss: 1.740
	 Val. Loss: 1.220
	 Val. Accuracy: 0.174
Epoch: 02 | Time: 0.0m 14.52s
	Train Loss: 0.935
	 Val. Loss: 1.037
	 Val. Accuracy: 0.278
Epoch: 03 | Time: 0.0m 14.66s
	Train Loss: 0.740
	 Val. Loss: 0.975
	 Val. Accuracy: 0.298
Epoch: 04 | Time: 0.0m 14.56s
	Train Loss: 0.627
	 Val. Loss: 0.985
	 Val. Accuracy: 0.317
Epoch: 05 | Time: 0.0m 14.72s
	Train Loss: 0.538
	 Val. Loss: 0.963
	 Val. Accuracy: 0.349
Epoch: 06 | Time: 0.0m 14.56s
	Train Loss: 0.477
	 Val. Loss: 0.943
	 Val. Accuracy: 0.351
Epoch: 07 | Time: 0.0m

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▄▃▃▃▂▂▂▂▁▁▁▁▁▁
val_accuracy,▁▅▆▆██████▇████
val_loss,█▃▂▂▂▁▂▂▃▃▃▅▅▆▇

0,1
epoch,14.0
train_loss,0.19439
val_accuracy,0.34924
val_loss,1.17654


[34m[1mwandb[0m: Agent Starting Run: 80e9cf1i with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 1
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 1 encoder layers to 3 decoder layers
The model has 2,541,378 trainable parameters
Epoch: 01 | Time: 0.0m 17.91s
	Train Loss: 3.118
	 Val. Loss: 2.668
	 Val. Accuracy: 0.000
Epoch: 02 | Time: 0.0m 17.71s
	Train Loss: 2.398
	 Val. Loss: 2.210
	 Val. Accuracy: 0.005
Epoch: 03 | Time: 0.0m 18.01s
	Train Loss: 1.990
	 Val. Loss: 2.008
	 Val. Accuracy: 0.021
Epoch: 04 | Time: 0.0m 17.77s
	Train Loss: 1.720
	 Val. Loss: 1.849
	 Val. Accuracy: 0.046
Epoch: 05 | Time: 0.0m 17.88s
	Train Loss: 1.525
	 Val. Loss: 1.736
	 Val. Accuracy: 0.070
Epoch: 06 | Time: 0.0m 17.91s
	Train Loss: 1.372
	 Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁
val_accuracy,▁▁▂▂▃▄▄▅▆▆▇▇███
val_loss,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.83997
val_accuracy,0.24966
val_loss,1.33225


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 390xenm0 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 7,414,594 trainable parameters
Epoch: 01 | Time: 0.0m 45.78s
	Train Loss: 2.118
	 Val. Loss: 1.519
	 Val. Accuracy: 0.068
Epoch: 02 | Time: 0.0m 45.96s
	Train Loss: 1.318
	 Val. Loss: 1.228
	 Val. Accuracy: 0.160
Epoch: 03 | Time: 0.0m 46.02s
	Train Loss: 1.063
	 Val. Loss: 1.121
	 Val. Accuracy: 0.220
Epoch: 04 | Time: 0.0m 45.98s
	Train Loss: 0.918
	 Val. Loss: 1.031
	 Val. Accuracy: 0.261
Epoch: 05 | Time: 0.0m 45.76s
	Train Loss: 0.812
	 Val. Loss: 0.973
	 Val. Accuracy: 0.299
Epoch: 06 | Time: 0.0m 45.84s
	Train Loss: 0.733
	 Val. Loss: 0.948
	 Val. Accuracy: 0.306
Epoch: 07 | Time: 0.0m

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▃▂▂▂▂▂▁▁▁▁
val_accuracy,▁▃▅▆▆▇▇▇███████
val_loss,█▅▃▂▂▁▁▁▁▁▁▁▁▂▂

0,1
epoch,14.0
train_loss,0.32691
val_accuracy,0.3637
val_loss,0.98082


[34m[1mwandb[0m: Agent Starting Run: nor46v7s with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 7,414,594 trainable parameters
Epoch: 01 | Time: 0.0m 27.86s
	Train Loss: 1.505
	 Val. Loss: 1.066
	 Val. Accuracy: 0.259
Epoch: 02 | Time: 0.0m 27.78s
	Train Loss: 0.780
	 Val. Loss: 0.931
	 Val. Accuracy: 0.322
Epoch: 03 | Time: 0.0m 27.70s
	Train Loss: 0.605
	 Val. Loss: 0.915
	 Val. Accuracy: 0.353
Epoch: 04 | Time: 0.0m 27.61s
	Train Loss: 0.498
	 Val. Loss: 0.915
	 Val. Accuracy: 0.362
Epoch: 05 | Time: 0.0m 27.85s
	Train Loss: 0.409
	 Val. Loss: 0.950
	 Val. Accuracy: 0.370
Epoch: 06 | Time: 0.0m 27.72s
	Train Loss: 0.346
	 Val. Loss: 0.982
	 Val. Accuracy: 0.364
Epoch: 07 | Time: 0.0m

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▄▃▃▂▂▂▂▁▁▁▁▁▁▁
val_accuracy,▁▅▇▇█▇█▇█▇▇▇▇▇▇
val_loss,▄▁▁▁▂▂▂▄▄▅▆▇▆▇█

0,1
epoch,14.0
train_loss,0.16007
val_accuracy,0.35291
val_loss,1.23868


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: qy3p2yfx with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 3,212,098 trainable parameters
Epoch: 01 | Time: 0.0m 14.57s
	Train Loss: 2.802
	 Val. Loss: 2.242
	 Val. Accuracy: 0.008
Epoch: 02 | Time: 0.0m 14.78s
	Train Loss: 1.878
	 Val. Loss: 1.836
	 Val. Accuracy: 0.046
Epoch: 03 | Time: 0.0m 14.56s
	Train Loss: 1.521
	 Val. Loss: 1.614
	 Val. Accuracy: 0.086
Epoch: 04 | Time: 0.0m 14.73s
	Train Loss: 1.322
	 Val. Loss: 1.482
	 Val. Accuracy: 0.134
Epoch: 05 | Time: 0.0m 14.65s
	Train Loss: 1.182
	 Val. Loss: 1.374
	 Val. Accuracy: 0.164
Epoch: 06 | Time: 0.0m 14.93s
	Train Loss: 1.075
	 Val. Loss: 1.321
	 Val. Accuracy: 0.182
Epoch: 07 | Time: 0.0m

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
val_accuracy,▁▂▃▄▅▅▆▆▆▇▇▇███
val_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.65805
val_accuracy,0.31482
val_loss,1.09038


[34m[1mwandb[0m: Agent Starting Run: 9nm0cwzm with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 1 encoder layers to 2 decoder layers
The model has 6,363,970 trainable parameters
Epoch: 01 | Time: 0.0m 26.86s
	Train Loss: 2.533
	 Val. Loss: 1.935
	 Val. Accuracy: 0.019
Epoch: 02 | Time: 0.0m 26.63s
	Train Loss: 1.681
	 Val. Loss: 1.539
	 Val. Accuracy: 0.072
Epoch: 03 | Time: 0.0m 26.59s
	Train Loss: 1.375
	 Val. Loss: 1.333
	 Val. Accuracy: 0.126
Epoch: 04 | Time: 0.0m 26.37s
	Train Loss: 1.182
	 Val. Loss: 1.239
	 Val. Accuracy: 0.162
Epoch: 05 | Time: 0.0m 26.57s
	Train Loss: 1.065
	 Val. Loss: 1.164
	 Val. Accuracy: 0.203
Epoch: 06 | Time: 0.0m 26.52s
	Train Loss: 0.976
	 Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▃▂▂▂▂▁▁▁▁▁
val_accuracy,▁▂▃▄▅▆▆▆▇▇▇▇███
val_loss,█▅▄▃▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.53858
val_accuracy,0.33272
val_loss,0.95884


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: uldz0mh0 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 7,414,594 trainable parameters
Epoch: 01 | Time: 0.0m 28.02s
	Train Loss: 2.389
	 Val. Loss: 1.765
	 Val. Accuracy: 0.034
Epoch: 02 | Time: 0.0m 27.84s
	Train Loss: 1.490
	 Val. Loss: 1.363
	 Val. Accuracy: 0.099
Epoch: 03 | Time: 0.0m 27.99s
	Train Loss: 1.195
	 Val. Loss: 1.196
	 Val. Accuracy: 0.189
Epoch: 04 | Time: 0.0m 27.88s
	Train Loss: 1.020
	 Val. Loss: 1.108
	 Val. Accuracy: 0.227
Epoch: 05 | Time: 0.0m 27.85s
	Train Loss: 0.911
	 Val. Loss: 1.058
	 Val. Accuracy: 0.256
Epoch: 06 | Time: 0.0m 27.72s
	Train Loss: 0.825
	 Val. Loss: 1.009
	 Val. Accuracy: 0.285
Epoch: 07 | Time: 0.0m

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁
val_accuracy,▁▂▄▅▆▆▇▇▇▇█████
val_loss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.41272
val_accuracy,0.35613
val_loss,0.9847


[34m[1mwandb[0m: Agent Starting Run: eqnt770p with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 2 encoder layers to 3 decoder layers
The model has 8,725,826 trainable parameters
Epoch: 01 | Time: 0.0m 18.84s
	Train Loss: 2.661
	 Val. Loss: 1.998
	 Val. Accuracy: 0.006
Epoch: 02 | Time: 0.0m 19.01s
	Train Loss: 1.723
	 Val. Loss: 1.494
	 Val. Accuracy: 0.066
Epoch: 03 | Time: 0.0m 18.85s
	Train Loss: 1.351
	 Val. Loss: 1.317
	 Val. Accuracy: 0.117
Epoch: 04 | Time: 0.0m 18.86s
	Train Loss: 1.165
	 Val. Loss: 1.199
	 Val. Accuracy: 0.165
Epoch: 05 | Time: 0.0m 19.03s
	Train Loss: 1.042
	 Val. Loss: 1.129
	 Val. Accuracy: 0.214
Epoch: 06 | Time: 0.0m 18.77s
	Train Loss: 0.948
	 Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
val_accuracy,▁▂▃▄▅▆▆▇▇▇▇▇███
val_loss,█▅▃▃▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.56174
val_accuracy,0.34029
val_loss,0.966


[34m[1mwandb[0m: Agent Starting Run: 2it2uiu5 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	clip: 0.1
[34m[1mwandb[0m: 	decoder_layers: 1
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.3


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
The model has 3,212,098 trainable parameters
Epoch: 01 | Time: 0.0m 35.97s
	Train Loss: 2.308
	 Val. Loss: 1.756
	 Val. Accuracy: 0.036
Epoch: 02 | Time: 0.0m 35.90s
	Train Loss: 1.530
	 Val. Loss: 1.412
	 Val. Accuracy: 0.103
Epoch: 03 | Time: 0.0m 35.81s
	Train Loss: 1.246
	 Val. Loss: 1.242
	 Val. Accuracy: 0.166
Epoch: 04 | Time: 0.0m 35.89s
	Train Loss: 1.074
	 Val. Loss: 1.142
	 Val. Accuracy: 0.211
Epoch: 05 | Time: 0.0m 35.81s
	Train Loss: 0.968
	 Val. Loss: 1.098
	 Val. Accuracy: 0.235
Epoch: 06 | Time: 0.0m 35.89s
	Train Loss: 0.882
	 Val. Loss: 1.048
	 Val. Accuracy: 0.263
Epoch: 07 | Time: 0.0m

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▃▂▂▂▂▂▁▁▁▁
val_accuracy,▁▂▄▅▅▆▆▇▇▇▇████
val_loss,█▅▃▃▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,14.0
train_loss,0.48379
val_accuracy,0.352
val_loss,0.97644


[34m[1mwandb[0m: Agent Starting Run: 03u4qjpd with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	beam_size: 1
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	clip: 5
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	emb_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 3
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	hidden_dim: 512
[34m[1mwandb[0m: 	lr: 0.0001
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


  return op(a, b)


Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Latin vocabulary size: 29
Devanagari vocabulary size: 66
Special tokens: PAD=<PAD> (idx=0), SOS=<SOS> (idx=1), EOS=<EOS> (idx=2)
Creating adapter from 3 encoder layers to 2 decoder layers
The model has 8,725,314 trainable parameters
Epoch: 01 | Time: 0.0m 28.38s
	Train Loss: 2.213
	 Val. Loss: 1.653
	 Val. Accuracy: 0.076
Epoch: 02 | Time: 0.0m 28.07s
	Train Loss: 1.282
	 Val. Loss: 1.341
	 Val. Accuracy: 0.165
Epoch: 03 | Time: 0.0m 28.07s
	Train Loss: 1.014
	 Val. Loss: 1.225
	 Val. Accuracy: 0.227
Epoch: 04 | Time: 0.0m 28.05s
	Train Loss: 0.871
	 Val. Loss: 1.140
	 Val. Accuracy: 0.271
Epoch: 05 | Time: 0.0m 28.10s
	Train Loss: 0.766
	 Val. Loss: 1.104
	 Val. Accuracy: 0.296
Epoch: 06 | Time: 0.0m 28.20s
	Train Loss: 0.694
	 Val

0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▅▄▃▃▂▂▂▂▂▁▁▁▁▁
val_accuracy,▁▃▅▆▆▇▇▇█▇██▇██
val_loss,█▄▃▂▂▁▁▁▁▁▁▁▂▁▂

0,1
epoch,14.0
train_loss,0.3369
val_accuracy,0.3637
val_loss,1.11681
