In [1]:
# Install required packages
!pip install wandb tqdm seaborn matplotlib

# Import necessary libraries
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wandb
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
import random
import io
from sklearn.model_selection import train_test_split 
import time



In [2]:
import torch.nn.functional as F


In [3]:
# Login to Weights & Biases using Kaggle secrets
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("WANDB_API_KEY")
os.environ["WANDB_API_KEY"] = wandb_api

# Set thread start method for wandb to avoid errors
os.environ["WANDB_START_METHOD"] = "thread"

# Verify login
wandb.login()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mteja_sai[0m ([33mteja_sai-indian-institute-of-technology-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
class TransliterationDataset(Dataset):
    """Custom Dataset for Transliteration pairs with attestation counts."""
    def __init__(self, source_texts, target_texts, attestation_counts, source_vocab, target_vocab, max_len=50):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.attestation_counts = attestation_counts  # Added attestation counts
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.max_len = max_len

        # Get special token indices, assuming they are always present after load_dakshina_data
        self.pad_idx = self.target_vocab.get('<PAD>', 0)
        self.unk_idx = self.target_vocab.get('<UNK>', 1)
        self.sos_idx = self.target_vocab.get('< SOS >', 2)
        self.eos_idx = self.target_vocab.get('<EOS>', 3)

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        source_text = self.source_texts[idx]
        target_text = self.target_texts[idx]
        attestation_count = self.attestation_counts[idx]

        # Convert non-string values to strings safely
        source_text = str(source_text) if not isinstance(source_text, str) else source_text
        target_text = str(target_text) if not isinstance(target_text, str) else target_text

        # Convert characters to indices using .get with UNK fallback
        source_indices = [self.source_vocab.get(char, self.unk_idx) for char in source_text]
        target_indices = [self.target_vocab.get(char, self.unk_idx) for char in target_text]

        # Add SOS and EOS tokens to target sequence
        target_indices = [self.sos_idx] + target_indices + [self.eos_idx]

        # Truncate sequences if longer than max_len
        source_indices = source_indices[:self.max_len]
        target_indices = target_indices[:self.max_len]

        # Pad sequences to max_len using the PAD index
        source_indices += [self.pad_idx] * (self.max_len - len(source_indices))
        target_indices += [self.pad_idx] * (self.max_len - len(target_indices))

        return {
            'source': torch.tensor(source_indices, dtype=torch.long),
            'target': torch.tensor(target_indices, dtype=torch.long),
            'source_text': source_text,
            'target_text': target_text,
            'attestation': torch.tensor(attestation_count, dtype=torch.float)  # Added attestation count as tensor
        }

def load_dakshina_data(language='ta', base_dir='/kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0', max_len=50):
    """
    Load data from the Dakshina dataset for a specific language.
    language: language code (e.g., 'ta' for Tamil)
    base_dir: base directory containing the dataset structure
    max_len: maximum sequence length for padding/truncation
    """
    # Define file paths based on Dakshina dataset structure
    train_file = os.path.join(base_dir, language, "lexicons", f"{language}.translit.sampled.train.tsv")
    dev_file = os.path.join(base_dir, language, "lexicons", f"{language}.translit.sampled.dev.tsv")
    test_file = os.path.join(base_dir, language, "lexicons", f"{language}.translit.sampled.test.tsv")
    
    print(f"Looking for training file at: {train_file}")
    
    # Check if files exist
    if not os.path.exists(train_file):
        print(f"Error: Training file not found at {train_file}")
        print(f"Current working directory: {os.getcwd()}")
        print(f"Files in {os.path.dirname(train_file)} (if directory exists):")
        try:
            if os.path.exists(os.path.dirname(train_file)):
                print(os.listdir(os.path.dirname(train_file)))
            else:
                print(f"Directory {os.path.dirname(train_file)} does not exist")
        except Exception as e:
            print(f"Error listing directory: {e}")
        
        # Create a test dataset if the real one can't be found
        return create_test_dataset(max_len=max_len)
    
    # Load data with error handling
    try:
        # In Dakshina, the format is: native_script \t romanization \t attestation_count
        # We want source (romanization) -> target (native_script)
        train_df = pd.read_csv(train_file, sep='\t', header=None, 
                              names=['target', 'source', 'attestation'], 
                              keep_default_na=False, on_bad_lines='skip')
        dev_df = pd.read_csv(dev_file, sep='\t', header=None, 
                            names=['target', 'source', 'attestation'], 
                            keep_default_na=False, on_bad_lines='skip')
        test_df = pd.read_csv(test_file, sep='\t', header=None, 
                             names=['target', 'source', 'attestation'], 
                             keep_default_na=False, on_bad_lines='skip')

        # Convert attestation counts to integers
        train_df['attestation'] = train_df['attestation'].astype(int)
        dev_df['attestation'] = dev_df['attestation'].astype(int)
        test_df['attestation'] = test_df['attestation'].astype(int)

        # Convert any non-string values to strings explicitly
        train_df['source'] = train_df['source'].apply(str)
        train_df['target'] = train_df['target'].apply(str)
        dev_df['source'] = dev_df['source'].apply(str)
        dev_df['target'] = dev_df['target'].apply(str)
        test_df['source'] = test_df['source'].apply(str)
        test_df['target'] = test_df['target'].apply(str)

        # Build vocabularies from the training data
        source_chars = set()
        target_chars = set()

        for text in train_df['source']:
            if isinstance(text, str):
                source_chars.update(text)

        for text in train_df['target']:
            if isinstance(text, str):
                target_chars.update(text)

        # Create vocabulary dictionaries with consistent special tokens
        source_vocab = {}
        target_vocab = {}

        # Add special tokens first with known indices
        special_tokens = ['<PAD>', '<UNK>', '< SOS >', '<EOS>']
        for i, token in enumerate(special_tokens):
            source_vocab[token] = i
            target_vocab[token] = i

        # Add sorted unique characters from data
        for char in sorted(list(source_chars)):
            if char not in source_vocab:
                source_vocab[char] = len(source_vocab)

        for char in sorted(list(target_chars)):
            if char not in target_vocab:
                target_vocab[char] = len(target_vocab)

        # Print debug information about the vocabularies
        print("Special tokens in vocabulary:")
        print(f"Source vocab keys: {list(source_vocab.keys())[:10]}")
        print(f"Target vocab keys: {list(target_vocab.keys())[:10]}")

        # Create inverse vocabularies for decoding
        inv_source_vocab = {v: k for k, v in source_vocab.items()}
        inv_target_vocab = {v: k for k, v in target_vocab.items()}

        # Create datasets using the loaded data and created vocabs
        # Include attestation counts in the dataset creation
        train_dataset = TransliterationDataset(
            train_df['source'].tolist(),
            train_df['target'].tolist(),
            train_df['attestation'].tolist(),  # Pass attestation counts
            source_vocab,
            target_vocab,
            max_len=max_len
        )

        dev_dataset = TransliterationDataset(
            dev_df['source'].tolist(),
            dev_df['target'].tolist(),
            dev_df['attestation'].tolist(),  # Pass attestation counts
            source_vocab,
            target_vocab,
            max_len=max_len
        )

        test_dataset = TransliterationDataset(
            test_df['source'].tolist(),
            test_df['target'].tolist(),
            test_df['attestation'].tolist(),  # Pass attestation counts
            source_vocab,
            target_vocab,
            max_len=max_len
        )

        print(f"Successfully loaded Dakshina dataset for {language}")
        print(f"Train set: {len(train_dataset)} examples")
        print(f"Dev set: {len(dev_dataset)} examples")
        print(f"Test set: {len(test_dataset)} examples")
        print(f"Source vocabulary size: {len(source_vocab)}")
        print(f"Target vocabulary size: {len(target_vocab)}")
        print(f"Max sequence length: {max_len}")

        return {
            'train_dataset': train_dataset,
            'dev_dataset': dev_dataset,
            'test_dataset': test_dataset,
            'source_vocab': source_vocab,
            'target_vocab': target_vocab,
            'inv_source_vocab': inv_source_vocab,
            'inv_target_vocab': inv_target_vocab,
            'max_len': max_len
        }

    except Exception as e:
        print(f"Error loading data: {e}")
        import traceback
        traceback.print_exc()
        return create_test_dataset(max_len=max_len)

def create_test_dataset(max_len=50):
    """Create a small test dataset for debugging purposes"""
    print("Creating a minimal test dataset for debugging...")
    
    # Example data with attestation counts
    source_texts = ["hello", "world", "test", "longerword", "sampledata", "another"]
    target_texts = ["ஹலோ", "உலகம்", "சோதனை", "நீண்டசொல்", "மாதிரிதரவு", "மற்றொன்று"]
    attestation_counts = [2, 1, 3, 1, 2, 1]  # Example attestation counts

    # Ensure max_len is at least long enough for the longest example + SOS/EOS
    min_required_len = max(max(len(s) for s in source_texts), max(len(t) for t in target_texts)) + 2
    current_max_len = max(max_len, min_required_len)
    if current_max_len > max_len:
        print(f"Warning: Adjusted max_len from {max_len} to {current_max_len} for test data.")
        max_len = current_max_len

    source_chars = set("".join(source_texts))
    target_chars = set("".join(target_texts))

    # Create vocab with consistent special tokens
    source_vocab = {}
    target_vocab = {}
    special_tokens = ['<PAD>', '<UNK>', '< SOS >', '<EOS>']
    for i, token in enumerate(special_tokens):
        source_vocab[token] = i
        target_vocab[token] = i

    for char in sorted(list(source_chars)):
        if char not in source_vocab:
            source_vocab[char] = len(source_vocab)

    for char in sorted(list(target_chars)):
        if char not in target_vocab:
            target_vocab[char] = len(target_vocab)

    inv_source_vocab = {v: k for k, v in source_vocab.items()}
    inv_target_vocab = {v: k for k, v in target_vocab.items()}

    # Split into train, dev, test (using fixed small splits)
    train_src = source_texts[:4]
    train_tgt = target_texts[:4]
    train_att = attestation_counts[:4]
    
    dev_src = source_texts[4:5]
    dev_tgt = target_texts[4:5]
    dev_att = attestation_counts[4:5]
    
    test_src = source_texts[5:]
    test_tgt = target_texts[5:]
    test_att = attestation_counts[5:]

    train_dataset = TransliterationDataset(train_src, train_tgt, train_att, source_vocab, target_vocab, max_len=max_len)
    dev_dataset = TransliterationDataset(dev_src, dev_tgt, dev_att, source_vocab, target_vocab, max_len=max_len)
    test_dataset = TransliterationDataset(test_src, test_tgt, test_att, source_vocab, target_vocab, max_len=max_len)

    print("Created minimal test dataset with:")
    print(f"Train set: {len(train_dataset)} examples")
    print(f"Dev set: {len(dev_dataset)} examples")
    print(f"Test set: {len(test_dataset)} examples")
    print(f"Source vocabulary size: {len(source_vocab)}")
    print(f"Target vocabulary size: {len(target_vocab)}")
    print(f"Max sequence length: {max_len}")

    return {
        'train_dataset': train_dataset,
        'dev_dataset': dev_dataset,
        'test_dataset': test_dataset,
        'source_vocab': source_vocab,
        'target_vocab': target_vocab,
        'inv_source_vocab': inv_source_vocab,
        'inv_target_vocab': inv_target_vocab,
        'max_len': max_len
    }

def get_dataloaders(data_dict, batch_size=32):
    """Create DataLoaders for train, dev, and test sets"""
    train_loader = DataLoader(
        data_dict['train_dataset'],
        batch_size=batch_size,
        shuffle=True,
    )

    dev_loader = DataLoader(
        data_dict['dev_dataset'],
        batch_size=batch_size,
        shuffle=False,
    )

    test_loader = DataLoader(
        data_dict['test_dataset'],
        batch_size=batch_size,
        shuffle=False,
    )

    return train_loader, dev_loader, test_loader


In [5]:
# List files in the input directory
import os
print(os.listdir("/kaggle/input/dakshina-dataset-v1-0-tar"))


['dakshina_dataset_v1.0']


In [6]:
# Encoder
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers=1, cell_type='lstm', dropout=0):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type.lower()
        
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        
        if self.cell_type == 'lstm':
            self.rnn = nn.LSTM(
                embedding_size, 
                hidden_size, 
                num_layers=num_layers, 
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True
            )
        elif self.cell_type == 'gru':
            self.rnn = nn.GRU(
                embedding_size, 
                hidden_size, 
                num_layers=num_layers, 
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True
            )
        else:  # vanilla RNN
            self.rnn = nn.RNN(
                embedding_size, 
                hidden_size, 
                num_layers=num_layers, 
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True,
                nonlinearity='tanh'
            )
    
    def forward(self, x):
        # x shape: (batch_size, seq_length)
        embedded = self.dropout(self.embedding(x))
        # embedded shape: (batch_size, seq_length, embedding_size)
        
        if self.cell_type == 'lstm':
            outputs, (hidden, cell) = self.rnn(embedded)
            return outputs, hidden, cell
        else:
            outputs, hidden = self.rnn(embedded)
            return outputs, hidden, None

# Decoder
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers=1, cell_type='lstm', dropout=0):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.cell_type = cell_type.lower()
        
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        
        if self.cell_type == 'lstm':
            self.rnn = nn.LSTM(
                embedding_size, 
                hidden_size, 
                num_layers=num_layers, 
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True
            )
        elif self.cell_type == 'gru':
            self.rnn = nn.GRU(
                embedding_size, 
                hidden_size, 
                num_layers=num_layers, 
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True
            )
        else:  # vanilla RNN
            self.rnn = nn.RNN(
                embedding_size, 
                hidden_size, 
                num_layers=num_layers, 
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True,
                nonlinearity='tanh'
            )
            
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden, cell=None):
        # x shape: (batch_size)
        x = x.unsqueeze(1)
        
        embedded = self.dropout(self.embedding(x))
        # embedded shape: (batch_size, 1, embedding_size)
        
        if self.cell_type == 'lstm':
            output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
            prediction = self.fc(output.squeeze(1))
            return prediction, hidden, cell
        else:
            output, hidden = self.rnn(embedded, hidden)
            prediction = self.fc(output.squeeze(1))
            return prediction, hidden, None

# Attention
class Attention(nn.Module):
    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(encoder_hidden_dim + decoder_hidden_dim, decoder_hidden_dim)
        self.v = nn.Linear(decoder_hidden_dim, 1, bias=False)
        
    def forward(self, hidden, encoder_outputs):
        # hidden: [batch_size, decoder_hidden_dim]
        # encoder_outputs: [batch_size, src_len, encoder_hidden_dim]
        
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        
        # Repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        # Calculate energy
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        
        # Get attention weights
        return F.softmax(attention, dim=1)

# Attention Decoder
class AttentionDecoder(nn.Module):
    def __init__(self, output_size, embedding_size, encoder_hidden_size, decoder_hidden_size, 
                 num_layers=1, cell_type='lstm', dropout=0):
        super(AttentionDecoder, self).__init__()
        self.output_size = output_size
        self.decoder_hidden_size = decoder_hidden_size
        self.num_layers = num_layers
        self.cell_type = cell_type.lower()
        
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.attention = Attention(encoder_hidden_size, decoder_hidden_size)
        self.dropout = nn.Dropout(dropout)
        
        # Input to the RNN will be embedding + context vector
        if self.cell_type == 'lstm':
            self.rnn = nn.LSTM(
                embedding_size + encoder_hidden_size, 
                decoder_hidden_size, 
                num_layers=num_layers, 
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True
            )
        elif self.cell_type == 'gru':
            self.rnn = nn.GRU(
                embedding_size + encoder_hidden_size, 
                decoder_hidden_size, 
                num_layers=num_layers, 
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True
            )
        else:  # vanilla RNN
            self.rnn = nn.RNN(
                embedding_size + encoder_hidden_size, 
                decoder_hidden_size, 
                num_layers=num_layers, 
                dropout=dropout if num_layers > 1 else 0,
                batch_first=True,
                nonlinearity='tanh'
            )
            
        self.fc = nn.Linear(decoder_hidden_size + encoder_hidden_size + embedding_size, output_size)
    
    def forward(self, x, hidden, encoder_outputs, cell=None):
        # x shape: (batch_size)
        # hidden shape: (num_layers, batch_size, decoder_hidden_size)
        # encoder_outputs shape: (batch_size, src_len, encoder_hidden_size)
        
        x = x.unsqueeze(1)  # (batch_size, 1)
        embedded = self.dropout(self.embedding(x))  # (batch_size, 1, embedding_size)
        
        # Get the last hidden state for attention
        if self.cell_type == 'lstm':
            attn_hidden = hidden[-1]
        else:
            attn_hidden = hidden[-1]
            
        # Calculate attention weights
        attn_weights = self.attention(attn_hidden, encoder_outputs)  # (batch_size, src_len)
        
        # Create context vector by multiplying attention weights with encoder outputs
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # (batch_size, 1, encoder_hidden_size)
        
        # Combine embedded input and context vector
        rnn_input = torch.cat((embedded, context), dim=2)  # (batch_size, 1, embedding_size + encoder_hidden_size)
        
        # Pass through RNN
        if self.cell_type == 'lstm':
            output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        else:
            output, hidden = self.rnn(rnn_input, hidden)
            cell = None
            
        # Final output layer
        # Concatenate output, context and embedded for richer representation
        output = output.squeeze(1)  # (batch_size, decoder_hidden_size)
        context = context.squeeze(1)  # (batch_size, encoder_hidden_size)
        embedded = embedded.squeeze(1)  # (batch_size, embedding_size)
        
        prediction = self.fc(torch.cat((output, context, embedded), dim=1))
        
        return prediction, hidden, cell, attn_weights

# Seq2Seq
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.cell_type = encoder.cell_type
        
    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.shape[0]
        target_len = target.shape[1]
        target_vocab_size = self.decoder.output_size
        
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(self.device)
        
        if self.cell_type == 'lstm':
            encoder_outputs, hidden, cell = self.encoder(source)
            
            # Adjust hidden and cell dimensions if encoder and decoder layers differ
            if self.encoder.num_layers != self.decoder.num_layers:
                # If encoder has fewer layers than decoder, repeat the layers
                if self.encoder.num_layers < self.decoder.num_layers:
                    repeat_factor = self.decoder.num_layers // self.encoder.num_layers
                    hidden = hidden.repeat(repeat_factor, 1, 1)
                    cell = cell.repeat(repeat_factor, 1, 1)
                # If encoder has more layers than decoder, take the last layers
                else:
                    hidden = hidden[-self.decoder.num_layers:]
                    cell = cell[-self.decoder.num_layers:]
        else:
            encoder_outputs, hidden, _ = self.encoder(source)
            cell = None
            
            # Adjust hidden dimensions if encoder and decoder layers differ
            if self.encoder.num_layers != self.decoder.num_layers:
                # If encoder has fewer layers than decoder, repeat the layers
                if self.encoder.num_layers < self.decoder.num_layers:
                    repeat_factor = self.decoder.num_layers // self.encoder.num_layers
                    hidden = hidden.repeat(repeat_factor, 1, 1)
                # If encoder has more layers than decoder, take the last layers
                else:
                    hidden = hidden[-self.decoder.num_layers:]
        
        # First input to the decoder is the < SOS > token
        decoder_input = target[:, 0]
        
        for t in range(1, target_len):
            if self.cell_type == 'lstm':
                decoder_output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            else:
                decoder_output, hidden, _ = self.decoder(decoder_input, hidden)
                
            outputs[:, t, :] = decoder_output
            
            # Teacher forcing: use actual target as next input
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = decoder_output.argmax(1)
            decoder_input = target[:, t] if teacher_force else top1
        
        return outputs

# AttentionSeq2Seq
class AttentionSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(AttentionSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.cell_type = encoder.cell_type
        
    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.shape[0]
        target_len = target.shape[1]
        target_vocab_size = self.decoder.output_size
        src_len = source.shape[1]
        
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(self.device)
        attentions = torch.zeros(batch_size, target_len, src_len).to(self.device)
        
        if self.cell_type == 'lstm':
            encoder_outputs, hidden, cell = self.encoder(source)
            
            # Adjust hidden and cell dimensions if encoder and decoder layers differ
            if self.encoder.num_layers != self.decoder.num_layers:
                # If encoder has fewer layers than decoder, repeat the layers
                if self.encoder.num_layers < self.decoder.num_layers:
                    repeat_factor = self.decoder.num_layers // self.encoder.num_layers
                    hidden = hidden.repeat(repeat_factor, 1, 1)
                    cell = cell.repeat(repeat_factor, 1, 1)
                # If encoder has more layers than decoder, take the last layers
                else:
                    hidden = hidden[-self.decoder.num_layers:]
                    cell = cell[-self.decoder.num_layers:]
        else:
            encoder_outputs, hidden, _ = self.encoder(source)
            cell = None
            
            # Adjust hidden dimensions if encoder and decoder layers differ
            if self.encoder.num_layers != self.decoder.num_layers:
                # If encoder has fewer layers than decoder, repeat the layers
                if self.encoder.num_layers < self.decoder.num_layers:
                    repeat_factor = self.decoder.num_layers // self.encoder.num_layers
                    hidden = hidden.repeat(repeat_factor, 1, 1)
                # If encoder has more layers than decoder, take the last layers
                else:
                    hidden = hidden[-self.decoder.num_layers:]
        
        # First input to the decoder is the < SOS > token
        decoder_input = target[:, 0]
        
        for t in range(1, target_len):
            if self.cell_type == 'lstm':
                decoder_output, hidden, cell, attn_weights = self.decoder(
                    decoder_input, hidden, encoder_outputs, cell
                )
            else:
                decoder_output, hidden, _, attn_weights = self.decoder(
                    decoder_input, hidden, encoder_outputs
                )
                
            outputs[:, t, :] = decoder_output
            attentions[:, t, :] = attn_weights
            
            # Teacher forcing: use actual target as next input
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = decoder_output.argmax(1)
            decoder_input = target[:, t] if teacher_force else top1
        
        return outputs, attentions


In [7]:
def init_weights(m):
    """Initializes model weights."""
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        elif 'bias' in name:
            nn.init.constant_(param.data, 0)

def count_parameters(model):
    """Counts the number of trainable parameters in a model."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def train_epoch(model, iterator, optimizer, criterion, clip, device, teacher_forcing_ratio=0.5):
    """
    Trains the model for one epoch.
    Uses attestation counts as weights for the loss function.
    """
    model.train()
    epoch_loss = 0

    for batch in tqdm(iterator, desc="Training"):
        src = batch['source'].to(device)
        trg = batch['target'].to(device)
        attestation = batch['attestation'].to(device)  # Get attestation counts
        
        optimizer.zero_grad()
        
        if isinstance(model, AttentionSeq2Seq):
            output, _ = model(src, trg, teacher_forcing_ratio)
        else:
            output = model(src, trg, teacher_forcing_ratio)

        output_dim = output.shape[-1]

        # Slice the output and target to remove the first timestep
        output_seq = output[:, 1:, :]
        trg_seq = trg[:, 1:]
        
        # Check for empty sequences after slicing
        if trg_seq.numel() == 0:
            continue

        # Calculate unweighted loss
        loss = criterion(output_seq.reshape(-1, output_dim), trg_seq.reshape(-1))
        
        # Apply attestation weights to the loss
        seq_len = trg_seq.shape[1]
        attestation_weights = attestation.repeat_interleave(seq_len)
        
        # Normalize weights to sum to batch size
        attestation_weights = attestation_weights * (attestation_weights.size(0) / attestation_weights.sum())
        
        # Apply weights to loss
        weighted_loss = (loss * attestation_weights).mean()
        
        weighted_loss.backward()
        
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        epoch_loss += weighted_loss.item()

    return epoch_loss / len(iterator) if len(iterator) > 0 else 0.0

def evaluate(model, iterator, criterion, device):
    """
    Evaluates the model on a given dataset iterator.
    Calculates average loss over the dataset, using attestation counts as weights.
    """
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for batch in tqdm(iterator, desc="Evaluating Loss"):
            src = batch['source'].to(device)
            trg = batch['target'].to(device)
            attestation = batch['attestation'].to(device)  # Get attestation counts

            # Pass source and target to the model
            if isinstance(model, AttentionSeq2Seq):
                output, _ = model(src, trg, 0)
            else:
                output = model(src, trg, 0)

            output_dim = output.shape[-1]

            # Slice the output and target to remove the first timestep (SOS)
            output_seq = output[:, 1:, :]
            trg_seq = trg[:, 1:]

            # Check for empty sequences after slicing
            if trg_seq.numel() == 0:
                continue

            # Calculate unweighted loss
            loss = criterion(output_seq.reshape(-1, output_dim), trg_seq.reshape(-1))
            
            # Apply attestation weights to the loss
            seq_len = trg_seq.shape[1]
            attestation_weights = attestation.repeat_interleave(seq_len)
            
            # Normalize weights to sum to batch size
            attestation_weights = attestation_weights * (attestation_weights.size(0) / attestation_weights.sum())
            
            # Apply weights to loss
            weighted_loss = (loss * attestation_weights).mean()
            
            epoch_loss += weighted_loss.item()

    return epoch_loss / len(iterator) if len(iterator) > 0 else 0.0

def calculate_accuracy(model, iterator, inv_target_vocab, device):
    """
    Calculates word-level accuracy (exact match) and generates predictions.
    Performs greedy decoding step-by-step.
    Includes attestation counts in the predictions.
    """
    model.eval()
    correct = 0
    total = 0
    predictions = []

    # Get special token indices using .get() with fallbacks
    target_vocab = {v: k for k, v in inv_target_vocab.items()}
    
    # Try different possible token names for special tokens
    sos_idx = None
    for token in ['< SOS >', '< SOS >', 'SOS']:
        if token in target_vocab:
            sos_idx = target_vocab[token]
            break
    if sos_idx is None:
        sos_idx = 2  # Default SOS index
        
    eos_idx = None
    for token in ['<EOS>', '< EOS >', 'EOS']:
        if token in target_vocab:
            eos_idx = target_vocab[token]
            break
    if eos_idx is None:
        eos_idx = 3  # Default EOS index
        
    pad_idx = target_vocab.get('<PAD>', 0)
    unk_idx = target_vocab.get('<UNK>', 1)

    # Determine maximum prediction length
    max_prediction_length = getattr(iterator.dataset, 'max_len', 50)
    if max_prediction_length < 10:
        max_prediction_length = 50

    with torch.no_grad():
        for batch in tqdm(iterator, desc="Calculating Accuracy"):
            src = batch['source'].to(device)
            trg_texts = batch['target_text']
            attestation_counts = batch['attestation']  # Get attestation counts

            # Get the actual batch size for the current batch
            current_batch_size = src.shape[0]

            # Handle empty batches gracefully
            if current_batch_size == 0:
                continue

            # --- Encoder Step ---
            if hasattr(model, 'cell_type') and model.cell_type == 'lstm':
                encoder_outputs, hidden, cell = model.encoder(src)
            else:
                encoder_outputs, hidden, _ = model.encoder(src)
                cell = None

            # --- Decoder Step-by-Step Decoding (Greedy Search) ---
            decoder_input = torch.full((current_batch_size,), sos_idx, dtype=torch.long, device=device)

            batch_decoded_indices = [[] for _ in range(current_batch_size)]
            finished_decoding = [False] * current_batch_size
            
            # For storing attention weights if using attention model
            batch_attention_weights = [[] for _ in range(current_batch_size)] if isinstance(model, AttentionSeq2Seq) else None

            for t in range(max_prediction_length):
                if isinstance(model, AttentionSeq2Seq):
                    if hasattr(model, 'cell_type') and model.cell_type == 'lstm':
                        decoder_output, hidden, cell, attn_weights = model.decoder(
                            decoder_input, hidden, encoder_outputs, cell
                        )
                    else:
                        decoder_output, hidden, _, attn_weights = model.decoder(
                            decoder_input, hidden, encoder_outputs
                        )
                else:
                    if hasattr(model, 'cell_type') and model.cell_type == 'lstm':
                        decoder_output, hidden, cell = model.decoder(
                            decoder_input, hidden, cell
                        )
                    else:
                        decoder_output, hidden, _ = model.decoder(
                            decoder_input, hidden
                        )

                # Get the predicted token index for this step
                top1 = decoder_output.argmax(1)
                
                # Update decoded indices and finished status for each sequence in the batch
                for i in range(current_batch_size):
                    if not finished_decoding[i]:
                        predicted_token_idx = top1[i].item()
                        batch_decoded_indices[i].append(predicted_token_idx)
                        
                        # Store attention weights if using attention model
                        if isinstance(model, AttentionSeq2Seq):
                            batch_attention_weights[i].append(attn_weights[i].cpu().numpy())

                        # Check for EOS
                        if predicted_token_idx == eos_idx:
                            finished_decoding[i] = True

                # The input for the *next* step is the tokens predicted in this step
                decoder_input = top1

                # Stop early if all sequences have finished decoding
                if all(finished_decoding):
                    break

            # --- Post-process decoded indices to get predicted strings ---
            for i in range(current_batch_size):
                pred_chars = []
                for idx in batch_decoded_indices[i]:
                    if idx == eos_idx:
                        break
                    if idx != pad_idx and idx != unk_idx:
                        pred_char = inv_target_vocab.get(idx, None)
                        if pred_char is not None and pred_char not in ['<PAD>', '<UNK>', '< SOS >', '<EOS>', '< SOS >']:
                            pred_chars.append(pred_char)

                pred_text = ''.join(pred_chars)
                original_target_text = trg_texts[i]
                is_correct = (pred_text == original_target_text)
                
                if is_correct:
                    correct += 1
                total += 1

                # Store prediction details with attestation count
                prediction_info = {
                    'source': batch['source_text'][i],
                    'target': batch['target_text'][i],
                    'prediction': pred_text,
                    'correct': is_correct,
                    'attestation': attestation_counts[i].item()  # Include attestation count
                }
                
                # Add attention weights if available
                if isinstance(model, AttentionSeq2Seq):
                    prediction_info['attention_weights'] = batch_attention_weights[i]
                
                predictions.append(prediction_info)

    accuracy = correct / total if total > 0 else 0.0
    return accuracy, predictions

def epoch_time(start_time, end_time):
    """Calculates elapsed time in minutes and seconds."""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [8]:
def train_model(config=None):
    """
    Main training function with descriptive run names.
    """
    # Generate a descriptive run name
    run_name = None
    if config:
        # Model type
        model_type = "attention" if getattr(config, 'use_attention', False) else "vanilla"
        
        # Cell type
        cell_type = getattr(config, 'cell_type', 'lstm')
        
        # Architecture details
        emb_size = getattr(config, 'embedding_size', 64)
        hid_size = getattr(config, 'hidden_size', 128)
        
        # Layer information
        num_layers = getattr(config, 'num_layers', 1)
        
        # Training parameters
        optimizer_name = getattr(config, 'optimizer', 'adam')
        lr = getattr(config, 'learning_rate', 0.001)
        dropout = getattr(config, 'dropout', 0.0)
        batch_size = getattr(config, 'batch_size', 64)
        
        # Create meaningful run name
        run_name = f"{model_type}_{cell_type}_emb{emb_size}_hid{hid_size}_layers{num_layers}_drop{dropout}_{optimizer_name}_lr{lr:.6f}_batch{batch_size}"
        print(f"Generated run name: {run_name}")
    
    # Set thread start method for wandb
    os.environ["WANDB_START_METHOD"] = "thread"
    
    # Initialize wandb with the descriptive run name - IMPORTANT: Don't use with statement
    wandb_run = wandb.init(project="transliteration-seq2seq", config=config, name=run_name, settings=wandb.Settings(start_method="thread"))
    print(f"Actual wandb run name: {wandb_run.name}")
    
    config = wandb.config

    print(f"Starting training run with config: {config}")

    # --- Data Loading ---
    data_dict = load_dakshina_data(
        language=config.language,
        base_dir=DATA_DIR,
        max_len=getattr(config, 'max_seq_len', 50)
    )

    # Handle potential data loading failure
    if data_dict is None or not data_dict['train_dataset'] or len(data_dict['train_dataset']) < config.batch_size:
        print("Failed to load data or train dataset is too small. Exiting training.")
        if wandb.run:
             wandb.log({"train_loss": float('nan'), "valid_loss": float('nan'), "valid_accuracy": 0.0, "test_accuracy": 0.0})
             wandb.run.finish(exit_code=1)
        return None, 0, []

    train_loader, dev_loader, test_loader = get_dataloaders(
        data_dict,
        batch_size=config.batch_size
    )

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # --- Model Creation ---
    input_size = len(data_dict['source_vocab'])
    output_size = len(data_dict['target_vocab'])
    pad_idx = data_dict['target_vocab'].get('<PAD>', 0)

    # Get number of layers
    num_layers = getattr(config, 'num_layers', 1)

    # Create encoder
    encoder = Encoder(
        input_size=input_size,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_layers=num_layers,
        cell_type=config.cell_type,
        dropout=config.dropout
    ).to(device)

    # Create decoder (Attention or Vanilla)
    if config.use_attention:
        decoder = AttentionDecoder(
            output_size=output_size,
            embedding_size=config.embedding_size,
            encoder_hidden_size=config.hidden_size,
            decoder_hidden_size=config.hidden_size,
            num_layers=num_layers,
            cell_type=config.cell_type,
            dropout=config.dropout
        ).to(device)
        model = AttentionSeq2Seq(encoder, decoder, device).to(device)
    else:
        decoder = Decoder(
            output_size=output_size,
            embedding_size=config.embedding_size,
            hidden_size=config.hidden_size,
            num_layers=num_layers,
            cell_type=config.cell_type,
            dropout=config.dropout
        ).to(device)
        model = Seq2Seq(encoder, decoder, device).to(device)

    # Initialize weights
    model.apply(init_weights)

    # Print model info
    print(f'The model has {count_parameters(model):,} trainable parameters')
    wandb.log({"trainable_parameters": count_parameters(model)})

    # Define optimizer and criterion
    optimizer_name = getattr(config, 'optimizer', 'adam').lower()
    
    if optimizer_name == 'adam':
        optimizer = optim.Adam(
            model.parameters(), 
            lr=config.learning_rate,
            weight_decay=getattr(config, 'weight_decay', 0)
        )
    elif optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(
            model.parameters(), 
            lr=config.learning_rate,
            alpha=getattr(config, 'rmsprop_alpha', 0.99),
            eps=getattr(config, 'rmsprop_eps', 1e-8),
            weight_decay=getattr(config, 'weight_decay', 0)
        )
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            model.parameters(), 
            lr=config.learning_rate,
            momentum=getattr(config, 'momentum', 0),
            weight_decay=getattr(config, 'weight_decay', 0)
        )
    else:
        print(f"Warning: Unknown optimizer '{optimizer_name}'. Using Adam.")
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    
    # Log the optimizer type
    wandb.log({"optimizer_type": optimizer_name})
    
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='none')  # Use 'none' to apply attestation weights

    # --- Learning Rate Scheduler ---
    scheduler_name = getattr(config, 'scheduler', 'none').lower()
    scheduler = None
    
    if scheduler_name == 'plateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 
            mode='min', 
            factor=getattr(config, 'scheduler_factor', 0.1),
            patience=getattr(config, 'scheduler_patience', 10),
            verbose=True
        )
    elif scheduler_name == 'cosine':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=getattr(config, 'scheduler_t_max', config.n_epochs),
            eta_min=getattr(config, 'scheduler_eta_min', 0)
        )
    elif scheduler_name != 'none':
        print(f"Warning: Unknown scheduler '{scheduler_name}'. Not using a scheduler.")

    # --- Training Loop ---
    best_valid_loss = float('inf')
    best_valid_accuracy = 0.0

    for epoch in range(config.n_epochs):
        start_time = time.time()

        train_loss = train_epoch(
            model,
            train_loader,
            optimizer,
            criterion,
            config.clip,
            device,
            config.teacher_forcing_ratio
        )

        # Evaluate on development set
        valid_loss = evaluate(model, dev_loader, criterion, device)

        # Calculate accuracy on validation set
        valid_accuracy, _ = calculate_accuracy(
            model,
            dev_loader,
            data_dict['inv_target_vocab'],
            device
        )

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Step the scheduler if using one
        if scheduler is not None:
            if scheduler_name == 'plateau':
                scheduler.step(valid_loss)
            else:
                scheduler.step()
        
        # Get current learning rate
        current_lr = optimizer.param_groups[0]['lr']

        # Log epoch metrics to wandb
        wandb.log({
            "train_loss": train_loss,
            "valid_loss": valid_loss,
            "valid_accuracy": valid_accuracy,
            "epoch": epoch,
            "epoch_time_min": epoch_mins,
            "epoch_time_sec": epoch_secs,
            "learning_rate": current_lr
            })


In [9]:
def train_model(config=None): 
    """
    Main training function with descriptive run names.
    """
    # Generate a descriptive run name
    run_name = None
    if config:
        # Model type
        model_type = "attention" if getattr(config, 'use_attention', False) else "vanilla"
        
        # Cell type
        cell_type = getattr(config, 'cell_type', 'lstm')
        
        # Architecture details
        emb_size = getattr(config, 'embedding_size', 64)
        hid_size = getattr(config, 'hidden_size', 128)
        
        # Layer information
        num_layers = getattr(config, 'num_layers', 1)
        
        # Training parameters
        optimizer_name = getattr(config, 'optimizer', 'adam')
        lr = getattr(config, 'learning_rate', 0.001)
        dropout = getattr(config, 'dropout', 0.0)
        batch_size = getattr(config, 'batch_size', 64)
        
        # Create meaningful run name
        run_name = f"{model_type}_{cell_type}_emb{emb_size}_hid{hid_size}_layers{num_layers}_drop{dropout}_{optimizer_name}_lr{lr:.6f}_batch{batch_size}"
        print(f"Generated run name: {run_name}")
    
    # Set thread start method for wandb
    os.environ["WANDB_START_METHOD"] = "thread"
    
    wandb_run = wandb.init(project="transliteration-seq2seq", config=config, settings=wandb.Settings(start_method="thread"))

    # Override run name for sweep
    if run_name and wandb.run:
        wandb.run.name = run_name
        wandb.run.save()
    
    print(f"Actual wandb run name: {wandb.run.name}")

    config = wandb.config

    print(f"Starting training run with config: {config}")

    # --- Data Loading ---
    data_dict = load_dakshina_data(
        language=config.language,
        base_dir=DATA_DIR,
        max_len=getattr(config, 'max_seq_len', 50)
    )

    # Handle potential data loading failure
    if data_dict is None or not data_dict['train_dataset'] or len(data_dict['train_dataset']) < config.batch_size:
        print("Failed to load data or train dataset is too small. Exiting training.")
        if wandb.run:
             wandb.log({"train_loss": float('nan'), "valid_loss": float('nan'), "valid_accuracy": 0.0, "test_accuracy": 0.0})
             wandb.run.finish(exit_code=1)
        return None, 0, []

    train_loader, dev_loader, test_loader = get_dataloaders(
        data_dict,
        batch_size=config.batch_size
    )

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # --- Model Creation ---
    input_size = len(data_dict['source_vocab'])
    output_size = len(data_dict['target_vocab'])
    pad_idx = data_dict['target_vocab'].get('<PAD>', 0)

    # Get number of layers
    num_layers = getattr(config, 'num_layers', 1)

    # Create encoder
    encoder = Encoder(
        input_size=input_size,
        embedding_size=config.embedding_size,
        hidden_size=config.hidden_size,
        num_layers=num_layers,
        cell_type=config.cell_type,
        dropout=config.dropout
    ).to(device)

    # Create decoder (Attention or Vanilla)
    if config.use_attention:
        decoder = AttentionDecoder(
            output_size=output_size,
            embedding_size=config.embedding_size,
            encoder_hidden_size=config.hidden_size,
            decoder_hidden_size=config.hidden_size,
            num_layers=num_layers,
            cell_type=config.cell_type,
            dropout=config.dropout
        ).to(device)
        model = AttentionSeq2Seq(encoder, decoder, device).to(device)
    else:
        decoder = Decoder(
            output_size=output_size,
            embedding_size=config.embedding_size,
            hidden_size=config.hidden_size,
            num_layers=num_layers,
            cell_type=config.cell_type,
            dropout=config.dropout
        ).to(device)
        model = Seq2Seq(encoder, decoder, device).to(device)

    # Initialize weights
    model.apply(init_weights)

    # Print model info
    print(f'The model has {count_parameters(model):,} trainable parameters')
    wandb.log({"trainable_parameters": count_parameters(model)})

    # Define optimizer and criterion
    optimizer_name = getattr(config, 'optimizer', 'adam').lower()
    
    if optimizer_name == 'adam':
        optimizer = optim.Adam(
            model.parameters(), 
            lr=config.learning_rate,
            weight_decay=getattr(config, 'weight_decay', 0)
        )
    elif optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(
            model.parameters(), 
            lr=config.learning_rate,
            alpha=getattr(config, 'rmsprop_alpha', 0.99),
            eps=getattr(config, 'rmsprop_eps', 1e-8),
            weight_decay=getattr(config, 'weight_decay', 0)
        )
    elif optimizer_name == 'sgd':
        optimizer = optim.SGD(
            model.parameters(), 
            lr=config.learning_rate,
            momentum=getattr(config, 'momentum', 0),
            weight_decay=getattr(config, 'weight_decay', 0)
        )
    else:
        print(f"Warning: Unknown optimizer '{optimizer_name}'. Using Adam.")
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    
    # Log the optimizer type
    wandb.log({"optimizer_type": optimizer_name})
    
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='none')  # Use 'none' to apply attestation weights

    # --- Learning Rate Scheduler ---
    scheduler_name = getattr(config, 'scheduler', 'none').lower()
    scheduler = None
    
    if scheduler_name == 'plateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 
            mode='min', 
            factor=getattr(config, 'scheduler_factor', 0.1),
            patience=getattr(config, 'scheduler_patience', 10),
            verbose=True
        )
    elif scheduler_name == 'cosine':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=getattr(config, 'scheduler_t_max', config.n_epochs),
            eta_min=getattr(config, 'scheduler_eta_min', 0)
        )
    elif scheduler_name != 'none':
        print(f"Warning: Unknown scheduler '{scheduler_name}'. Not using a scheduler.")

    # --- Training Loop ---
    best_valid_loss = float('inf')
    best_valid_accuracy = 0.0

    for epoch in range(config.n_epochs):
        start_time = time.time()

        train_loss = train_epoch(
            model,
            train_loader,
            optimizer,
            criterion,
            config.clip,
            device,
            config.teacher_forcing_ratio
        )

        # Evaluate on development set
        valid_loss = evaluate(model, dev_loader, criterion, device)

        # Calculate accuracy on validation set
        valid_accuracy, _ = calculate_accuracy(
            model,
            dev_loader,
            data_dict['inv_target_vocab'],
            device
        )

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Step the scheduler if using one
        if scheduler is not None:
            if scheduler_name == 'plateau':
                scheduler.step(valid_loss)
            else:
                scheduler.step()
        
        # Get current learning rate
        current_lr = optimizer.param_groups[0]['lr']

        # Log epoch metrics to wandb
        wandb.log({
            "train_loss": train_loss,
            "valid_loss": valid_loss,
            "valid_accuracy": valid_accuracy,
            "epoch": epoch,
            "epoch_time_min": epoch_mins,
            "epoch_time_sec": epoch_secs,
            "learning_rate": current_lr
        })

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.4f}')
        print(f'\t Val. Loss: {valid_loss:.4f}')
        print(f'\t Val. Accuracy: {valid_accuracy:.4f}')
        print(f'\t Learning Rate: {current_lr:.6f}')

        # Save the best model based on validation loss
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_valid_accuracy = valid_accuracy
            model_save_path = f'best-model-{wandb.run.id}.pt'
            torch.save(model.state_dict(), model_save_path)
            print(f"Saved new best model to {model_save_path}")

    print("Training finished.")

    # --- Final Evaluation on Test Set ---
    print("Evaluating best model on test set...")
    model_save_path = f'best-model-{wandb.run.id}.pt'
    if os.path.exists(model_save_path):
         model.to(device)
         model.load_state_dict(torch.load(model_save_path, map_location=device))
         model.eval()

         # Calculate test loss
         test_loss = evaluate(model, test_loader, criterion, device)

         # Calculate test accuracy and get predictions
         test_accuracy, test_predictions = calculate_accuracy(
             model,
             test_loader,
             data_dict['inv_target_vocab'],
             device
         )

         # Log final test metrics
         if wandb.run:
             wandb.log({
                 "test_loss": test_loss,
                 "test_accuracy": test_accuracy
             })
             print(f'Test Loss: {test_loss:.4f}')
             print(f'Test Accuracy: {test_accuracy:.4f}')

             # --- Save Predictions ---
             folder_name = 'predictions/vanilla'
             if config.use_attention:
                 folder_name = 'predictions/attention'

             os.makedirs(folder_name, exist_ok=True)

             predictions_file_path = f'{folder_name}/predictions-{wandb.run.id}.json'
             with open(predictions_file_path, 'w', encoding='utf-8') as f:
                 # Include attestation counts in the saved predictions
                 serializable_predictions = []
                 for i, p in enumerate(test_predictions):
                      serializable_prediction = {
                           'source': p['source'],
                           'target': p['target'],
                           'prediction': p['prediction'],
                           'correct': bool(p['correct']),
                           'attestation': float(p['attestation'])
                      }
                      
                      # Include attention weights if available
                      if 'attention_weights' in p:
                          serializable_prediction['attention_weights'] = [weights.tolist() for weights in p['attention_weights']]
                          
                      serializable_predictions.append(serializable_prediction)
                      
                 json.dump(serializable_predictions, f, ensure_ascii=False, indent=2)
             print(f"Saved predictions to {predictions_file_path}")

             wandb.run.finish()
         return model, test_accuracy, test_predictions
    else:
         print(f"Best model file {model_save_path} not found. Cannot perform test evaluation.")
         if wandb.run and wandb.run.state == 'running':
              if wandb.run.summary.get("test_accuracy") is None:
                  wandb.log({"test_loss": float('nan'), "test_accuracy": 0.0})
              wandb.run.finish(exit_code=1)
         return None, 0, []


In [10]:
LANGUAGE = 'ta'
DATA_DIR = '/kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0'

In [13]:
def run_sweep(sweep_config: dict, count: int = 50):
    """
    Runs a wandb hyperparameter sweep.
    """
    # Set thread start method for wandb
    os.environ["WANDB_START_METHOD"] = "thread"
    
    # Add project name to sweep_config
    sweep_config['project'] = "transliteration-seq2seq"
    
    print(f"Creating sweep with configuration: {sweep_config['method']}")
    sweep_id = wandb.sweep(
        sweep_config
    )
    print(f"Starting sweep with ID: {sweep_id}")
    print(f"Running {count} trials.")
    wandb.agent(sweep_id, train_model, count=count)

In [12]:
vanilla_sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'valid_accuracy', 'goal': 'maximize'},
    'parameters': {
    # Fixed parameters for the sweep
    'language': {'value': LANGUAGE},
    'data_dir': {'value': DATA_DIR},
    'max_seq_len': {'value': 50},
    'use_attention': {'value': False},
        # Model architecture parameters
    'embedding_size': {'values': 64, 128, 256},
    'hidden_size': {'values': 64, 128, 256},
    'num_layers': {'values': 3, 2, 1, 4, 5},  # Same number of layers for encoder and decoder
    'cell_type': {'values': ['rnn', 'lstm', 'gru']},
    'dropout': {'values': [0.0, 0.2, 0.3]},
    
    # Optimizer parameters
    'optimizer': {'values': ['adam', 'rmsprop', 'sgd']},
    'learning_rate': {'distribution': 'log_uniform_values', 'min': 0.0001, 'max': 0.01},
    'weight_decay': {'values': [0, 0.0001, 0.001]},
    
    # Optimizer-specific parameters
    'rmsprop_alpha': {'value': 0.99},
    'momentum': {'values': [0, 0.9]},  # For SGD
    
    # Learning rate scheduler
    'scheduler': {'values': ['none', 'plateau', 'cosine']},
    'scheduler_factor': {'value': 0.5},  # For plateau
    'scheduler_patience': {'value': 5},  # For plateau
    'scheduler_t_max': {'value': 10},    # For cosine
    
    # Training parameters
    'batch_size': {'values': },
    'n_epochs': {'value': 10},  # 10 epochs per run
    'clip': {'value': 1.0},
    'teacher_forcing_ratio': {'values': [0.5, 0.7]}
    }
}


SyntaxError: ':' expected after dictionary key (219011122.py, line 11)

In [None]:
def run_sweep(sweep_config: dict, count: int = 50):
    """
    Runs a wandb hyperparameter sweep.
    """
    # Set thread start method for wandb
    os.environ["WANDB_START_METHOD"] = "thread"
    
    # Add project name to sweep_config
    sweep_config['project'] = "transliteration-seq2seq"
    
    print(f"Creating sweep with configuration: {sweep_config['method']}")
    sweep_id = wandb.sweep(
        sweep_config
    )
    print(f"Starting sweep with ID: {sweep_id}")
    print(f"Running {count} trials.")
    wandb.agent(sweep_id, train_model, count=count)

# Define sweep configuration for vanilla model (no attention)
vanilla_sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'valid_accuracy', 'goal': 'maximize'},
    'parameters': {
        # Fixed parameters for the sweep
        'language': {'value': LANGUAGE},
        'data_dir': {'value': DATA_DIR},
        'max_seq_len': {'value': 50},
        'use_attention': {'value': False},
        
        # Model architecture parameters
        'embedding_size': {'values': [128, 64, 32]},
        'hidden_size': {'values': [256, 128, 64]},
        'num_layers': {'values': [4, 3, 2, 1]},  # Same number of layers for encoder and decoder
        'cell_type': {'values': ['lstm', 'gru', 'rnn']},
        'dropout': {'values': [0.0, 0.2, 0.3]},
        
        # Optimizer parameters
        'optimizer': {'values': ['adam', 'rmsprop', 'sgd']},
        'learning_rate': {'distribution': 'log_uniform_values', 'min': 0.0001, 'max': 0.01},
        'weight_decay': {'values': [0, 0.0001, 0.001]},
        
        # Optimizer-specific parameters
        'rmsprop_alpha': {'value': 0.99},
        'momentum': {'values': [0, 0.9]},  # For SGD
        
        # Learning rate scheduler
        'scheduler': {'values': ['none', 'plateau', 'cosine']},
        'scheduler_factor': {'value': 0.5},  # For plateau
        'scheduler_patience': {'value': 5},  # For plateau
        'scheduler_t_max': {'value': 10},    # For cosine
        
        # Training parameters
        'batch_size': {'values': [32, 64, 128]},
        'n_epochs': {'value': 10},  # 10 epochs per run
        'clip': {'value': 1.0},
        'teacher_forcing_ratio': {'values': [0.5, 0.7]}
    }
}

# Define sweep configuration for attention model
attention_sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'valid_accuracy', 'goal': 'maximize'},
    'parameters': {
        # Fixed parameters for the sweep
        'language': {'value': LANGUAGE},
        'data_dir': {'value': DATA_DIR},
        'max_seq_len': {'value': 50},
        'use_attention': {'value': True},
        
        # Model architecture parameters
        'embedding_size': {'values': [128, 64, 32]},
        'hidden_size': {'values': [256, 128, 64]},
        'num_layers': {'values': [4, 3, 2, 1]},  # Same number of layers for encoder and decoder
        'cell_type': {'values': ['lstm', 'gru', 'rnn']},
        'dropout': {'values': [0.0, 0.2, 0.3]},
        
        # Optimizer parameters
        'optimizer': {'values': ['adam', 'rmsprop', 'sgd']},
        'learning_rate': {'distribution': 'log_uniform_values', 'min': 0.0001, 'max': 0.01},
        'weight_decay': {'values': [0, 0.0001, 0.001]},
        
        # Optimizer-specific parameters
        'rmsprop_alpha': {'value': 0.99},
        'momentum': {'values': [0, 0.9]},  # For SGD
        
        # Learning rate scheduler
        'scheduler': {'values': ['none', 'plateau', 'cosine']},
        'scheduler_factor': {'value': 0.5},  # For plateau
        'scheduler_patience': {'value': 5},  # For plateau
        'scheduler_t_max': {'value': 10},    # For cosine
        
        # Training parameters
        'batch_size': {'values': [32, 64, 128]},
        'n_epochs': {'value': 10},  # 10 epochs per run
        'clip': {'value': 1.0},
        'teacher_forcing_ratio': {'values': [0.5, 0.7]}
    }
}

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Check for GPU availability
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    # Set CUDA flags for better performance
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False
else:
    print("No GPU available, using CPU instead")

# Run the attention sweep with 50 trials
print("Starting attention model sweep (50 trials)...")
run_sweep(attention_sweep_config, count=50)

# Run the vanilla sweep with 50 trials
print("Starting vanilla model sweep (50 trials)...")
run_sweep(vanilla_sweep_config, count=50)

# Optional: Analysis function for after the sweeps are complete
def analyze_sweep_results(sweep_id, project_name="transliteration-seq2seq"):
    """
    Analyze the results of a sweep and generate visualizations.
    """
    api = wandb.Api()
    sweep = api.sweep(f"{wandb.run.entity}/{project_name}/{sweep_id}")
    
    # Get all runs in the sweep
    runs = sweep.runs
    
    # Extract run data
    run_data = []
    for run in runs:
        if run.state == "finished" and run.summary.get("test_accuracy") is not None:
            run_data.append({
                "run_id": run.id,
                "name": run.name,
                "test_accuracy": run.summary.get("test_accuracy", 0),
                "valid_accuracy": run.summary.get("valid_accuracy", 0),
                "config": {k: v for k, v in run.config.items() if not k.startswith('_')}
            })
    
    # Convert to DataFrame
    df = pd.DataFrame(run_data)
    
    if len(df) == 0:
        print("No completed runs found in the sweep.")
        return
    
    # Sort by test accuracy
    df = df.sort_values("test_accuracy", ascending=False)
    
    # Print the best run
    best_run = df.iloc[0]
    print(f"Best run: {best_run['name']} (ID: {best_run['run_id']})")
    print(f"Test accuracy: {best_run['test_accuracy']:.4f}")
    print("Configuration:")
    for k, v in best_run['config'].items():
        print(f"  {k}: {v}")
    
    # Create a plot of test accuracy vs. hyperparameters
    plt.figure(figsize=(15, 10))
    
    # Select numerical hyperparameters
    numerical_params = ['embedding_size', 'hidden_size', 'num_layers', 'learning_rate', 'batch_size', 'dropout']
    
    for i, param in enumerate(numerical_params):
        if param in df.columns:
            plt.subplot(2, 3, i+1)
            sns.scatterplot(x=param, y='test_accuracy', data=df)
            plt.title(f'Test Accuracy vs {param}')
            plt.grid(True)
    
    plt.tight_layout()
    plt.savefig('hyperparameter_analysis.png')
    plt.show()
    
    return df

# Example usage (uncomment to run after sweeps are complete):
# vanilla_results = analyze_sweep_results("YOUR_VANILLA_SWEEP_ID")
# attention_results = analyze_sweep_results("YOUR_ATTENTION_SWEEP_ID")


Using device: cuda
CUDA device: Tesla T4
CUDA device count: 2
Starting attention model sweep (50 trials)...
Creating sweep with configuration: bayes
Create sweep with ID: 8zoszb8i
Sweep URL: https://wandb.ai/teja_sai-indian-institute-of-technology-madras/transliteration-seq2seq/sweeps/8zoszb8i
Starting sweep with ID: 8zoszb8i
Running 50 trials.


[34m[1mwandb[0m: Agent Starting Run: 32i27dqx with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	clip: 1
[34m[1mwandb[0m: 	data_dir: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	language: ta
[34m[1mwandb[0m: 	learning_rate: 0.007358989864160939
[34m[1mwandb[0m: 	max_seq_len: 50
[34m[1mwandb[0m: 	momentum: 0.9
[34m[1mwandb[0m: 	n_epochs: 10
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	rmsprop_alpha: 0.99
[34m[1mwandb[0m: 	scheduler: plateau
[34m[1mwandb[0m: 	scheduler_factor: 0.5
[34m[1mwandb[0m: 	scheduler_patience: 5
[34m[1mwandb[0m: 	scheduler_t_max: 10
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5
[34m[1mwandb[0m: 	use_attention: True
[34m[1mwandb[0m: 	weight_decay: 0


Actual wandb run name: gallant-sweep-1
Starting training run with config: {'batch_size': 128, 'cell_type': 'lstm', 'clip': 1, 'data_dir': '/kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0', 'dropout': 0.3, 'embedding_size': 64, 'hidden_size': 256, 'language': 'ta', 'learning_rate': 0.007358989864160939, 'max_seq_len': 50, 'momentum': 0.9, 'n_epochs': 10, 'num_layers': 2, 'optimizer': 'adam', 'rmsprop_alpha': 0.99, 'scheduler': 'plateau', 'scheduler_factor': 0.5, 'scheduler_patience': 5, 'scheduler_t_max': 10, 'teacher_forcing_ratio': 0.5, 'use_attention': True, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '< SOS >', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '< SOS >', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218



Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 01 | Time: 1m 29s
	Train Loss: 0.2783
	 Val. Loss: 0.1304
	 Val. Accuracy: 0.4599
	 Learning Rate: 0.007359
Saved new best model to best-model-32i27dqx.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 02 | Time: 1m 29s
	Train Loss: 0.0890
	 Val. Loss: 0.1125
	 Val. Accuracy: 0.5155
	 Learning Rate: 0.007359
Saved new best model to best-model-32i27dqx.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 03 | Time: 1m 29s
	Train Loss: 0.0759
	 Val. Loss: 0.1063
	 Val. Accuracy: 0.5483
	 Learning Rate: 0.007359
Saved new best model to best-model-32i27dqx.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 04 | Time: 1m 30s
	Train Loss: 0.0680
	 Val. Loss: 0.1069
	 Val. Accuracy: 0.5541
	 Learning Rate: 0.007359


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 05 | Time: 1m 29s
	Train Loss: 0.0639
	 Val. Loss: 0.1054
	 Val. Accuracy: 0.5698
	 Learning Rate: 0.007359
Saved new best model to best-model-32i27dqx.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 06 | Time: 1m 30s
	Train Loss: 0.0611
	 Val. Loss: 0.1066
	 Val. Accuracy: 0.5740
	 Learning Rate: 0.007359


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 07 | Time: 1m 30s
	Train Loss: 0.0585
	 Val. Loss: 0.1107
	 Val. Accuracy: 0.5814
	 Learning Rate: 0.007359


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 08 | Time: 1m 30s
	Train Loss: 0.0557
	 Val. Loss: 0.1103
	 Val. Accuracy: 0.5757
	 Learning Rate: 0.007359


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 09 | Time: 1m 30s
	Train Loss: 0.0554
	 Val. Loss: 0.1067
	 Val. Accuracy: 0.5893
	 Learning Rate: 0.007359


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 10 | Time: 1m 29s
	Train Loss: 0.0523
	 Val. Loss: 0.1105
	 Val. Accuracy: 0.5925
	 Learning Rate: 0.007359
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Test Loss: 0.1132
Test Accuracy: 0.5533
Saved predictions to predictions/attention/predictions-32i27dqx.json


0,1
epoch,▁▂▃▃▄▅▆▆▇█
epoch_time_min,▁▁▁▁▁▁▁▁▁▁
epoch_time_sec,▁▁▁█▁████▁
learning_rate,▁▁▁▁▁▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▂▂▁▁▁▁▁▁▁
trainable_parameters,▁
valid_accuracy,▁▄▆▆▇▇▇▇██
valid_loss,█▃▁▁▁▁▃▂▁▂

0,1
epoch,9
epoch_time_min,1
epoch_time_sec,29
learning_rate,0.00736
optimizer_type,adam
test_accuracy,0.55332
test_loss,0.11317
train_loss,0.05228
trainable_parameters,2139826
valid_accuracy,0.5925


[34m[1mwandb[0m: Agent Starting Run: emvjocj7 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	clip: 1
[34m[1mwandb[0m: 	data_dir: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_size: 32
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	language: ta
[34m[1mwandb[0m: 	learning_rate: 0.008867107959822679
[34m[1mwandb[0m: 	max_seq_len: 50
[34m[1mwandb[0m: 	momentum: 0
[34m[1mwandb[0m: 	n_epochs: 10
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	rmsprop_alpha: 0.99
[34m[1mwandb[0m: 	scheduler: none
[34m[1mwandb[0m: 	scheduler_factor: 0.5
[34m[1mwandb[0m: 	scheduler_patience: 5
[34m[1mwandb[0m: 	scheduler_t_max: 10
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5
[34m[1mwandb[0m: 	use_attention: True
[34m[1mwandb[0m: 	weight_decay: 0.001


Actual wandb run name: elated-sweep-2
Starting training run with config: {'batch_size': 128, 'cell_type': 'lstm', 'clip': 1, 'data_dir': '/kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0', 'dropout': 0, 'embedding_size': 32, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.008867107959822679, 'max_seq_len': 50, 'momentum': 0, 'n_epochs': 10, 'num_layers': 1, 'optimizer': 'rmsprop', 'rmsprop_alpha': 0.99, 'scheduler': 'none', 'scheduler_factor': 0.5, 'scheduler_patience': 5, 'scheduler_t_max': 10, 'teacher_forcing_ratio': 0.5, 'use_attention': True, 'weight_decay': 0.001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '< SOS >', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '< SOS >', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 e

Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 01 | Time: 1m 7s
	Train Loss: 0.5448
	 Val. Loss: 0.5426
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867
Saved new best model to best-model-emvjocj7.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 02 | Time: 1m 6s
	Train Loss: 0.5394
	 Val. Loss: 0.5511
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 03 | Time: 1m 7s
	Train Loss: 0.5390
	 Val. Loss: 0.5426
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 04 | Time: 1m 8s
	Train Loss: 0.5377
	 Val. Loss: 0.5446
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 05 | Time: 1m 8s
	Train Loss: 0.5376
	 Val. Loss: 0.5449
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 06 | Time: 1m 8s
	Train Loss: 0.5374
	 Val. Loss: 0.5409
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867
Saved new best model to best-model-emvjocj7.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 07 | Time: 1m 7s
	Train Loss: 0.5350
	 Val. Loss: 0.5467
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 08 | Time: 1m 7s
	Train Loss: 0.5368
	 Val. Loss: 0.5396
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867
Saved new best model to best-model-emvjocj7.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 09 | Time: 1m 8s
	Train Loss: 0.5352
	 Val. Loss: 0.5432
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 10 | Time: 1m 8s
	Train Loss: 0.5362
	 Val. Loss: 0.5442
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.008867
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Test Loss: 0.5450
Test Accuracy: 0.0000
Saved predictions to predictions/attention/predictions-emvjocj7.json


0,1
epoch,▁▂▃▃▄▅▆▆▇█
epoch_time_min,▁▁▁▁▁▁▁▁▁▁
epoch_time_sec,▅▁▅███▅▅██
learning_rate,▁▁▁▁▁▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▄▄▃▃▃▁▂▁▂
trainable_parameters,▁
valid_accuracy,▁▁▁▁▁▁▁▁▁▁
valid_loss,▃█▃▄▄▂▅▁▃▄

0,1
epoch,9
epoch_time_min,1
epoch_time_sec,8
learning_rate,0.00887
optimizer_type,rmsprop
test_accuracy,0
test_loss,0.54495
train_loss,0.53617
trainable_parameters,85490
valid_accuracy,0


[34m[1mwandb[0m: Agent Starting Run: up75067f with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: gru
[34m[1mwandb[0m: 	clip: 1
[34m[1mwandb[0m: 	data_dir: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	embedding_size: 64
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	language: ta
[34m[1mwandb[0m: 	learning_rate: 0.004061607321958502
[34m[1mwandb[0m: 	max_seq_len: 50
[34m[1mwandb[0m: 	momentum: 0
[34m[1mwandb[0m: 	n_epochs: 10
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	rmsprop_alpha: 0.99
[34m[1mwandb[0m: 	scheduler: none
[34m[1mwandb[0m: 	scheduler_factor: 0.5
[34m[1mwandb[0m: 	scheduler_patience: 5
[34m[1mwandb[0m: 	scheduler_t_max: 10
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7
[34m[1mwandb[0m: 	use_attention: True
[34m[1mwandb[0m: 	weight_decay: 0.0001


Actual wandb run name: polar-sweep-3
Starting training run with config: {'batch_size': 32, 'cell_type': 'gru', 'clip': 1, 'data_dir': '/kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0', 'dropout': 0, 'embedding_size': 64, 'hidden_size': 256, 'language': 'ta', 'learning_rate': 0.004061607321958502, 'max_seq_len': 50, 'momentum': 0, 'n_epochs': 10, 'num_layers': 2, 'optimizer': 'adam', 'rmsprop_alpha': 0.99, 'scheduler': 'none', 'scheduler_factor': 0.5, 'scheduler_patience': 5, 'scheduler_t_max': 10, 'teacher_forcing_ratio': 0.7, 'use_attention': True, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '< SOS >', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '< SOS >', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examp

Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 01 | Time: 4m 58s
	Train Loss: 0.2283
	 Val. Loss: 0.1519
	 Val. Accuracy: 0.3895
	 Learning Rate: 0.004062
Saved new best model to best-model-up75067f.pt


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

In [None]:
def run_sweep(sweep_config, count=50):
    """
    Runs a wandb hyperparameter sweep with a descriptive name based on key hyperparameters.
    """
    # Generate a descriptive name for the sweep
    params = sweep_config['parameters']
    
    # Get fixed parameters for the name
    model_type = "attention" if params['use_attention']['value'] else "vanilla"
    language = params['language']['value']
    
    # Get ranges for variable parameters
    embedding_sizes = params.get('embedding_size', {}).get('values', ['default'])
    if not isinstance(embedding_sizes, list):
        embedding_sizes = [embedding_sizes]
    embedding_range = f"emb{min(embedding_sizes)}-{max(embedding_sizes)}" if len(embedding_sizes) > 1 else f"emb{embedding_sizes[0]}"
    
    hidden_sizes = params.get('hidden_size', {}).get('values', ['default'])
    if not isinstance(hidden_sizes, list):
        hidden_sizes = [hidden_sizes]
    hidden_range = f"hid{min(hidden_sizes)}-{max(hidden_sizes)}" if len(hidden_sizes) > 1 else f"hid{hidden_sizes[0]}"
    
    # Get layer information
    if 'num_layers' in params:
        layers = params['num_layers'].get('values', [1])
        if not isinstance(layers, list):
            layers = [layers]
        layers_range = f"layers{min(layers)}-{max(layers)}" if len(layers) > 1 else f"layers{layers[0]}"
    else:
        enc_layers = params.get('encoder_layers', {}).get('values', [1])
        dec_layers = params.get('decoder_layers', {}).get('values', [1])
        if not isinstance(enc_layers, list):
            enc_layers = [enc_layers]
        if not isinstance(dec_layers, list):
            dec_layers = [dec_layers]
        layers_range = f"enc{min(enc_layers)}-{max(enc_layers)}_dec{min(dec_layers)}-{max(dec_layers)}"
    
    # Get cell type
    cell_types = params.get('cell_type', {}).get('values', ['lstm'])
    if not isinstance(cell_types, list):
        cell_types = [cell_types]
    cell_type_str = "-".join(cell_types)
    
    # Get optimizer info
    optimizers = params.get('optimizer', {}).get('values', ['adam'])
    if not isinstance(optimizers, list):
        optimizers = [optimizers]
    optimizer_str = "-".join(optimizers)
    
    # Get learning rate range
    if 'learning_rate' in params:
        if 'distribution' in params['learning_rate']:
            lr_min = params['learning_rate'].get('min', 0.0001)
            lr_max = params['learning_rate'].get('max', 0.01)
            lr_range = f"lr{lr_min}-{lr_max}"
        elif 'values' in params['learning_rate']:
            lr_values = params['learning_rate']['values']
            lr_range = f"lr{min(lr_values)}-{max(lr_values)}" if len(lr_values) > 1 else f"lr{lr_values[0]}"
        else:
            lr_range = f"lr{params['learning_rate'].get('value', 'default')}"
    else:
        lr_range = "lr-default"
    
    # Get dropout range
    dropouts = params.get('dropout', {}).get('values', [0.0])
    if not isinstance(dropouts, list):
        dropouts = [dropouts]
    dropout_range = f"drop{min(dropouts)}-{max(dropouts)}" if len(dropouts) > 1 else f"drop{dropouts[0]}"
    
    # Add timestamp for uniqueness
    import datetime
    timestamp = datetime.datetime.now().strftime("%m%d-%H%M")
    
    # Combine all parts into a descriptive name
    sweep_name = f"{language}_{model_type}_{cell_type_str}_{embedding_range}_{hidden_range}_{layers_range}_{dropout_range}_{optimizer_str}_{lr_range}_{timestamp}"

    sweep_config['name'] = sweep_name
    
    print(f"Creating sweep with name: {sweep_name}")
    sweep_id = wandb.sweep(
        sweep_config, 
        project="transliteration-seq2seq",
    )
    print(f"Starting sweep with ID: {sweep_id}")
    print(f"Running {count} trials.")
    wandb.agent(sweep_id, train_model, count=count)

# Define sweep configuration for vanilla model (no attention)
vanilla_sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'valid_accuracy', 'goal': 'maximize'},
    'parameters': {
        'language': {'value': 'ta'},
        'embedding_size': {'values': [128, 32, 64]},
        'hidden_size': {'values': [256, 64, 128]},
        'encoder_layers': {'values': [1, 2, 3]},
        'decoder_layers': {'values': [1, 2, 3]},
        'cell_type': {'values': ['lstm', 'gru']},
        'dropout': {'values': [0.0, 0.2, 0.3]},
        'optimizer': {'values': ['adam', 'rmsprop']},
        'learning_rate': {'distribution': 'log_uniform_values', 'min': 0.0001, 'max': 0.01},
        'weight_decay': {'values': [0, 0.0001]},
        'batch_size': {'values': [32, 64, 128]},
        'n_epochs': {'value': 5},  # 5 epochs per run as requested
        'clip': {'value': 1.0},
        'teacher_forcing_ratio': {'values': [0.5, 0.7]},
        'use_attention': {'value': False},
        'max_seq_len': {'value': 50},
        'scheduler': {'values': ['none', 'plateau']}
    }
}

# Define sweep configuration for attention model
attention_sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'valid_accuracy', 'goal': 'maximize'},
    'parameters': {
        'language': {'value': 'ta'},
        'embedding_size': {'values': [128, 64, 32]},
        'hidden_size': {'values': [256, 128, 64]},
        'encoder_layers': {'values': [1, 2, 3, 4, 5]},
        'decoder_layers': {'values': [1, 2, 3, 4, 5]},
        'cell_type': {'values': ['lstm', 'gru']},
        'dropout': {'values': [0.0, 0.2, 0.3]},
        'optimizer': {'values': ['adam', 'rmsprop']},
        'learning_rate': {'distribution': 'log_uniform_values', 'min': 0.0001, 'max': 0.01},
        'weight_decay': {'values': [0, 0.0001]},
        'batch_size': {'values': [32, 64, 128]},
        'n_epochs': {'value': 5},  # 5 epochs per run as requested
        'clip': {'value': 1.0},
        'teacher_forcing_ratio': {'values': [0.5, 0.7]},
        'use_attention': {'value': True},
        'max_seq_len': {'value': 50},
        'scheduler': {'values': ['none', 'plateau']}
    }
}

# Run the vanilla sweep with 50 trials
print("Starting vanilla model sweep (50 trials)...")
run_sweep(vanilla_sweep_config, count=50)

# Run the attention sweep with 50 trials
print("Starting attention model sweep (50 trials)...")
run_sweep(attention_sweep_config, count=50)


Starting vanilla model sweep (50 trials)...
Creating sweep with name: ta_vanilla_lstm-gru_emb32-128_hid64-256_enc1-3_dec1-3_drop0.0-0.3_adam-rmsprop_lr0.0001-0.01_0520-1225
Create sweep with ID: mbrh064k
Sweep URL: https://wandb.ai/teja_sai-indian-institute-of-technology-madras/transliteration-seq2seq/sweeps/mbrh064k
Starting sweep with ID: mbrh064k
Running 50 trials.


[34m[1mwandb[0m: [32m[41mERROR[0m Run raz3p6ob errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 64, 'cell_type': 'lstm', 'clip': 1, 'decoder_layers': 2, 'dropout': 0, 'embedding_size': 128, 'encoder_layers': 2, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.0004152277870643743, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'adam', 'scheduler': 'none', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 179,378 trainable parameters


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 01 | Time: 1m 25s
	Train Loss: 0.5831
	 Val. Loss: 0.5251
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000415
Saved new best model to best-model-3412yiok.pt


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 02 | Time: 1m 25s
	Train Loss: 0.5365
	 Val. Loss: 0.5080
	 Val. Accuracy: 0.0004
	 Learning Rate: 0.000415
Saved new best model to best-model-3412yiok.pt


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 03 | Time: 1m 25s
	Train Loss: 0.4777
	 Val. Loss: 0.5338
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000415


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 04 | Time: 1m 26s
	Train Loss: 0.4620
	 Val. Loss: 0.5308
	 Val. Accuracy: 0.0004
	 Learning Rate: 0.000415


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 05 | Time: 1m 26s
	Train Loss: 0.4520
	 Val. Loss: 0.5284
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000415
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/108 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/108 [00:00<?, ?it/s]

Test Loss: 0.5122 | Test Accuracy: 0.0000
Saved predictions to predictions/vanilla/predictions-3412yiok.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,▁▁▁██
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▆▂▂▁
trainable_parameters,▁
valid_accuracy,▁█▁█▁
valid_loss,▆▁█▇▇

0,1
epoch,4
epoch_time_min,1
epoch_time_sec,26
learning_rate,0.00042
optimizer_type,adam
test_accuracy,0
test_loss,0.51221
train_loss,0.45195
trainable_parameters,179378
valid_accuracy,0


[34m[1mwandb[0m: [32m[41mERROR[0m Run 1mez5v7t errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 32, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 2, 'dropout': 0.3, 'embedding_size': 128, 'encoder_layers': 1, 'hidden_size': 256, 'language': 'ta', 'learning_rate': 0.0007808362384340169, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 1,010,738 trainable parameters




Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
    decoder_output, hidden, _ = model.decoder(
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 100, in forward
    output, hidden = self.rnn(embedded, hidden)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call

0,1
trainable_parameters,▁

0,1
optimizer_type,rmsprop
trainable_parameters,1010738


[34m[1mwandb[0m: [32m[41mERROR[0m Run kkc9m416 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, _ = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                 ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/m

Starting training run with config: {'batch_size': 32, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 3, 'dropout': 0.2, 'embedding_size': 64, 'encoder_layers': 1, 'hidden_size': 256, 'language': 'ta', 'learning_rate': 0.0005862946897153711, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'none', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 1,302,066 trainable parameters


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
    decoder_output, hidden, _ = model.decoder(
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 100, in forward
    output, hidden = self.rnn(embedded, hidden)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call

0,1
trainable_parameters,▁

0,1
optimizer_type,rmsprop
trainable_parameters,1302066


[34m[1mwandb[0m: [32m[41mERROR[0m Run flumypgm errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, _ = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                 ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/m

Starting training run with config: {'batch_size': 32, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 2, 'dropout': 0, 'embedding_size': 32, 'encoder_layers': 2, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.0006663942370879322, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'none', 'teacher_forcing_ratio': 0.5, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 93,362 trainable parameters


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 01 | Time: 2m 34s
	Train Loss: 0.5412
	 Val. Loss: 0.5356
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000666
Saved new best model to best-model-0lj3je3y.pt


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 02 | Time: 2m 35s
	Train Loss: 0.5059
	 Val. Loss: 0.5397
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000666


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 03 | Time: 2m 35s
	Train Loss: 0.4966
	 Val. Loss: 0.5381
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000666


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 04 | Time: 2m 35s
	Train Loss: 0.4910
	 Val. Loss: 0.5500
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000666


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 05 | Time: 2m 35s
	Train Loss: 0.4869
	 Val. Loss: 0.5349
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000666
Saved new best model to best-model-0lj3je3y.pt
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/215 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/215 [00:00<?, ?it/s]

Test Loss: 0.5393 | Test Accuracy: 0.0000
Saved predictions to predictions/vanilla/predictions-0lj3je3y.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,▁████
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▃▂▂▁
trainable_parameters,▁
valid_accuracy,▁▁▁▁▁
valid_loss,▁▃▂█▁

0,1
epoch,4
epoch_time_min,2
epoch_time_sec,35
learning_rate,0.00067
optimizer_type,rmsprop
test_accuracy,0
test_loss,0.53926
train_loss,0.48685
trainable_parameters,93362
valid_accuracy,0


[34m[1mwandb[0m: [32m[41mERROR[0m Run ga3uw70h errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 32, 'cell_type': 'lstm', 'clip': 1, 'decoder_layers': 3, 'dropout': 0.2, 'embedding_size': 64, 'encoder_layers': 1, 'hidden_size': 128, 'language': 'ta', 'learning_rate': 0.0002014522681314444, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'none', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 474,418 trainable parameters


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 190, in calculate_accuracy
    decoder_output, hidden, cell = model.decoder(
                                   ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 96, in forward
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/mo

0,1
trainable_parameters,▁

0,1
optimizer_type,rmsprop
trainable_parameters,474418


[34m[1mwandb[0m: [32m[41mERROR[0m Run f0mu5ft1 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 190, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, cell = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                    ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/mod

Starting training run with config: {'batch_size': 32, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 3, 'dropout': 0, 'embedding_size': 128, 'encoder_layers': 3, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.0015736418604743196, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'adam', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 187,826 trainable parameters




Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 01 | Time: 3m 5s
	Train Loss: 0.5567
	 Val. Loss: 0.5242
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.001574
Saved new best model to best-model-k5cvs6mq.pt


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 02 | Time: 3m 5s
	Train Loss: 0.5026
	 Val. Loss: 0.5634
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.001574


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 04 | Time: 3m 4s
	Train Loss: 0.4414
	 Val. Loss: 0.5743
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.001574


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 05 | Time: 3m 5s
	Train Loss: 0.4336
	 Val. Loss: 0.5665
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.001574
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/215 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/215 [00:00<?, ?it/s]

Test Loss: 0.5294 | Test Accuracy: 0.0000
Saved predictions to predictions/vanilla/predictions-k5cvs6mq.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,██▁▁█
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▅▂▁▁
trainable_parameters,▁
valid_accuracy,▁▁▁▁▁
valid_loss,▁▆▆█▇

0,1
epoch,4
epoch_time_min,3
epoch_time_sec,5
learning_rate,0.00157
optimizer_type,adam
test_accuracy,0
test_loss,0.52941
train_loss,0.43362
trainable_parameters,187826
valid_accuracy,0


[34m[1mwandb[0m: [32m[41mERROR[0m Run ka5lylte errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 128, 'cell_type': 'lstm', 'clip': 1, 'decoder_layers': 1, 'dropout': 0, 'embedding_size': 32, 'encoder_layers': 2, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.0002307102971116781, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'none', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 89,266 trainable parameters


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 190, in calculate_accuracy
    decoder_output, hidden, cell = model.decoder(
                                   ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 96, in forward
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/mo

0,1
trainable_parameters,▁

0,1
optimizer_type,rmsprop
trainable_parameters,89266


[34m[1mwandb[0m: [32m[41mERROR[0m Run 1tfdoku0 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 190, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, cell = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                    ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/mod

Starting training run with config: {'batch_size': 64, 'cell_type': 'lstm', 'clip': 1, 'decoder_layers': 3, 'dropout': 0.2, 'embedding_size': 64, 'encoder_layers': 3, 'hidden_size': 256, 'language': 'ta', 'learning_rate': 0.00023353827737786917, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.5, 'use_attention': False, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 2,782,770 trainable parameters




Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 01 | Time: 2m 5s
	Train Loss: 0.5614
	 Val. Loss: 0.5239
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000234
Saved new best model to best-model-amh9b2uz.pt


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 02 | Time: 2m 5s
	Train Loss: 0.5535
	 Val. Loss: 0.5224
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000234
Saved new best model to best-model-amh9b2uz.pt


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=10000.0 (msgs/sec)
NotebookApp.rate_limit_window=1.0 (secs)



Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 02 | Time: 0m 37s
	Train Loss: 0.2637
	 Val. Loss: 0.3112
	 Val. Accuracy: 0.0845
	 Learning Rate: 0.006821
Saved new best model to best-model-raz3p6ob.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 03 | Time: 0m 37s
	Train Loss: 0.2252
	 Val. Loss: 0.2775
	 Val. Accuracy: 0.1412
	 Learning Rate: 0.006821
Saved new best model to best-model-raz3p6ob.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 04 | Time: 0m 37s
	Train Loss: 0.2061
	 Val. Loss: 0.2645
	 Val. Accuracy: 0.1695
	 Learning Rate: 0.006821
Saved new best model to best-model-raz3p6ob.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 05 | Time: 0m 37s
	Train Loss: 0.1918
	 Val. Loss: 0.2481
	 Val. Accuracy: 0.1790
	 Learning Rate: 0.006821
Saved new best model to best-model-raz3p6ob.pt
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Test Loss: 0.2522 | Test Accuracy: 0.1734
Saved predictions to predictions/vanilla/predictions-raz3p6ob.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,▁▁▁▁▁
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▄▂▂▁
trainable_parameters,▁
valid_accuracy,▁▃▆██
valid_loss,█▅▃▂▁

0,1
epoch,4
epoch_time_min,0
epoch_time_sec,37
learning_rate,0.00682
optimizer_type,rmsprop
test_accuracy,0.17337
test_loss,0.25218
train_loss,0.19181
trainable_parameters,58290
valid_accuracy,0.179


[34m[1mwandb[0m: [32m[41mERROR[0m Run y34w1kny errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 1, 'dropout': 0.2, 'embedding_size': 64, 'encoder_layers': 1, 'hidden_size': 128, 'language': 'ta', 'learning_rate': 0.005062030007522182, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.5, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 160,562 trainable parameters




Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 01 | Time: 0m 37s
	Train Loss: 0.4826
	 Val. Loss: 0.4636
	 Val. Accuracy: 0.0012
	 Learning Rate: 0.005062
Saved new best model to best-model-abn7vhiz.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 02 | Time: 0m 37s
	Train Loss: 0.2881
	 Val. Loss: 0.2964
	 Val. Accuracy: 0.1182
	 Learning Rate: 0.005062
Saved new best model to best-model-abn7vhiz.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 03 | Time: 0m 37s
	Train Loss: 0.1904
	 Val. Loss: 0.2476
	 Val. Accuracy: 0.2161
	 Learning Rate: 0.005062
Saved new best model to best-model-abn7vhiz.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 04 | Time: 0m 37s
	Train Loss: 0.1569
	 Val. Loss: 0.2137
	 Val. Accuracy: 0.2695
	 Learning Rate: 0.005062
Saved new best model to best-model-abn7vhiz.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 05 | Time: 0m 37s
	Train Loss: 0.1381
	 Val. Loss: 0.2023
	 Val. Accuracy: 0.3123
	 Learning Rate: 0.005062
Saved new best model to best-model-abn7vhiz.pt
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Test Loss: 0.2133 | Test Accuracy: 0.2972
Saved predictions to predictions/vanilla/predictions-abn7vhiz.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,▁▁▁▁▁
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▄▂▁▁
trainable_parameters,▁
valid_accuracy,▁▄▆▇█
valid_loss,█▄▂▁▁

0,1
epoch,4
epoch_time_min,0
epoch_time_sec,37
learning_rate,0.00506
optimizer_type,rmsprop
test_accuracy,0.2972
test_loss,0.21333
train_loss,0.13807
trainable_parameters,160562
valid_accuracy,0.31229


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: [32m[41mERROR[0m Run 55l08upl errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.p

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 2, 'dropout': 0.3, 'embedding_size': 64, 'encoder_layers': 1, 'hidden_size': 128, 'language': 'ta', 'learning_rate': 0.0004231397083875456, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 259,634 trainable parameters




Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
    decoder_output, hidden, _ = model.decoder(
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 100, in forward
    output, hidden = self.rnn(embedded, hidden)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call

0,1
trainable_parameters,▁

0,1
optimizer_type,rmsprop
trainable_parameters,259634


[34m[1mwandb[0m: [32m[41mERROR[0m Run 8tq0ezne errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, _ = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                 ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/m

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 1, 'dropout': 0.2, 'embedding_size': 64, 'encoder_layers': 2, 'hidden_size': 128, 'language': 'ta', 'learning_rate': 0.004604635216373744, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'adam', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.5, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 259,634 trainable parameters




Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
    decoder_output, hidden, _ = model.decoder(
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 100, in forward
    output, hidden = self.rnn(embedded, hidden)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call

0,1
trainable_parameters,▁

0,1
optimizer_type,adam
trainable_parameters,259634


[34m[1mwandb[0m: [32m[41mERROR[0m Run r7ehwoiq errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, _ = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                 ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/m

Starting training run with config: {'batch_size': 64, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 2, 'dropout': 0.3, 'embedding_size': 64, 'encoder_layers': 1, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.0036617430705565023, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.5, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 83,250 trainable parameters




Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
    decoder_output, hidden, _ = model.decoder(
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 100, in forward
    output, hidden = self.rnn(embedded, hidden)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call

0,1
trainable_parameters,▁

0,1
optimizer_type,rmsprop
trainable_parameters,83250


[34m[1mwandb[0m: [32m[41mERROR[0m Run o9zdmb54 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, _ = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                 ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/m

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 1, 'dropout': 0.2, 'embedding_size': 64, 'encoder_layers': 1, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.00478407121233276, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.5, 'use_attention': False, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 58,290 trainable parameters




Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 01 | Time: 0m 37s
	Train Loss: 0.5145
	 Val. Loss: 0.5509
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.004784
Saved new best model to best-model-5d4x400u.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 02 | Time: 0m 36s
	Train Loss: 0.4970
	 Val. Loss: 0.5435
	 Val. Accuracy: 0.0004
	 Learning Rate: 0.004784
Saved new best model to best-model-5d4x400u.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 03 | Time: 0m 37s
	Train Loss: 0.4910
	 Val. Loss: 0.5441
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.004784


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 04 | Time: 0m 36s
	Train Loss: 0.4874
	 Val. Loss: 0.5307
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.004784
Saved new best model to best-model-5d4x400u.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 05 | Time: 0m 37s
	Train Loss: 0.4883
	 Val. Loss: 0.5435
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.004784
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Test Loss: 0.5355 | Test Accuracy: 0.0000
Saved predictions to predictions/vanilla/predictions-5d4x400u.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,█▁█▁█
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▃▂▁▁
trainable_parameters,▁
valid_accuracy,▁█▁▁▁
valid_loss,█▅▆▁▅

0,1
epoch,4
epoch_time_min,0
epoch_time_sec,37
learning_rate,0.00478
optimizer_type,rmsprop
test_accuracy,0
test_loss,0.53554
train_loss,0.48832
trainable_parameters,58290
valid_accuracy,0


[34m[1mwandb[0m: [32m[41mERROR[0m Run 6dfmcbw1 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 32, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 2, 'dropout': 0.2, 'embedding_size': 32, 'encoder_layers': 2, 'hidden_size': 256, 'language': 'ta', 'learning_rate': 0.00011318208065660135, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'none', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 1,250,354 trainable parameters


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 01 | Time: 3m 3s
	Train Loss: 0.5628
	 Val. Loss: 0.5236
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000113
Saved new best model to best-model-osab03x5.pt


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 02 | Time: 3m 3s
	Train Loss: 0.5535
	 Val. Loss: 0.5230
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000113
Saved new best model to best-model-osab03x5.pt


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 03 | Time: 3m 4s
	Train Loss: 0.5532
	 Val. Loss: 0.5238
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000113


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 04 | Time: 3m 3s
	Train Loss: 0.5528
	 Val. Loss: 0.5217
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000113
Saved new best model to best-model-osab03x5.pt


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 05 | Time: 3m 4s
	Train Loss: 0.5528
	 Val. Loss: 0.5238
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000113
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/215 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/215 [00:00<?, ?it/s]

Test Loss: 0.5272 | Test Accuracy: 0.0000
Saved predictions to predictions/vanilla/predictions-osab03x5.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,▁▁█▁█
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▁▁▁▁
trainable_parameters,▁
valid_accuracy,▁▁▁▁▁
valid_loss,▇▅█▁█

0,1
epoch,4
epoch_time_min,3
epoch_time_sec,4
learning_rate,0.00011
optimizer_type,rmsprop
test_accuracy,0
test_loss,0.52718
train_loss,0.55277
trainable_parameters,1250354
valid_accuracy,0


[34m[1mwandb[0m: [32m[41mERROR[0m Run ga3uw70h errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 64, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 1, 'dropout': 0.3, 'embedding_size': 64, 'encoder_layers': 1, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.008857882695058831, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 58,290 trainable parameters




Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 01 | Time: 1m 10s
	Train Loss: 0.3830
	 Val. Loss: 0.3920
	 Val. Accuracy: 0.0277
	 Learning Rate: 0.008858
Saved new best model to best-model-iqgyxcen.pt


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 02 | Time: 1m 10s
	Train Loss: 0.2474
	 Val. Loss: 0.3222
	 Val. Accuracy: 0.1022
	 Learning Rate: 0.008858
Saved new best model to best-model-iqgyxcen.pt


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 03 | Time: 1m 10s
	Train Loss: 0.2139
	 Val. Loss: 0.3025
	 Val. Accuracy: 0.1274
	 Learning Rate: 0.008858
Saved new best model to best-model-iqgyxcen.pt


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 04 | Time: 1m 10s
	Train Loss: 0.1998
	 Val. Loss: 0.2890
	 Val. Accuracy: 0.1561
	 Learning Rate: 0.008858
Saved new best model to best-model-iqgyxcen.pt


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch: 05 | Time: 1m 10s
	Train Loss: 0.1950
	 Val. Loss: 0.2818
	 Val. Accuracy: 0.1501
	 Learning Rate: 0.008858
Saved new best model to best-model-iqgyxcen.pt
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/108 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/108 [00:00<?, ?it/s]

Test Loss: 0.2887 | Test Accuracy: 0.1436
Saved predictions to predictions/vanilla/predictions-iqgyxcen.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,▁▁▁▁▁
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▃▂▁▁
trainable_parameters,▁
valid_accuracy,▁▅▆██
valid_loss,█▄▂▁▁

0,1
epoch,4
epoch_time_min,1
epoch_time_sec,10
learning_rate,0.00886
optimizer_type,rmsprop
test_accuracy,0.14365
test_loss,0.28869
train_loss,0.19498
trainable_parameters,58290
valid_accuracy,0.15014


[34m[1mwandb[0m: [32m[41mERROR[0m Run nbx3anqz errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 2, 'dropout': 0.3, 'embedding_size': 32, 'encoder_layers': 1, 'hidden_size': 128, 'language': 'ta', 'learning_rate': 0.003910852536444529, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 232,498 trainable parameters




Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
    decoder_output, hidden, _ = model.decoder(
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 100, in forward
    output, hidden = self.rnn(embedded, hidden)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call

0,1
trainable_parameters,▁

0,1
optimizer_type,rmsprop
trainable_parameters,232498


[34m[1mwandb[0m: [32m[41mERROR[0m Run daekt2n5 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, _ = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                 ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/m

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 1, 'dropout': 0.2, 'embedding_size': 64, 'encoder_layers': 2, 'hidden_size': 128, 'language': 'ta', 'learning_rate': 0.001293942236058646, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.5, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 259,634 trainable parameters




Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
    decoder_output, hidden, _ = model.decoder(
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 100, in forward
    output, hidden = self.rnn(embedded, hidden)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call

0,1
trainable_parameters,▁

0,1
optimizer_type,rmsprop
trainable_parameters,259634


[34m[1mwandb[0m: [32m[41mERROR[0m Run oipmqf4r errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, _ = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                 ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/m

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 1, 'dropout': 0.2, 'embedding_size': 64, 'encoder_layers': 3, 'hidden_size': 128, 'language': 'ta', 'learning_rate': 0.008530254700150846, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'adam', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 358,706 trainable parameters




Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
    decoder_output, hidden, _ = model.decoder(
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 100, in forward
    output, hidden = self.rnn(embedded, hidden)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call

0,1
trainable_parameters,▁

0,1
optimizer_type,adam
trainable_parameters,358706


[34m[1mwandb[0m: [32m[41mERROR[0m Run gp1xqc9f errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, _ = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                 ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/m

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 2, 'dropout': 0.2, 'embedding_size': 128, 'encoder_layers': 2, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.0004205763507118058, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'none', 'teacher_forcing_ratio': 0.5, 'use_attention': False, 'weight_decay': 0.0001}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 137,906 trainable parameters


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 01 | Time: 0m 45s
	Train Loss: 0.5772
	 Val. Loss: 0.5254
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000421
Saved new best model to best-model-c28sxvua.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 02 | Time: 0m 45s
	Train Loss: 0.5561
	 Val. Loss: 0.5227
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000421
Saved new best model to best-model-c28sxvua.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 03 | Time: 0m 45s
	Train Loss: 0.5540
	 Val. Loss: 0.5217
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000421
Saved new best model to best-model-c28sxvua.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 04 | Time: 0m 45s
	Train Loss: 0.5534
	 Val. Loss: 0.5215
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000421
Saved new best model to best-model-c28sxvua.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 05 | Time: 0m 45s
	Train Loss: 0.5531
	 Val. Loss: 0.5218
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.000421
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Test Loss: 0.5267 | Test Accuracy: 0.0000
Saved predictions to predictions/vanilla/predictions-c28sxvua.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,▁▁▁▁▁
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▂▁▁▁
trainable_parameters,▁
valid_accuracy,▁▁▁▁▁
valid_loss,█▃▁▁▂

0,1
epoch,4
epoch_time_min,0
epoch_time_sec,45
learning_rate,0.00042
optimizer_type,rmsprop
test_accuracy,0
test_loss,0.52673
train_loss,0.55307
trainable_parameters,137906
valid_accuracy,0


[34m[1mwandb[0m: [32m[41mERROR[0m Run c60djjw4 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 32, 'cell_type': 'lstm', 'clip': 1, 'decoder_layers': 1, 'dropout': 0.2, 'embedding_size': 128, 'encoder_layers': 1, 'hidden_size': 256, 'language': 'ta', 'learning_rate': 0.0011274310666666168, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'adam', 'scheduler': 'none', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 813,618 trainable parameters


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 01 | Time: 2m 37s
	Train Loss: 0.4771
	 Val. Loss: 0.5657
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.001127
Saved new best model to best-model-l8jshma4.pt


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 02 | Time: 2m 38s
	Train Loss: 0.4244
	 Val. Loss: 0.5772
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.001127


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 03 | Time: 2m 37s
	Train Loss: 0.3246
	 Val. Loss: 0.4345
	 Val. Accuracy: 0.0217
	 Learning Rate: 0.001127
Saved new best model to best-model-l8jshma4.pt


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 04 | Time: 2m 38s
	Train Loss: 0.2096
	 Val. Loss: 0.3424
	 Val. Accuracy: 0.1336
	 Learning Rate: 0.001127
Saved new best model to best-model-l8jshma4.pt


Training:   0%|          | 0/2132 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/214 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/214 [00:00<?, ?it/s]

Epoch: 05 | Time: 2m 37s
	Train Loss: 0.1456
	 Val. Loss: 0.2862
	 Val. Accuracy: 0.2329
	 Learning Rate: 0.001127
Saved new best model to best-model-l8jshma4.pt
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/215 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/215 [00:00<?, ?it/s]

Test Loss: 0.3000 | Test Accuracy: 0.2284
Saved predictions to predictions/vanilla/predictions-l8jshma4.json


0,1
epoch,▁▃▅▆█
epoch_time_min,▁▁▁▁▁
epoch_time_sec,▁█▁█▁
learning_rate,▁▁▁▁▁
test_accuracy,▁
test_loss,▁
train_loss,█▇▅▂▁
trainable_parameters,▁
valid_accuracy,▁▁▂▅█
valid_loss,██▅▂▁

0,1
epoch,4
epoch_time_min,2
epoch_time_sec,37
learning_rate,0.00113
optimizer_type,adam
test_accuracy,0.22844
test_loss,0.29997
train_loss,0.14559
trainable_parameters,813618
valid_accuracy,0.2329


[34m[1mwandb[0m: [32m[41mERROR[0m Run ka5lylte errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 300, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     wandb.teardown()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/lib/python3.11/contextlib.py", line 81, in inner
[34m[1mwandb[0m: [32m[41mERROR[0m     return func(*args, **kwds)
[34m[1mwandb[0m: [32m[41mERROR[0m            ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 404, in teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     orig_singleton._teardown(exit_code=exit_code)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/wandb_setup.py", line 249, in _teardown
[34m[1mwandb[0m: [32m[41mERROR[0m     internal_exit

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 2, 'dropout': 0.2, 'embedding_size': 128, 'encoder_layers': 1, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.0022867987141504203, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'adam', 'scheduler': 'none', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 112,946 trainable parameters


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
    decoder_output, hidden, _ = model.decoder(
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 100, in forward
    output, hidden = self.rnn(embedded, hidden)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call

0,1
trainable_parameters,▁

0,1
optimizer_type,adam
trainable_parameters,112946


[34m[1mwandb[0m: [32m[41mERROR[0m Run i9c0nfkt errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 194, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, _ = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                 ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/m

Starting training run with config: {'batch_size': 64, 'cell_type': 'lstm', 'clip': 1, 'decoder_layers': 1, 'dropout': 0.3, 'embedding_size': 128, 'encoder_layers': 3, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.005235623740148747, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'rmsprop', 'scheduler': 'none', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 179,378 trainable parameters


Training:   0%|          | 0/1066 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/107 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/107 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
    valid_accuracy, _ = calculate_accuracy(
                        ^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/7560943.py", line 190, in calculate_accuracy
    decoder_output, hidden, cell = model.decoder(
                                   ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3382890687.py", line 96, in forward
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/mo

0,1
trainable_parameters,▁

0,1
optimizer_type,rmsprop
trainable_parameters,179378


[34m[1mwandb[0m: [32m[41mERROR[0m Run h6iao8jo errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/2448685418.py", line 191, in train_model
[34m[1mwandb[0m: [32m[41mERROR[0m     valid_accuracy, _ = calculate_accuracy(
[34m[1mwandb[0m: [32m[41mERROR[0m                         ^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/7560943.py", line 190, in calculate_accuracy
[34m[1mwandb[0m: [32m[41mERROR[0m     decoder_output, hidden, cell = model.decoder(
[34m[1mwandb[0m: [32m[41mERROR[0m                                    ^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/mod

Starting training run with config: {'batch_size': 128, 'cell_type': 'gru', 'clip': 1, 'decoder_layers': 1, 'dropout': 0.3, 'embedding_size': 64, 'encoder_layers': 1, 'hidden_size': 64, 'language': 'ta', 'learning_rate': 0.0013674929836985695, 'max_seq_len': 50, 'n_epochs': 5, 'optimizer': 'adam', 'scheduler': 'plateau', 'teacher_forcing_ratio': 0.7, 'use_attention': False, 'weight_decay': 0}
Looking for training file at: /kaggle/input/dakshina-dataset-v1-0-tar/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv
Special tokens in vocabulary:
Source vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f']
Target vocab keys: ['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'ஃ', 'அ', 'ஆ', 'இ', 'ஈ', 'உ']
Successfully loaded Dakshina dataset for ta
Train set: 68218 examples
Dev set: 6827 examples
Test set: 6864 examples
Source vocabulary size: 30
Target vocabulary size: 50
Max sequence length: 50
Using device: cuda
The model has 58,290 trainable parameters




Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 01 | Time: 0m 37s
	Train Loss: 0.5545
	 Val. Loss: 0.5449
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.001367
Saved new best model to best-model-lzbp2rn9.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 02 | Time: 0m 38s
	Train Loss: 0.4646
	 Val. Loss: 0.5180
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.001367
Saved new best model to best-model-lzbp2rn9.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 03 | Time: 0m 37s
	Train Loss: 0.4353
	 Val. Loss: 0.5053
	 Val. Accuracy: 0.0000
	 Learning Rate: 0.001367
Saved new best model to best-model-lzbp2rn9.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 04 | Time: 0m 38s
	Train Loss: 0.4035
	 Val. Loss: 0.4921
	 Val. Accuracy: 0.0006
	 Learning Rate: 0.001367
Saved new best model to best-model-lzbp2rn9.pt


Training:   0%|          | 0/533 [00:00<?, ?it/s]

Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]

Calculating Accuracy:   0%|          | 0/54 [00:00<?, ?it/s]

Epoch: 05 | Time: 0m 37s
	Train Loss: 0.3800
	 Val. Loss: 0.4735
	 Val. Accuracy: 0.0007
	 Learning Rate: 0.001367
Saved new best model to best-model-lzbp2rn9.pt
Training finished.
Evaluating best model on test set...


Evaluating Loss:   0%|          | 0/54 [00:00<?, ?it/s]