In [4]:
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import wandb
wandb.login(key="8b9afebd4705d9e357dd7b2fa957f97e19705bf1")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mroohmr1011[0m ([33mroohiparveen[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
from collections import Counter
from tqdm import tqdm

In [19]:
# ---------------------------
# Vocabulary Builder (no torchtext)
# ---------------------------
def build_vocab_from_data(path, column=0):
    counter = Counter()
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
            text = parts[column]
            counter.update(list(text))

    chars = sorted(counter.keys())
    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    for i, char in enumerate(chars, start=4):
        vocab[char] = i
    return vocab

def inverse_vocab(vocab):
    return {idx: char for char, idx in vocab.items()}

# ---------------------------
# Dataset & Collate Function
# ---------------------------
class TransliterationDataset(Dataset):
    def __init__(self, path, input_vocab, output_vocab):
        self.data = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) < 2:
                    continue
                target, latin = parts[0], parts[1]
                src_ids = [input_vocab['<sos>']] + [input_vocab.get(c, input_vocab['<unk>']) for c in latin] + [input_vocab['<eos>']]
                tgt_ids = [output_vocab['<sos>']] + [output_vocab.get(c, output_vocab['<unk>']) for c in target] + [output_vocab['<eos>']]
                self.data.append((torch.tensor(src_ids, dtype=torch.long), torch.tensor(tgt_ids, dtype=torch.long)))


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
    
    def pad_collate_fn(batch, pad_idx_input, pad_idx_output):
        src_batch, trg_batch = zip(*batch)
        src_lens = [len(seq) for seq in src_batch]
        trg_lens = [len(seq) for seq in trg_batch]
        max_src_len, max_trg_len = max(src_lens), max(trg_lens)
    
        pad_src = [torch.cat([seq, torch.full((max_src_len - len(seq),), pad_idx_input, dtype=torch.long)]) for seq in src_batch]
        pad_trg = [torch.cat([seq, torch.full((max_trg_len - len(seq),), pad_idx_output, dtype=torch.long)]) for seq in trg_batch]

        return torch.stack(pad_src), torch.stack(pad_trg)

# Setup Paths and Build Vocab
# ---------------------------
data_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/ta/lexicons"
train_file = os.path.join(data_path, "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv")
dev_file = os.path.join(data_path, "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv")

input_vocab = build_vocab_from_data(train_file, column=1)   # Latin script
output_vocab = build_vocab_from_data(train_file, column=0)  # Native script

inv_input_vocab = inverse_vocab(input_vocab)
inv_output_vocab = inverse_vocab(output_vocab)

PAD_IDX = output_vocab['<pad>']

# ---------------------------
# Create Datasets and Loaders
# ---------------------------
train_dataset = TransliterationDataset(train_file, input_vocab, output_vocab)
val_dataset = TransliterationDataset(dev_file, input_vocab, output_vocab)

PAD_IDX_INPUT = input_vocab['<pad>']
PAD_IDX_OUTPUT = output_vocab['<pad>']

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          collate_fn=lambda batch: pad_collate_fn(batch, PAD_IDX_INPUT, PAD_IDX_OUTPUT))

val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False,
                        collate_fn=lambda batch: pad_collate_fn(batch, PAD_IDX_INPUT, PAD_IDX_OUTPUT))

# -------------------------------
# Encoder & Decoder
# -------------------------------
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers, cell_type='LSTM', dropout=0.0):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.cell_type = cell_type.upper()
        rnn_class = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[self.cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers=num_layers,
                             dropout=dropout if num_layers > 1 else 0.0, batch_first=True)

    def forward(self, src, lengths):
        embedded = self.embedding(src)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        outputs, hidden = self.rnn(packed)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers, cell_type='LSTM', dropout=0.0):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.cell_type = cell_type.upper()
        rnn_class = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[self.cell_type]
        self.rnn = rnn_class(emb_dim, hidden_dim, num_layers=num_layers,
                             dropout=dropout if num_layers > 1 else 0.0, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        
        # Weight tying if dimensions match
        if emb_dim == hidden_dim:
            self.fc_out.weight = self.embedding.weight

    def forward(self, input, hidden):
        input = input.unsqueeze(1)
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_lengths, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.shape
        tgt_vocab_size = self.decoder.embedding.num_embeddings
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        hidden = self.encoder(src, src_lengths)
        input = tgt[:, 0]

        for t in range(1, tgt_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            input = tgt[:, t] if teacher_force else output.argmax(1)

        return outputs

class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, pred, target):
        confidence = 1.0 - self.smoothing
        log_probs = F.log_softmax(pred, dim=-1)
        nll_loss = -log_probs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        smooth_loss = -log_probs.mean(dim=-1)
        loss = confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()


# -------------------------------
# W&B Sweep Training Function
# -------------------------------
def sweep_train(config=None):
    with wandb.init(config=config):
        config = wandb.config

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Rebuild vocab and dataset
        train_dataset = TransliterationDataset(train_file, input_vocab, output_vocab)
        val_dataset = TransliterationDataset(dev_file, input_vocab, output_vocab)

        train_loader = DataLoader(
            train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=pad_collate_fn
        )
        val_loader = DataLoader(
            val_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=pad_collate_fn
        )

        encoder = Encoder(
            input_dim=len(input_vocab),
            emb_dim=config.embedding_dim,
            hidden_dim=config.hidden_dim,
            num_layers=config.num_layers,
            cell_type=config.cell_type,
            dropout=config.dropout
        )

        decoder = Decoder(
            output_dim=len(output_vocab),
            emb_dim=config.embedding_dim,
            hidden_dim=config.hidden_dim,
            num_layers=config.num_layers,
            cell_type=config.cell_type,
            dropout=config.dropout
        )

        model = Seq2Seq(encoder, decoder, device).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
        criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

        best_val_loss = float('inf')
        initial_tf_ratio = config.teacher_forcing_ratio

        for epoch in range(10):  # You can change to config.epochs if you want
            # Decay teacher forcing ratio
            tf_ratio = max(0.1, initial_tf_ratio * (0.95 ** epoch))

            # ---- Training Phase ----
            model.train()
            total_loss, correct, total = 0, 0, 0
            for src, trg in train_loader:
                src, trg = src.to(device), trg.to(device)

                optimizer.zero_grad()
                output = model(src, trg, tf_ratio)
                output_dim = output.shape[-1]

                output = output[:, 1:].reshape(-1, output_dim)
                trg = trg[:, 1:].reshape(-1)

                loss = criterion(output, trg)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                correct += (output.argmax(1) == trg).sum().item()
                total += trg.ne(PAD_IDX).sum().item()

            train_loss = total_loss / len(train_loader)
            train_acc = correct / total

            # ---- Validation Phase ----
            model.eval()
            val_loss, val_correct, val_total = 0, 0, 0
            with torch.no_grad():
                for src, trg in val_loader:
                    src, trg = src.to(device), trg.to(device)
                    output = model(src, trg, teacher_forcing_ratio=0)
                    output_dim = output.shape[-1]

                    output = output[:, 1:].reshape(-1, output_dim)
                    trg = trg[:, 1:].reshape(-1)

                    loss = criterion(output, trg)
                    val_loss += loss.item()
                    val_correct += (output.argmax(1) == trg).sum().item()
                    val_total += trg.ne(PAD_IDX).sum().item()

            val_loss /= len(val_loader)
            val_acc = val_correct / val_total

            scheduler.step()

            # Log to wandb
            wandb.log({
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'train_acc': train_acc,
                'val_loss': val_loss,
                'val_acc': val_acc,
                'lr': scheduler.get_last_lr()[0],
                'tf_ratio': tf_ratio
            })

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), "best_model.pt")

# -------------------------------
# Sweep Config
# -------------------------------
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_loss', 'goal': 'minimize'},
    'parameters': {
        'embedding_dim': {'values': [16, 32, 64, 256]},
        'hidden_dim': {'values': [16, 32, 64, 256]},
        'num_layers': {'values': [1, 2, 3]},
        'dropout': {'values': [0.2, 0.3]},
        'cell_type': {'values': ['LSTM', 'GRU', 'RNN']},
        'learning_rate': {'values': [0.001]},
        'batch_size': {'values': [32, 64]},
        'teacher_forcing_ratio': {'values': [0.7]},
        'tf_decay': {'values': [0.95]}, 
        'epochs': {'value': 15}
    }
}



In [22]:
# Optional: To run the sweep (outside the script, after loading)
# import wandb
sweep_id = wandb.sweep(sweep_config, project="DA6401 Assignment 3")
wandb.agent(sweep_id, function=sweep_train, count=15)

Create sweep with ID: aayppy4c
Sweep URL: https://wandb.ai/roohiparveen/DA6401%20ASSIGNMENT%203/sweeps/aayppy4c


[34m[1mwandb[0m: Agent Starting Run: ru5djwaz with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▄▆▆▇▇███
train_loss,█▅▄▃▂▂▂▁▁▁
val_acc,▁▃▄▆▆▇▇▇██
val_loss,█▆▅▄▃▂▂▂▂▁

0,1
epoch,10.0
train_acc,0.47569
train_loss,1.79989
val_acc,0.34606
val_loss,2.21442


[34m[1mwandb[0m: Agent Starting Run: pt1gf1v7 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▅▆▇▇████
train_loss,█▃▂▂▂▁▁▁▁▁
val_acc,▇▆█▁▅▄▅▆▅▅
val_loss,▃▃▂█▃▄▂▁▃▁

0,1
epoch,10.0
train_acc,0.27396
train_loss,2.5099
val_acc,0.16904
val_loss,2.87858


[34m[1mwandb[0m: Agent Starting Run: 9qa8a18a with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▅▆▆▆▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_acc,▁▃▄▄▅▅▇▇██
val_loss,█▆▅▅▄▃▂▂▁▁

0,1
epoch,10.0
train_acc,0.51584
train_loss,1.71064
val_acc,0.37906
val_loss,2.19248


[34m[1mwandb[0m: Agent Starting Run: uqzkg1mo with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▇▇██████
train_loss,█▃▂▂▁▁▁▁▁▁
val_acc,▁▁▆█▄▇▄▅▄▂
val_loss,█▆▂▁▆▁▃▄▃▂

0,1
epoch,10.0
train_acc,0.29271
train_loss,2.43934
val_acc,0.15184
val_loss,2.90872


[34m[1mwandb[0m: Agent Starting Run: 2r3c46l1 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	hidden_dim: 16
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▄▅▆▆▇▇▇██
train_loss,█▄▃▃▂▂▂▁▁▁
val_acc,▃▃▄▂▁▅█▆▅▆
val_loss,▇▇▆▆█▄▁▃▃▁

0,1
epoch,10.0
train_acc,0.28957
train_loss,2.43937
val_acc,0.19954
val_loss,2.76052


[34m[1mwandb[0m: Agent Starting Run: d1dano18 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▅▆▆▇▇███
train_loss,█▆▄▃▃▂▂▁▁▁
val_acc,▁▃▅▅▆▇▇▇██
val_loss,█▆▄▃▂▂▂▁▁▁

0,1
epoch,10.0
train_acc,0.70037
train_loss,1.05826
val_acc,0.63007
val_loss,1.34802


[34m[1mwandb[0m: Agent Starting Run: gf56peob with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	hidden_dim: 32
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▅▅▆▇▇▇██
train_loss,█▆▄▃▃▂▂▁▁▁
val_acc,▁▃▄▅▆▆▇▇██
val_loss,█▆▅▄▃▃▂▂▁▁

0,1
epoch,10.0
train_acc,0.69497
train_loss,1.08744
val_acc,0.62729
val_loss,1.35923


[34m[1mwandb[0m: Agent Starting Run: 4qqiqqx2 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▂▄▆▇▇████
train_loss,█▇▄▃▂▂▁▁▁▁
val_acc,▁▃▅▆▇▇████
val_loss,█▆▄▃▂▂▁▁▁▁

0,1
epoch,10.0
train_acc,0.84742
train_loss,0.57087
val_acc,0.79851
val_loss,0.85251


[34m[1mwandb[0m: Agent Starting Run: na8i2d9a with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇██████
train_loss,█▃▂▂▁▁▁▁▁▁
val_acc,▁▇████████
val_loss,█▂▁▁▁▁▁▂▁▂

0,1
epoch,10.0
train_acc,0.95971
train_loss,0.14975
val_acc,0.83406
val_loss,0.88809


[34m[1mwandb[0m: Agent Starting Run: 5c4gijwq with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 16
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▇▇██████
train_loss,█▄▂▂▂▁▁▁▁▁
val_acc,▁▆▇███████
val_loss,█▃▁▁▁▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.95373
train_loss,0.17364
val_acc,0.83571
val_loss,0.85594


[34m[1mwandb[0m: Agent Starting Run: rync5cs1 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 16
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▇▇██████
train_loss,█▄▂▂▂▁▁▁▁▁
val_acc,▁▆▇███████
val_loss,█▃▁▁▁▁▁▁▁▂

0,1
epoch,10.0
train_acc,0.95452
train_loss,0.17049
val_acc,0.82772
val_loss,0.93541


[34m[1mwandb[0m: Agent Starting Run: ypso0sn7 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇██████
train_loss,█▃▂▂▁▁▁▁▁▁
val_acc,▁▆▇███████
val_loss,█▂▁▁▁▁▂▂▂▂

0,1
epoch,10.0
train_acc,0.95848
train_loss,0.15382
val_acc,0.83485
val_loss,0.90476


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: t7iyy6ww with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▅▇▇██████
train_loss,█▄▂▂▁▁▁▁▁▁
val_acc,▁▇████████
val_loss,█▂▁▁▁▁▁▂▁▂

0,1
epoch,10.0
train_acc,0.95494
train_loss,0.16792
val_acc,0.8375
val_loss,0.86154


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: zy0u96rc with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	hidden_dim: 64
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▃▆▇▇▇████
train_loss,█▅▃▂▂▂▁▁▁▁
val_acc,▁▄▆▇▇█████
val_loss,█▅▃▂▂▁▁▁▁▁

0,1
epoch,10.0
train_acc,0.85151
train_loss,0.55459
val_acc,0.79586
val_loss,0.86701


[34m[1mwandb[0m: Agent Starting Run: z0xowoni with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	hidden_dim: 256
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_acc,▁▆▇▇██████
train_loss,█▃▂▂▁▁▁▁▁▁
val_acc,▁▆▇███████
val_loss,█▂▁▁▂▂▂▂▂▃

0,1
epoch,10.0
train_acc,0.96119
train_loss,0.1434
val_acc,0.82345
val_loss,0.95153


In [43]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
from collections import Counter
import torch.nn as nn

# --------------------------
# Vocabulary
# --------------------------
def build_vocab_from_data(path, column=0):
    counter = Counter()
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
            text = parts[column]
            counter.update(list(text))
    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    for i, char in enumerate(sorted(counter.keys()), start=4):
        vocab[char] = i
    return vocab

# --------------------------
# Dataset
# --------------------------
class TransliterationDataset(Dataset):
    def __init__(self, path, input_vocab, output_vocab):
        self.data = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) < 2:
                    continue
                target, latin = parts[0], parts[1]
                src = [input_vocab['<sos>']] + [input_vocab.get(c, input_vocab['<unk>']) for c in latin] + [input_vocab['<eos>']]
                trg = [output_vocab['<sos>']] + [output_vocab.get(c, output_vocab['<unk>']) for c in target] + [output_vocab['<eos>']]
                self.data.append((torch.tensor(src), torch.tensor(trg)))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def pad_collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_lens = [len(seq) for seq in src_batch]
    trg_lens = [len(seq) for seq in trg_batch]
    max_src_len, max_trg_len = max(src_lens), max(trg_lens)

    pad_src = [torch.cat([seq, torch.full((max_src_len - len(seq),), input_vocab['<pad>'])]) for seq in src_batch]
    pad_trg = [torch.cat([seq, torch.full((max_trg_len - len(seq),), output_vocab['<pad>'])]) for seq in trg_batch]

    return torch.stack(pad_src), torch.stack(pad_trg)

# --------------------------
# Model
# --------------------------
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers, dropout, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1

        self.rnn = nn.LSTM(
            emb_dim, hidden_dim, num_layers,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=bidirectional, batch_first=True
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src: [batch_size, src_len]
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs: [batch_size, src_len, hidden_dim * num_directions]
        # hidden, cell: [num_layers * num_directions, batch_size, hidden_dim]
        return outputs, hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers, dropout, bidirectional_encoder=False):
        super().__init__()
        self.output_dim = output_dim  # <-- Add this line

        self.embedding = nn.Embedding(output_dim, emb_dim)

        input_hidden_dim = hidden_dim * 2 if bidirectional_encoder else hidden_dim

        self.rnn = nn.LSTM(
            emb_dim, hidden_dim, num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        if input.dim() == 0:
            input = input.unsqueeze(0)  # scalar to [1]
        input = input.unsqueeze(1)      # add seq_len dim -> [batch_size, 1]
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, bidirectional_encoder=False):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.bidirectional_encoder = bidirectional_encoder


    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        if self.bidirectional_encoder:
            hidden = self._concat_directions(hidden)
            cell = self._concat_directions(cell)

        input = trg[:, 0]  # first input token is <sos>

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1

        return outputs

    def _concat_directions(self, h):
        # [num_layers * 2, batch_size, hidden_dim] → [num_layers, batch_size, hidden_dim * 2]
        return torch.cat((h[0:h.size(0):2], h[1:h.size(0):2]), dim=2)

        
def beam_search_decode(model, src, input_vocab, output_vocab, beam_width=3, max_len=100):
    model.eval()
    src = src.unsqueeze(0).to(model.device)  # Add batch dimension

    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src)

        if model.bidirectional_encoder:
            hidden = model._concat_directions(hidden)
            cell = model._concat_directions(cell)

        idx2char = {v: k for k, v in output_vocab.items()}
        start_token = output_vocab['<sos>']
        end_token = output_vocab['<eos>']

        # Each beam holds (sequence, hidden, cell, score)
        beams = [(torch.tensor([start_token], device=model.device), hidden, cell, 0.0)]
        completed = []

        for _ in range(max_len):
            new_beams = []
            for seq, hidden, cell, score in beams:
                if seq[-1].item() == end_token:
                    completed.append((seq, hidden, cell, score))
                    continue

                # Decoder forward step requires input token, hidden, cell
                output, hidden_new, cell_new = model.decoder(seq[-1], hidden, cell)
                log_probs = torch.log_softmax(output, dim=1).squeeze(0)  # shape: vocab_size
                topk_probs, topk_idxs = log_probs.topk(beam_width)

                for prob, idx in zip(topk_probs, topk_idxs):
                    new_seq = torch.cat([seq, idx.unsqueeze(0)])
                    new_beams.append((new_seq, hidden_new, cell_new, score + prob.item()))

            # Keep top beam_width beams only
            beams = sorted(new_beams, key=lambda x: x[3], reverse=True)[:beam_width]

            # If all beams ended with <eos>, stop early
            if all(seq[-1].item() == end_token for seq, _, _, _ in beams):
                completed.extend(beams)
                break

        if not completed:
            completed = beams

        # Pick best scoring completed sequence
        best_seq = max(completed, key=lambda x: x[3])[0]

        # Decode to characters, skip special tokens
        decoded = [idx2char[idx.item()] for idx in best_seq if idx.item() not in {start_token, end_token, output_vocab['<pad>']}]
        return ''.join(decoded)

def transliterate_test_set(model, test_loader, input_vocab, output_vocab, beam_width=3):
    model.eval()
    idx2input = {v: k for k, v in input_vocab.items()}
    predictions = []

    with torch.no_grad():
        for src_batch, trg_batch in test_loader:
            for i in range(src_batch.size(0)):
                src_seq = src_batch[i]
                input_str = ''.join([idx2input[idx.item()] for idx in src_seq if idx.item() not in {input_vocab['<pad>'], input_vocab['<sos>'], input_vocab['<eos>']}])
                pred = beam_search_decode(model, src_seq, input_vocab, output_vocab, beam_width=beam_width)
                predictions.append((input_str, pred))

    return predictions

# --------------------------
# Evaluation
# --------------------------
def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss, total_correct, total_tokens = 0, 0, 0

    with torch.no_grad():
        for src, trg in data_loader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0)

            output = output[:, 1:].reshape(-1, output.shape[-1])
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            total_loss += loss.item()

            pred = output.argmax(1)
            non_pad = trg != PAD_IDX
            total_correct += (pred == trg)[non_pad].sum().item()
            total_tokens += non_pad.sum().item()

    return total_loss / len(data_loader), total_correct / total_tokens

# --------------------------
# Run Evaluation
# --------------------------
data_path = "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/ta/lexicons"
train_path = os.path.join(data_path, "ta.translit.sampled.train.tsv")
test_path = os.path.join(data_path, "ta.translit.sampled.test.tsv")

input_vocab = build_vocab_from_data(train_path, column=1)
output_vocab = build_vocab_from_data(train_path, column=0)
PAD_IDX = output_vocab['<pad>']

test_dataset = TransliterationDataset(test_path, input_vocab, output_vocab)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=pad_collate_fn)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embedding_dim = 256
hidden_dim = 256
num_layers = 3
bidirectional = False  # MUST be unidirectional

encoder = Encoder(input_dim=len(input_vocab), emb_dim=embedding_dim, hidden_dim=hidden_dim,
                  num_layers=num_layers, dropout=0.3, bidirectional=bidirectional)
decoder = Decoder(output_dim=len(output_vocab), emb_dim=embedding_dim, hidden_dim=hidden_dim,
                  num_layers=num_layers, dropout=0.3)

model = Seq2Seq(encoder, decoder, device, bidirectional_encoder=bidirectional).to(device)
model.load_state_dict(torch.load("best_model.pt", map_location=device))

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
test_loss, test_acc = evaluate(model, test_loader, criterion, device)

print(f"✅ Test Loss: {test_loss:.4f}")
print(f"✅ Test Accuracy: {test_acc * 100:.2f}%")

print("\n🔍 Beam Search Transliteration Samples:")
beam_predictions = transliterate_test_set(model, test_loader, input_vocab, output_vocab, beam_width=5)

for i, (input_str, pred_str) in enumerate(beam_predictions[:10]):
    print(f"Input: {input_str} → Predicted: {pred_str}")


✅ Test Loss: 0.8357
✅ Test Accuracy: 81.49%

🔍 Beam Search Transliteration Samples:
Input: faarm → Predicted: பார்ம்
Input: farm → Predicted: பார்
Input: form → Predicted: போர்ம்
Input: hpaarm → Predicted: யார்ம்
Input: face → Predicted: பேக்
Input: hpaes → Predicted: யேஸ்
Input: pace → Predicted: பேக்
Input: paes → Predicted: பேஸ்
Input: phase → Predicted: பேச்
Input: aeathimuka → Predicted: ஏதிமுக
