In [17]:
# ─── Python Built-ins & Utilities ─────────────────────────────────────────────
import os                      # filesystem operations
import re                      # regex handling
import random                  # random number generation

# ─── Data Processing & Tracking ───────────────────────────────────────────────
import pandas                  # tabular data manipulation
from tqdm.auto import tqdm     # progress bars
import wandb                   # experiment logging

# ─── PyTorch Core ─────────────────────────────────────────────────────────────
import torch                   # tensor operations & autograd
import torch.nn as nn          # neural network layers
import torch.optim as optim    # optimizers
import torch.nn.functional as F  # activation & loss functions
from torch.utils.data import Dataset, DataLoader  # data pipeline


In [18]:
# ─── Experiment Tracker Setup ─────────────────────────────────────────────────
# Ensure a WANDB key is present, else apply the embedded fallback
WANDB_KEY = "8f2f82255a6e5ea16321da3895ae6b00d50eb5b5"
os.environ.setdefault("WANDB_API_KEY", WANDB_KEY)
try:
    # ‘relogin=True’ forces a fresh session if needed
    wandb.login(key=WANDB_KEY, relogin=True)
except wandb.errors.UsageError:
    # Already authenticated or bad key—ignore quietly
    pass

# ─── Reproducibility & Hardware Choice ────────────────────────────────────────
SEED = 42
random.seed(SEED)           # Python RNG
torch.manual_seed(SEED)     # PyTorch RNG
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ─── Data Directory Specification ─────────────────────────────────────────────
# Path to the Dakshina lexicon files; swap 'hi' with any other language code as needed
LEXICON_ROOT = "/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons"
# e.g., for Marathi:
# LEXICON_ROOT = "/kaggle/input/dakshina/dakshina_dataset_v1.0/mr/lexicons"




In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ─── Character-Level Embeddings ───────────────────────────────────────────────
class CharEmbedder(nn.Module):
    """Maps character indices to dense vectors."""
    def __init__(self, vocab_size: int, emb_dim: int):
        super().__init__()
        self.lookup = nn.Embedding(vocab_size, emb_dim)

    def forward(self, indices: torch.Tensor) -> torch.Tensor:
        # input: [batch, seq_len] → output: [batch, seq_len, emb_dim]
        return self.lookup(indices)


# ─── Sequence Encoder (RNN/GRU/LSTM) ──────────────────────────────────────────
class SeqEncoder(nn.Module):
    """
    Encodes token sequences into contextual representations.
    
    Args:
      vocab_size: size of input token set
      hid_dim: size of hidden state
      emb_dim: size of embedding vectors
      layers: number of recurrent layers
      rnn_type: 'GRU', 'LSTM', or 'RNN'
      drop: dropout probability between RNN layers
      bidir: bidirectional flag
    """
    def __init__(
        self,
        vocab_size: int,
        hid_dim: int,
        emb_dim: int,
        layers: int = 1,
        rnn_type: str = "GRU",
        drop: float = 0.1,
        bidir: bool = False,
    ):
        super().__init__()
        self.hid_dim = hid_dim
        self.num_dirs = 2 if bidir else 1

        # embed → dropout → RNN
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.drop = nn.Dropout(drop)
        core_drop = drop if layers > 1 else 0.0

        rnn_cls = {
            "GRU": nn.GRU,
            "LSTM": nn.LSTM,
            "RNN": nn.RNN
        }[rnn_type]
        rnn_kwargs = dict(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=layers,
            dropout=core_drop,
            bidirectional=bidir,
            batch_first=True
        )
        # For vanilla RNN, specify nonlinearity
        if rnn_type == "RNN":
            rnn_kwargs["nonlinearity"] = "tanh"

        self.rnn = rnn_cls(**rnn_kwargs)

    def forward(self, tokens: torch.Tensor):
        # tokens: [batch, seq_len]
        emb = self.drop(self.embed(tokens))  # [batch, seq_len, emb_dim]
        outputs, state = self.rnn(emb)
        return outputs, state  # outputs: all timesteps; state: final hidden (and cell)


# ─── Token-Level Decoder (Stepwise) ──────────────────────────────────────────
class SeqDecoder(nn.Module):
    """
    Generates tokens one step at a time.
    
    Args:
      out_size: target vocabulary size
      hid_dim: hidden state dimension
      emb_dim: embedding vector size
      layers: number of RNN layers
      rnn_type: 'GRU', 'LSTM', or 'RNN'
      drop: dropout probability before and after RNN
    """
    def __init__(
        self,
        out_size: int,
        hid_dim: int,
        emb_dim: int,
        layers: int = 1,
        rnn_type: str = "GRU",
        drop: float = 0.1,
    ):
        super().__init__()
        # embed → RNN → dropout → linear → log-softmax
        self.embed = nn.Embedding(out_size, emb_dim)
        self.rnn_drop = nn.Dropout(drop)
        core_drop = drop if layers > 1 else 0.0

        rnn_cls = {
            "GRU": nn.GRU,
            "LSTM": nn.LSTM,
            "RNN": nn.RNN
        }[rnn_type]
        rnn_kwargs = dict(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=layers,
            dropout=core_drop,
            batch_first=True
        )
        if rnn_type == "RNN":
            rnn_kwargs["nonlinearity"] = "tanh"

        self.rnn = rnn_cls(**rnn_kwargs)
        self.out_drop = nn.Dropout(core_drop)
        self.project = nn.Linear(hid_dim, out_size)

    def forward(self, input_tok: torch.Tensor, prev_state):
        """
        Args:
          input_tok: [batch, 1] token indices for current timestep
          prev_state: previous hidden (and cell) state
        Returns:
          log_probs: [batch, out_size]
          new_state: updated hidden (and cell) state
        """
        emb = self.embed(input_tok)           # [batch, 1, emb_dim]
        emb = self.rnn_drop(emb)
        rnn_out, new_state = self.rnn(emb, prev_state)
        dropped = self.out_drop(rnn_out[:, 0, :])  # take timestep dim
        logits = self.project(dropped)
        return F.log_softmax(logits, dim=-1), new_state


In [20]:
#  Perform beam search decoding with the trained seq2seq model.
def beam_decode(
    model,
    src_batch: torch.Tensor,
    start_idx: int,
    end_idx: int,
    max_len: int = 30,
    beam_size: int = 3,
    device: torch.device = DEVICE,
):
    # ─── Prepare Model & Encode ──────────────────────────────────────────────────
    model.eval()
    with torch.no_grad():
        enc_outs, enc_state = model.encoder(src_batch.to(device))

        # ─── Initialize Decoder State ──────────────────────────────────────────────
        if model.bidirectional:
            # Merge forward/backward layers
            def _merge(h, c=None):
                L = model.encoder.num_layers
                h_new = torch.zeros(L, 1, model.decoder.hidden_size, device=device)
                c_new = None if c is None else torch.zeros_like(h_new)
                for i in range(L):
                    f, b = h[2*i], h[2*i+1]
                    h_new[i] = model.hidden_transform(torch.cat((f, b), dim=1))
                    if c is not None:
                        fc, bc = c[2*i], c[2*i+1]
                        c_new[i] = model.hidden_transform(torch.cat((fc, bc), dim=1))
                return (h_new, c_new) if c is not None else h_new

            if isinstance(enc_state, tuple):  # LSTM
                dec_state = _merge(*enc_state)
            else:                             # GRU/RNN
                dec_state = _merge(enc_state)
        else:
            dec_state = enc_state

        # ─── Beam Search Setup ─────────────────────────────────────────────────────
        beams = [([start_idx], 0.0, dec_state)]  # (tokens, score, state)
        completed = []

        # ─── Expand Beams ─────────────────────────────────────────────────────────
        for _ in range(max_len):
            candidates = []
            for seq, score, state in beams:
                if seq[-1] == end_idx:
                    completed.append((seq, score))
                    continue
                inp = torch.tensor([[seq[-1]]], device=device)
                logp, next_state = model.decoder(inp, state)
                topv, topi = torch.topk(logp.squeeze(0), beam_size)
                for v, i in zip(topv, topi):
                    candidates.append((seq + [i.item()], score + v.item(), next_state))
            beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]
            if not beams:
                break

        # ─── Finalize Outputs ──────────────────────────────────────────────────────
        for seq, score, _ in beams:
            if seq[-1] == end_idx:
                completed.append((seq, score))
        if not completed:
            completed = [(s, sc) for s, sc, _ in beams]

        return sorted(completed, key=lambda x: x[1], reverse=True)


In [21]:
import torch
import torch.nn as nn
import random

# ─── End-to-End Seq2Seq Wrapper ────────────────────────────────────────────────
class Seq2SeqModel(nn.Module):
    """
    Combines encoder and decoder for sequence-to-sequence tasks.
    
    Args:
      src_vocab: source vocabulary size
      tgt_vocab: target vocabulary size
      emb_dim: embedding dimension for both encoder & decoder
      hid_dim: hidden state size (must match between enc/dec)
      enc_layers: number of encoder RNN layers
      dec_layers: number of decoder RNN layers
      rnn_type: 'GRU', 'LSTM', or 'RNN'
      drop_p: dropout rate on embeddings & RNN layers
      bidir_enc: if True, encoder is bidirectional
    """
    def __init__(
        self,
        src_vocab: int,
        tgt_vocab: int,
        emb_dim: int = 256,
        hid_dim: int = 256,
        enc_layers: int = 1,
        dec_layers: int = 1,
        rnn_type: str = "GRU",
        drop_p: float = 0.2,
        bidir_enc: bool = False,
    ):
        super().__init__()
        # ─── Encoder Setup ───────────────────────────────────────────────────────
        self.encoder = SeqEncoder(
            vocab_size=src_vocab,
            hid_dim=hid_dim,
            emb_dim=emb_dim,
            layers=enc_layers,
            rnn_type=rnn_type,
            drop=drop_p,
            bidir=bidir_enc,
        )
        self.bidir = bidir_enc
        # transform bilinear hidden → decoder size if bidirectional
        if bidir_enc:
            self.hidden_proj = nn.Linear(hid_dim * 2, hid_dim)
        # ─── Decoder Setup ───────────────────────────────────────────────────────
        self.decoder = SeqDecoder(
            out_size=tgt_vocab,
            hid_dim=hid_dim,
            emb_dim=emb_dim,
            layers=dec_layers,
            rnn_type=rnn_type,
            drop=drop_p,
        )
        self.rnn_type = rnn_type

    # ─── Align Hidden Layers ────────────────────────────────────────────────────
    def _match_layers(self, h: torch.Tensor, bsz: int):
        # trim or pad hidden states to decoder layer count
        dl = self.decoder.rnn.num_layers
        if h.size(0) > dl:
            return h[:dl]
        if h.size(0) < dl:
            pad = torch.zeros(dl - h.size(0), bsz, h.size(2), device=h.device)
            return torch.cat([h, pad], dim=0)
        return h

    def forward(self, src: torch.Tensor, tgt: torch.Tensor, tf_ratio: float = 0.5):
        # ─── Prep & Encode ───────────────────────────────────────────────────────
        bsz, tgt_len = src.size(0), tgt.size(1)
        out_vocab = self.decoder.project.out_features if hasattr(self.decoder, 'project') else self.decoder.project.weight.size(0)
        outputs = torch.zeros(bsz, tgt_len, out_vocab, device=src.device)

        enc_outs, enc_state = self.encoder(src)
        # ─── Init Decoder State ──────────────────────────────────────────────────
        if self.bidir:
            # combine forward/back encoder states
            if self.rnn_type == "LSTM":
                h_n, c_n = enc_state
                h_dec = torch.zeros(self.decoder.rnn.num_layers, bsz, self.decoder.rnn.hidden_size, device=src.device)
                c_dec = torch.zeros_like(h_dec)
                for i in range(self.decoder.rnn.num_layers):
                    layer = min(i, self.encoder.rnn.num_layers - 1)
                    h_cat = torch.cat((h_n[2*layer], h_n[2*layer+1]), dim=1)
                    c_cat = torch.cat((c_n[2*layer], c_n[2*layer+1]), dim=1)
                    h_dec[i] = self.hidden_proj(h_cat)
                    c_dec[i] = self.hidden_proj(c_cat)
                dec_state = (h_dec, c_dec)
            else:
                h_n = enc_state
                h_dec = torch.zeros(self.decoder.rnn.num_layers, bsz, self.decoder.rnn.hidden_size, device=src.device)
                for i in range(self.decoder.rnn.num_layers):
                    layer = min(i, self.encoder.rnn.num_layers - 1)
                    h_cat = torch.cat((h_n[2*layer], h_n[2*layer+1]), dim=1)
                    h_dec[i] = self.hidden_proj(h_cat)
                dec_state = h_dec
        else:
            # match layer counts for unidirectional
            if self.rnn_type == "LSTM":
                h, c = enc_state
                dec_state = (
                    self._match_layers(h, bsz),
                    self._match_layers(c, bsz),
                )
            else:
                dec_state = self._match_layers(enc_state, bsz)

        # ─── Decode with Teacher Forcing ──────────────────────────────────────────
        input_tok = tgt[:, 0].unsqueeze(1)  # first token (<sos>)
        for t in range(1, tgt_len):
            probs, dec_state = self.decoder(input_tok, dec_state)
            outputs[:, t, :] = probs
            # decide next input
            top1 = probs.argmax(1).unsqueeze(1)
            if random.random() < tf_ratio:
                input_tok = tgt[:, t].unsqueeze(1)
            else:
                input_tok = top1

        return outputs


In [22]:
import os
import torch
from torch.utils.data import Dataset, DataLoader

# ─── Dataset for Romanized ↔ Native Transliterations ───────────────────────────
class TranslitDataset(Dataset):
    """
    Reads tab-separated transliteration pairs and optionally builds vocabularies.
    """
    def __init__(self, filepath, src_vocab=None, tgt_vocab=None, build_vocab=False):
        self.pairs = []
        # load all (src, tgt) pairs
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 2:
                    tgt, src = parts[0], parts[1]
                    self.pairs.append((src, tgt))

        # build or assign vocabularies
        if build_vocab:
            self.src_vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
            self.tgt_vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
            for src, tgt in self.pairs:
                for ch in src:
                    self.src_vocab.setdefault(ch, len(self.src_vocab))
                for ch in tgt:
                    self.tgt_vocab.setdefault(ch, len(self.tgt_vocab))
        else:
            assert src_vocab and tgt_vocab, "Provide vocabularies if not building."
            self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        # convert chars → indices with <unk> fallback
        src_ids = [self.src_vocab.get(c, self.src_vocab["<unk>"]) for c in src]
        tgt_ids = (
            [self.tgt_vocab["<sos>"]]
            + [self.tgt_vocab.get(c, self.tgt_vocab["<unk>"]) for c in tgt]
            + [self.tgt_vocab["<eos>"]]
        )
        return torch.tensor(src_ids, dtype=torch.long), torch.tensor(tgt_ids, dtype=torch.long)


# ─── Batch Collation & Padding ─────────────────────────────────────────────────
def pad_collate(batch):
    """
    Pads source and target sequences in batch to their max lengths.
    Returns:
      padded_src: (bs, max_src), padded_tgt: (bs, max_tgt)
    """
    src_seqs, tgt_seqs = zip(*batch)
    max_src = max(len(s) for s in src_seqs)
    max_tgt = max(len(t) for t in tgt_seqs)

    pad_idx = 0
    padded_src = torch.full((len(batch), max_src), pad_idx, dtype=torch.long)
    padded_tgt = torch.full((len(batch), max_tgt), pad_idx, dtype=torch.long)

    for i, (s, t) in enumerate(zip(src_seqs, tgt_seqs)):
        padded_src[i, : len(s)] = s
        padded_tgt[i, : len(t)] = t

    return padded_src, padded_tgt


# ─── DataLoader Factory ─────────────────────────────────────────────────────────
def get_dataloaders(root_dir, batch_size: int, build_vocab: bool = False):
    """
    Constructs train/val/test DataLoaders and returns vocab info.
    Returns:
      train_loader, val_loader, test_loader,
      src_vocab_size, tgt_vocab_size, pad_index, src_vocab, tgt_vocab
    """
    # file paths for Hindi transliteration splits
    train_file = os.path.join(root_dir, "hi.translit.sampled.train.tsv")
    val_file   = os.path.join(root_dir, "hi.translit.sampled.dev.tsv")
    test_file  = os.path.join(root_dir, "hi.translit.sampled.test.tsv")

    # build vocab on training set if requested
    train_ds = TranslitDataset(train_file, build_vocab=build_vocab)
    src_vocab, tgt_vocab = train_ds.src_vocab, train_ds.tgt_vocab

    # reuse vocabs for dev & test
    val_ds  = TranslitDataset(val_file,  src_vocab, tgt_vocab)
    test_ds = TranslitDataset(test_file, src_vocab, tgt_vocab)

    # DataLoader instances
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  collate_fn=pad_collate)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, collate_fn=pad_collate)
    test_loader  = DataLoader(test_ds,  batch_size=1,            shuffle=False, collate_fn=pad_collate)

    return (
        train_loader,
        val_loader,
        test_loader,
        len(src_vocab),
        len(tgt_vocab),
        src_vocab["<pad>"],
        src_vocab,
        tgt_vocab,
    )


In [23]:
class TrainingEarlyStopper:
    """
    Halts training when the monitored metric stops improving.
    
    Args:
        patience: number of checks with no improvement before stopping
        min_delta: minimal increase to qualify as an improvement
    """
    def __init__(self, patience: int = 5, min_delta: float = 1e-4):
        self.patience = patience
        self.min_delta = min_delta
        self.best_score = None
        self.wait = 0

    def should_stop(self, current_score: float) -> bool:
        """
        Check if training should stop based on current metric.
        
        Args:
            current_score: latest value of the monitored metric
        Returns:
            True if no improvement for `patience` calls, else False
        """
        # first call or significant improvement
        if self.best_score is None or current_score > self.best_score + self.min_delta:
            self.best_score = current_score
            self.wait = 0
        else:
            self.wait += 1
        
        # stop if waited too long
        return self.wait >= self.patience


In [24]:
# ─── WandB Sweep Configuration ────────────────────────────────────────────────
sweep_cfg = {
    "method": "bayes",  # Bayesian hyperparameter search
    "metric": {
        "name": "val_acc", 
        "goal": "maximize"
    },
    "early_terminate": {
        "type": "hyperband",
        "min_iter": 2,
        "max_iter": 8,
        "s": 2
    },
    "parameters": {
        # Model & training sizes
        "batch_size":            {"values": [16, 32, 64, 128, 256]},
        "num_epochs":            {"values": [10]},
        "encoder_layers":        {"values": [1, 2, 3]},
        "decoder_layers":        {"values": [1, 2, 3]},
        "hidden_size":           {"values": [16, 32, 64, 128, 256, 512, 1024]},
        "embedding_dim":         {"values": [16, 32, 64, 256, 512]},
        "dropout_rate":          {"values": [0.2, 0.3, 0.4]},
        "bi_directional":        {"values": [True, False]},
        # Search & decoding
        "beam_width":            {"values": [1, 3, 5]},
        "teacher_forcing_ratio": {"values": [0.0, 0.3, 0.5, 0.7, 1.0]},
        "length_penalty":        {"values": [0, 0.4, 0.5, 0.6]},
        # Optimization
        "optimizer":             {"values": ["adam", "sgd", "rmsprop", "adagrad"]},
        "learning_rate":         {"values": [0.005, 0.001, 0.01, 0.1]},
        # RNN cell variants
        "rnn_cell":              {"values": ["RNN", "GRU", "LSTM"]},
    }
}



Final Test Acc: 0.00%


0,1
epoch,▁▁▂▂▄▅▄▇█▅
test_accuracy,▁
train_acc,▃▃▃█▁▁█▃▃▆
train_loss,▁▄▁▂▁▁█▁▁▅
val_acc,▁▁▁█▁▁█▁▁▁
val_loss,▁▄▁▂▁▁▅▁▁█

0,1
epoch,4.0
test_accuracy,0.0
train_acc,0.00452
train_loss,42183814.66699
val_acc,0.0
val_loss,47228.40837


In [25]:
# ─── Optimized Sweep Parameters ───────────────────────────────────────────────
best_cfg = {
    "parameters": {
        # batch & epochs
        "batch_size":            {"values": [64]},
        "num_epochs":            {"values": [10]},
        # model architecture
        "encoder_layers":        {"values": [2]},
        "decoder_layers":        {"values": [2]},
        "hidden_size":           {"values": [256]},
        "embedding_dim":         {"values": [64]},
        "dropout_rate":          {"values": [0.4]},
        "bi_directional":        {"values": [False]},
        # decoding & loss
        "beam_width":            {"values": [5]},
        "length_penalty":        {"values": [0, 4]},
        "teacher_forcing_ratio": {"values": [1.0]},
        # optimization
        "optimizer":             {"values": ["adam"]},
        "learning_rate":         {"values": [0.001]},
        # RNN variant
        "rnn_cell":              {"values": ["LSTM"]},
    }
}


In [26]:
def run_training():
    # ─── Initialize W&B & Naming ──────────────────────────────────────────────
    with wandb.init():
        cfg = wandb.config
        run_name = (
            f"{cfg.rnn_cell.lower()}_dp{int(cfg.dropout_rate*100)}"
            f"_bw{cfg.beam_width}_tf{int(cfg.teacher_forcing_ratio*100)}"
            f"_emb{cfg.embedding_dim}_hid{cfg.hidden_size}"
            f"_enc{cfg.encoder_layers}_dec{cfg.decoder_layers}"
        )
        wandb.run.name = run_name

        # ─── Load Data & Build Vocab ────────────────────────────────────────────
        train_loader, val_loader, test_loader, src_sz, tgt_sz, pad_idx, src_vocab, tgt_vocab = get_dataloaders(
            LEXICON_ROOT, batch_size=cfg.batch_size, build_vocab=True
        )
        idx2char = {i: ch for ch, i in tgt_vocab.items()}

        # ─── Model, Loss & Optimizer ────────────────────────────────────────────
        model = Seq2SeqModel(
            src_vocab=src_sz,
            tgt_vocab=tgt_sz,
            emb_dim=cfg.embedding_dim,
            hid_dim=cfg.hidden_size,
            enc_layers=cfg.encoder_layers,
            dec_layers=cfg.decoder_layers,
            rnn_type=cfg.rnn_cell,
            drop_p=cfg.dropout_rate,
            bidir_enc=cfg.bi_directional,
        ).to(DEVICE)

        # Use Adam optimizer exclusively
        optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learning_rate)
        criterion = nn.NLLLoss(ignore_index=pad_idx)
        stopper = TrainingEarlyStopper(patience=5, min_delta=1e-4)
        best_val_acc = 0.0

        # ─── Training Loop ────────────────────────────────────────────────────────
        for epoch in range(1, cfg.num_epochs + 1):
            model.train()
            total_loss = 0.0
            correct, total = 0, 0

            for src, tgt in tqdm(train_loader, desc=f"[Epoch {epoch}] Train", leave=False):
                src, tgt = src.to(DEVICE), tgt.to(DEVICE)
                optimizer.zero_grad()
                outputs = model(src, tgt, tf_ratio=cfg.teacher_forcing_ratio)
                loss = criterion(outputs.view(-1, tgt_sz), tgt.view(-1))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

                # compute sequence accuracy
                preds = outputs.argmax(dim=2)
                for p, t in zip(preds, tgt):
                    p_trim = p[1:][t[1:] != pad_idx]
                    t_trim = t[1:][t[1:] != pad_idx]
                    if torch.equal(p_trim, t_trim):
                        correct += 1
                    total += 1

            train_acc = 100 * correct / total

            # ─── Validation ────────────────────────────────────────────────────────
            model.eval()
            val_loss, correct, total = 0.0, 0, 0
            with torch.no_grad():
                for src, tgt in tqdm(val_loader, desc=f"[Epoch {epoch}] Val", leave=False):
                    src, tgt = src.to(DEVICE), tgt.to(DEVICE)
                    outputs = model(src, tgt, tf_ratio=0.0)
                    val_loss += criterion(outputs.view(-1, tgt_sz), tgt.view(-1)).item()
                    preds = outputs.argmax(dim=2)
                    for p, t in zip(preds, tgt):
                        p_trim = p[1:][t[1:] != pad_idx]
                        t_trim = t[1:][t[1:] != pad_idx]
                        if torch.equal(p_trim, t_trim):
                            correct += 1
                        total += 1

            val_acc = 100 * correct / total
            avg_val_loss = val_loss / len(val_loader)

            wandb.log({
                "epoch": epoch,
                "train_loss": total_loss,
                "train_acc": train_acc,
                "val_loss": avg_val_loss,
                "val_acc": val_acc,
            })
            print(f"[Epoch {epoch}] TL={total_loss:.3f} TA={train_acc:.2f}% │ VL={avg_val_loss:.3f} VA={val_acc:.2f}%")

            # early stopping
            if val_acc > best_val_acc:
                best_val_acc = val_acc
            elif stopper.should_stop(val_acc):
                print("Early stopping triggered.")
                break

        # ─── Final Test Evaluation ───────────────────────────────────────────────
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for src, tgt in tqdm(test_loader, desc="Test Eval", leave=False):
                src, tgt = src.to(DEVICE), tgt.to(DEVICE)
                outputs = model(src, tgt, tf_ratio=0.0)
                preds = outputs.argmax(dim=2)
                for p, t in zip(preds, tgt):
                    p_trim = p[1:][t[1:] != pad_idx]
                    t_trim = t[1:][t[1:] != pad_idx]
                    if torch.equal(p_trim, t_trim):
                        correct += 1
                    total += 1

        test_acc = 100 * correct / total
        print(f"\nFinal Test Acc: {test_acc:.2f}%")
        wandb.log({"test_accuracy": test_acc})


[Epoch 5] Val:   0%|          | 0/137 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/tmp/ipykernel_35/10891828.py", line 83, in run_training
    wandb.log({
  File "/usr/local/lib/python3.11/dist-packages/wandb/sdk/lib/preinit.py", line 36, in preinit_wrapper
    raise wandb.Error(f"You must call wandb.init() before {name}()")
wandb.errors.errors.Error: You must call wandb.init() before wandb.log()


In [13]:
# ─── Initiate Hyperparameter Sweep ─────────────────────────────────────────────
sweep_id = wandb.sweep(sweep_cfg, project="cs24m020_dl_a3_v2")
wandb.agent(sweep_id, function=run_training, count=100)

# ─── Optional Alternate Sweep Setup ────────────────────────────────────────────
# alt_sweep = "8espi10w"
# wandb.agent(
#     sweep_id=alt_sweep,
#     function=run_training,
#     count=100,
#     entity="cs24m020-indian-institute-of-technology-madras",
#     project="cs24m020_dl_a3_v2",
# )

# ─── Finalize W&B Session ──────────────────────────────────────────────────────
wandb.finish()


Create sweep with ID: qv4thkjl
Sweep URL: https://wandb.ai/cs24m020-indian-institute-of-technology-madras/cs24m020_dl_a3_v2/sweeps/qv4thkjl


[34m[1mwandb[0m: Agent Starting Run: n55epajr with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	beam_width: 1
[34m[1mwandb[0m: 	bi_directional: True
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout_rate: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	length_penalty: 0.4
[34m[1mwandb[0m: 	num_epochs: 10
[34m[1mwandb[0m: 	optimizer: rmsprop
[34m[1mwandb[0m: 	rnn_cell: RNN
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.7
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


[Epoch 1] Train:   0%|          | 0/346 [00:00<?, ?it/s]

In [12]:
# Add this cell at the end of your notebook

# First, get the data loaders and vocabs (same as in your training setup)
train_loader, val_loader, test_loader, src_size, tgt_size, pad_idx, src_vocab, tgt_vocab = get_dataloaders(
    BASE_DIR, batch_size=64, build_vocab=True
)

# Define best configuration based on sweep results
BEST_CONFIG = {
    'embedding_dim': 64,
    'hidden_size': 256,
    'encoder_layers': 2,
    'decoder_layers': 2,
    'cell_type': 'LSTM',
    'dropout_p': 0.4,
    'beam_width': 5,
    'teacher_forcing_ratio': 1.0,
    'bidirectional_encoder': False
}

def evaluate_predictions(model, device, test_loader, tgt_vocab, beam_width=5):
    model.eval()
    total_correct = 0
    total_samples = 0
    
    input_words = []
    decoded_outputs = []
    correct_outputs = []
    results = []
    
    # Create reverse vocabulary mapping
    IDX2CHAR_TGT = {idx: ch for ch, idx in tgt_vocab.items()}
    IDX2CHAR_SRC = {idx: ch for ch, idx in src_vocab.items()}
    
    sos_idx = tgt_vocab['<sos>']
    eos_idx = tgt_vocab['<eos>']
    pad_idx = tgt_vocab['<pad>']
    
    with torch.no_grad():
        for src, tgt in tqdm(test_loader, desc="Evaluating"):
            src = src.to(device)
            tgt = tgt.to(device)
            
            # Get beam search results
            beam_results = beam_search_decode(
                model, src, sos_idx, eos_idx, 
                max_len=30, beam_width=beam_width, device=device
            )
            
            # Get best prediction
            best_seq = beam_results[0][0] if beam_results else []
            
            # Process target sequence (remove <sos>, <eos> and padding)
            true_seq = []
            for idx in tgt[0].tolist():
                if idx == sos_idx:
                    continue
                if idx == eos_idx or idx == pad_idx:
                    break
                true_seq.append(idx)
            
            # Convert indices to characters
            pred_chars = [IDX2CHAR_TGT.get(idx, '<unk>') for idx in best_seq[1:-1]]  # exclude <sos> and <eos>
            true_chars = [IDX2CHAR_TGT.get(idx, '<unk>') for idx in true_seq]
            
            # Convert source indices to characters
            src_chars = [IDX2CHAR_SRC.get(idx, '<unk>') for idx in src[0].tolist() if idx != pad_idx]
            
            # Check if prediction is correct
            is_correct = pred_chars == true_chars
            if is_correct:
                total_correct += 1
            total_samples += 1
            
            # Store results
            input_words.append(''.join(src_chars))
            decoded_outputs.append(''.join(pred_chars))
            correct_outputs.append(''.join(true_chars))
            results.append("Correct" if is_correct else "Incorrect")
    
    # Calculate accuracy
    accuracy = (total_correct / total_samples) * 100
    print(f"\nEvaluation Results:")
    print(f"Total Correct: {total_correct}")
    print(f"Total Samples: {total_samples}")
    print(f"Accuracy: {accuracy:.2f}%")
    
    # Save results to CSV
    results_df = pd.DataFrame({
        'Input_Word': input_words,
        'Decoded_Output': decoded_outputs,
        'True_Output': correct_outputs,
        'Match_Result': results
    })
    results_df.to_csv('predictions_beam_search.csv', index=False)
    print("Predictions saved to 'predictions_beam_search.csv'")
    
    return results_df

# Initialize model with best configuration
best_model = Seq2Seq(
    input_size=len(src_vocab),
    output_size=len(tgt_vocab),
    embedding_dim=BEST_CONFIG['embedding_dim'],
    hidden_size=BEST_CONFIG['hidden_size'],
    encoder_layers=BEST_CONFIG['encoder_layers'],
    decoder_layers=BEST_CONFIG['decoder_layers'],
    cell_type=BEST_CONFIG['cell_type'],
    dropout_p=BEST_CONFIG['dropout_p'],
    bidirectional_encoder=BEST_CONFIG['bidirectional_encoder']
).to(DEVICE)

# Note: You'll need to load your trained model weights here before evaluation
# best_model.load_state_dict(torch.load('best_model_weights.pth'))

# Run evaluation
results_df = evaluate_predictions(
    best_model, 
    DEVICE, 
    test_loader, 
    tgt_vocab, 
    beam_width=BEST_CONFIG['beam_width']
)

# Display some sample predictions
print("\nSample Predictions:")
sample_results = results_df.sample(min(5, len(results_df)))
for _, row in sample_results.iterrows():
    print(f"Input: {row['Input_Word']}")
    print(f"True: {row['True_Output']}")
    print(f"Pred: {row['Decoded_Output']}")
    print(f"Result: {row['Match_Result']}\n")

NameError: name 'BASE_DIR' is not defined

In [None]:
import pandas as pd
import random
from tqdm import tqdm
import torch

# Get data loaders and vocabs
train_loader, val_loader, test_loader, src_size, tgt_size, pad_idx, src_vocab, tgt_vocab = get_dataloaders(
    BASE_DIR, batch_size=64, build_vocab=True
)

# Create reverse vocab mappings
IDX2CHAR_TGT = {idx: ch for ch, idx in tgt_vocab.items()}
IDX2CHAR_SRC = {idx: ch for ch, idx in src_vocab.items()}
sos_idx = tgt_vocab['<sos>']
eos_idx = tgt_vocab['<eos>']
pad_idx = tgt_vocab['<pad>']

# Best configuration
BEST_CONFIG = {
    'embedding_dim': 64,
    'hidden_size': 256,
    'encoder_layers': 2,
    'decoder_layers': 2,
    'cell_type': 'LSTM',
    'dropout_p': 0.4,
    'beam_width': 5,
    'teacher_forcing_ratio': 1.0,
    'bidirectional_encoder': False
}

def fast_beam_search(model, src, sos_idx, eos_idx, max_len=30, beam_width=5, device='cuda'):
    """Optimized single-instance beam search"""
    model.eval()
    with torch.no_grad():
        encoder_outputs, encoder_hidden = model.encoder(src.unsqueeze(0))
        
        if model.bidirectional:
            if model.cell_type == 'LSTM':
                h_n, c_n = encoder_hidden
                h_dec = torch.zeros(model.decoder.num_layers, 1, model.decoder.hidden_size).to(device)
                c_dec = torch.zeros(model.decoder.num_layers, 1, model.decoder.hidden_size).to(device)
                for layer in range(model.decoder.num_layers):
                    enc_layer = min(layer, model.encoder.num_layers - 1)
                    h_combined = torch.cat((h_n[2*enc_layer], h_n[2*enc_layer+1]), dim=1)
                    c_combined = torch.cat((c_n[2*enc_layer], c_n[2*enc_layer+1]), dim=1)
                    h_dec[layer] = model.hidden_transform(h_combined)
                    c_dec[layer] = model.hidden_transform(c_combined)
                decoder_hidden = (h_dec, c_dec)
            else:
                decoder_hidden = torch.zeros(model.decoder.num_layers, 1, model.decoder.hidden_size).to(device)
                for layer in range(model.decoder.num_layers):
                    enc_layer = min(layer, model.encoder.num_layers - 1)
                    h_combined = torch.cat((encoder_hidden[2*enc_layer], encoder_hidden[2*enc_layer+1]), dim=1)
                    decoder_hidden[layer] = model.hidden_transform(h_combined)
        else:
            decoder_hidden = encoder_hidden

        beams = [([sos_idx], 0.0, decoder_hidden)]
        completed = []

        for _ in range(max_len):
            new_beams = []
            for seq, score, hidden in beams:
                if seq[-1] == eos_idx:
                    completed.append((seq, score))
                    continue
                
                input_char = torch.tensor([[seq[-1]]], device=device)
                output, hidden_new = model.decoder(input_char, hidden)
                log_probs = output.squeeze(0)
                topk_log_probs, topk_indices = torch.topk(log_probs, beam_width)
                
                for k in range(beam_width):
                    new_seq = seq + [topk_indices[k].item()]
                    new_score = score + topk_log_probs[k].item()
                    new_beams.append((new_seq, new_score, hidden_new))
            
            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
            if not beams:
                break

        completed += [(seq, score) for seq, score, _ in beams if seq[-1] == eos_idx]
        if not completed:
            completed = beams
            
        return sorted(completed, key=lambda x: x[1], reverse=True)[0][0]

def generate_predictions_csv(model, test_loader, output_file='predictions.csv'):
    """Generate predictions and save to CSV with progress tracking"""
    model.eval()
    device = next(model.parameters()).device
    
    results = {
        'input': [],
        'prediction': [],
        'target': [],
        'is_correct': [],
        'input_length': [],
        'prediction_length': []
    }
    
    with torch.no_grad():
        for src, tgt in tqdm(test_loader, desc="Generating Predictions"):
            src = src.to(device)
            tgt = tgt.to(device)
            
            for i in range(src.size(0)):
                # Process source
                src_seq = src[i].tolist()
                src_str = ''.join([IDX2CHAR_SRC.get(idx, '<unk>') for idx in src_seq if idx != pad_idx])
                
                # Process target
                tgt_seq = []
                for idx in tgt[i].tolist():
                    if idx == sos_idx:
                        continue
                    if idx == eos_idx or idx == pad_idx:
                        break
                    tgt_seq.append(idx)
                tgt_str = ''.join([IDX2CHAR_TGT.get(idx, '<unk>') for idx in tgt_seq])
                
                # Get prediction
                pred_seq = fast_beam_search(
                    model, src[i], sos_idx, eos_idx,
                    beam_width=BEST_CONFIG['beam_width'],
                    device=device
                )
                pred_str = ''.join([IDX2CHAR_TGT.get(idx, '<unk>') for idx in pred_seq[1:-1]])  # remove <sos> and <eos>
                
                # Store results
                results['input'].append(src_str)
                results['prediction'].append(pred_str)
                results['target'].append(tgt_str)
                results['is_correct'].append(pred_str == tgt_str)
                results['input_length'].append(len(src_str))
                results['prediction_length'].append(len(pred_str))
    
    # Create DataFrame and save to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    
    # Calculate and print statistics
    accuracy = df['is_correct'].mean() * 100
    avg_input_len = df['input_length'].mean()
    avg_pred_len = df['prediction_length'].mean()
    
    print(f"\nPrediction Generation Complete")
    print(f"Saved to: {output_file}")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Avg Input Length: {avg_input_len:.1f} chars")
    print(f"Avg Prediction Length: {avg_pred_len:.1f} chars")
    
    return df

# Initialize model
best_model = Seq2Seq(
    input_size=len(src_vocab),
    output_size=len(tgt_vocab),
    embedding_dim=BEST_CONFIG['embedding_dim'],
    hidden_size=BEST_CONFIG['hidden_size'],
    encoder_layers=BEST_CONFIG['encoder_layers'],
    decoder_layers=BEST_CONFIG['decoder_layers'],
    cell_type=BEST_CONFIG['cell_type'],
    dropout_p=BEST_CONFIG['dropout_p'],
    bidirectional_encoder=BEST_CONFIG['bidirectional_encoder']
).to(DEVICE)

# ─── Load Data & Build Vocab ────────────────────────────────────────────
train_loader, val_loader, test_loader, src_sz, tgt_sz, pad_idx, src_vocab, tgt_vocab = get_dataloaders(
        LEXICON_ROOT, batch_size=cfg.batch_size, build_vocab=True
)
idx2char = {i: ch for ch, i in tgt_vocab.items()}
# ─── Model, Loss & Optimizer ────────────────────────────────────────────
model = Seq2SeqModel(
        src_vocab=src_sz,
        tgt_vocab=tgt_sz,
        emb_dim=64,
        hid_dim=256,
        enc_layers=2,
        dec_layers=2,
        rnn_type='LSTM',
        drop_p=0.4,
        bidir_enc=True,
        ).to(DEVICE)
# Load your trained model here (uncomment and modify path)
# best_model.load_state_dict(torch.load('best_model.pth'))

# Generate predictions and save to CSV
predictions_df = generate_predictions_csv(best_model, test_loader, '/kaggle/working/transliteration_predictions.csv')

# Display sample predictions
print("\nSample Predictions:")
sample_df = predictions_df.sample(min(5, len(predictions_df)))
for _, row in sample_df.iterrows():
    print(f"\nInput: {row['input']}")
    print(f"Target: {row['target']}")
    print(f"Prediction: {row['prediction']}")
    print(f"Correct: {'✓' if row['is_correct'] else '✗'}")

In [41]:
import os
import torch
from tqdm.auto import tqdm

# ─── Evaluation & Prediction Dump ───────────────────────────────────────────────
def evaluate_and_save(model, test_loader, idx2char, pad_idx, device, batch_size=1):
    """
    1) Runs the model on the test set
    2) Computes and prints test accuracy
    3) Writes each predicted sequence to /kaggle/working/predictions_vanilla/
    
    Args:
        batch_size: Should be 1 for this implementation as we process one sequence at a time
    """
    out_dir = "/kaggle/working/predictions_vanilla"
    os.makedirs(out_dir, exist_ok=True)

    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for i, (src, tgt) in enumerate(tqdm(test_loader, desc="Test Eval", leave=False)):
            src, tgt = src.to(device), tgt.to(device)
            
            # Validate batch size
            if src.size(0) != 1:
                raise ValueError(f"This implementation expects batch_size=1, got {src.size(0)}")
                
            # do a forward pass (no teacher forcing)
            logp= model(src, tgt, tf_ratio=0.0)
            preds = logp.argmax(dim=2)[0].tolist()  # [0] assumes batch_size=1

            # strip <sos>
            preds = preds[1:]
            # stop at first <eos> or pad
            if pad_idx in preds:
                preds = preds[: preds.index(pad_idx)]
            
            # likewise for the true target
            true = tgt[0].tolist()[1:]
            if pad_idx in true:
                true = true[: true.index(pad_idx)]

            # check exact-match accuracy
            if preds == true:
                correct += 1
            total += 1

            # convert tokens → characters and save
            pred_str = "".join(idx2char[idx] for idx in preds)
            with open(f"{out_dir}/pred_{i:04d}.txt", "w", encoding="utf-8") as f:
                f.write(pred_str)

    acc = 100.0 * correct / total if total else 0.0
    print(f"Test Accuracy: {acc:.2f}%")
    return acc


# ─── Usage Example ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    
    # Set device
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Set data path
    LEXICON_ROOT = "/kaggle/input/dakshina/dakshina_dataset_v1.0/hi/lexicons"  # replace with actual path
    
    # Prepare the data
    train_loader, val_loader, test_loader, src_size, tgt_size, pad_idx, src_vocab, tgt_vocab = get_dataloaders(
        LEXICON_ROOT, batch_size=64, build_vocab=True  # Note: batch_size=1 for evaluation
    )
    idx2char = {i: ch for ch, i in tgt_vocab.items()}

    # Initialize model
    model = Seq2SeqModel(
        src_vocab=len(src_vocab),
        tgt_vocab=len(tgt_vocab),
        emb_dim=64,
        hid_dim=256,
        enc_layers=2,
        dec_layers=2,
        rnn_type='LSTM',
        drop_p=0.4,
        bidir_enc=False
    ).to(DEVICE)
    
    # Load trained weights (uncomment after training)
    # model.load_state_dict(torch.load("best_model.pth"))
    
    # Evaluate the model and save predictions
    evaluate_and_save(model, test_loader, idx2char, pad_idx, DEVICE)

Test Eval:   0%|          | 0/4502 [00:00<?, ?it/s]

Test Accuracy: 0.00%
