In [1]:
import csv
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import wandb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# ==============================
# DATA LOADING & ANALYSIS
# ==============================
def data_load_tsv(path):
    """
    Load data from TSV files with source-target pairs
    """
    df = pd.read_csv(
        path,
        sep='\t',
        header=None,
        dtype=str,
        quoting=csv.QUOTE_NONE
    )
    df = df.dropna(subset=[0,1])
    return df[0].tolist(), df[1].tolist()

# create char set from multiple lists
def create_char_set(*datasets):
    char_set = set()
    for data in datasets:
        for word in data:
            char_set.update(word)
    return char_set

# ----------
# Replace these paths with your own .tsv file locations
train_input, train_output = data_load_tsv(
    "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv"
)
val_input, val_output = data_load_tsv(
    "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv"
)
test_input, test_output = data_load_tsv(
    "/kaggle/input/dakshina-dataset/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv"
)
# ----------

# Print sizes
print(f"Number of training samples:   {len(train_input)}")
print(f"Number of validation samples: {len(val_input)}")
print(f"Number of test samples:       {len(test_input)}")

# Build character sets
src_chars = create_char_set(train_input, val_input, test_input)
tgt_chars = create_char_set(train_output, val_output, test_output)

print("\nSource Character Set:")
print(f"Total characters: {len(src_chars)}")
print(sorted(src_chars))

print("\nTarget Character Set:")
print(f"Total characters: {len(tgt_chars)}")
print(sorted(tgt_chars))

# Max seq lengths including <sow> and <eow>
max_seq_src = max(len(w) for w in train_input + val_input + test_input) + 2
max_seq_tgt = max(len(w) for w in train_output + val_output + test_output) + 2
print(f"\nMax source seq length (with tokens): {max_seq_src}")
print(f"Max target seq length (with tokens): {max_seq_tgt}")

# ==============================
# INDEX MAPPINGS
# ==============================
special_tokens = {'<pad>': 0, '<sow>': 1, '<eow>': 2}

src2idx = {ch: i+3 for i, ch in enumerate(sorted(src_chars))}
src2idx.update(special_tokens)
print("\nSource Indices:")
print(src2idx)

idx2src = {i: ch for ch, i in src2idx.items()}

tgt2idx = {ch: i+3 for i, ch in enumerate(sorted(tgt_chars))}
tgt2idx.update(special_tokens)
print("\nTarget Indices:")
print(tgt2idx)

idx2tgt = {i: ch for ch, i in tgt2idx.items()}

SRC_VOCAB = len(src2idx)
TGT_VOCAB = len(tgt2idx)

# embedding dims (tunable)
SRC_EMB_DIM = 64
TGT_EMB_DIM = 64

# ==============================
# PREPROCESSING
# ==============================
def tsv_preprocessor(data, max_len, vocab):
    processed = []
    for w in data:
        seq = ['<sow>'] + list(w) + ['<eow>']
        seq += ['<pad>'] * (max_len - len(seq))
        indices = [vocab.get(c, vocab['<pad>']) for c in seq]
        processed.append(torch.LongTensor(indices))
    return torch.stack(processed)

train_src = tsv_preprocessor(train_input, max_seq_src, src2idx)
train_tgt = tsv_preprocessor(train_output, max_seq_tgt, tgt2idx)
val_src   = tsv_preprocessor(val_input,   max_seq_src, src2idx)
val_tgt   = tsv_preprocessor(val_output,  max_seq_tgt, tgt2idx)
test_src  = tsv_preprocessor(test_input,  max_seq_src, src2idx)
test_tgt  = tsv_preprocessor(test_output, max_seq_tgt, tgt2idx)

# ==============================
# DATASET & DATALOADER
# ==============================
class TSVDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = src
        self.tgt = tgt
    def __len__(self):
        return len(self.src)
    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]

# custom collate to pad along seq dim

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_padded = nn.utils.rnn.pad_sequence(src_batch, batch_first=False, padding_value=special_tokens['<pad>'])
    tgt_padded = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=False, padding_value=special_tokens['<pad>'])
    return src_padded, tgt_padded

BATCH_SIZE = 256
train_loader = DataLoader(
    TSVDataset(train_src, train_tgt),
    batch_size=BATCH_SIZE, shuffle=True,
    collate_fn=collate_fn
)
val_loader = DataLoader(
    TSVDataset(val_src, val_tgt),
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn
)
test_loader = DataLoader(
    TSVDataset(test_src, test_tgt),
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn
)

Number of training samples:   94543
Number of validation samples: 9279
Number of test samples:       9228

Source Character Set:
Total characters: 60
['ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ', '়', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ', '২']

Target Character Set:
Total characters: 26
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

Max source seq length (with tokens): 24
Max target seq length (with tokens): 24

Source Indices:
{'ঁ': 3, 'ং': 4, 'ঃ': 5, 'অ': 6, 'আ': 7, 'ই': 8, 'ঈ': 9, 'উ': 10, 'ঊ': 11, 'ঋ': 12, 'এ': 13, 'ঐ': 14, 'ও': 15, 'ঔ': 16, 'ক': 17, 'খ': 18, 'গ': 19, 'ঘ': 20, 'ঙ': 21, 'চ': 22, 'ছ': 23, 'জ': 24, 'ঝ': 25, 'ঞ': 26, 'ট': 27, 'ঠ': 28, 'ড': 29, 'ঢ': 30, 'ণ': 31, 'ত': 32, 'থ': 33, 'দ'

In [30]:
# ENCODER CLASS
# ==============================
class Encoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        emb_dim: int,
        hid_dim: int,
        rnn_type: str = 'gru',
        num_layers: int = 1,
        dropout: float = 0.0,
        bidir: bool = False
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        rnn_cls = nn.LSTM if rnn_type.lower() == 'lstm' else nn.GRU
        self.rnn = rnn_cls(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidir,
            batch_first=True
        )

    def forward(
        self,
        x: torch.Tensor,
        hidden: torch.Tensor = None,
        cell: torch.Tensor = None
    ):
        emb = self.embedding(x)  # (batch, seq_len, emb_dim)
        if isinstance(self.rnn, nn.LSTM):
            if hidden is None or cell is None:
                out, (h, c) = self.rnn(emb)
            else:
                out, (h, c) = self.rnn(emb, (hidden, cell))
            return out, h, c
        else:
            if hidden is None:
                out, h = self.rnn(emb)
            else:
                out, h = self.rnn(emb, hidden)
            return out, h, None


In [31]:
# ==============================
# DECODER CLASS
# ==============================
class Decoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        emb_dim: int,
        hid_dim: int,
        rnn_type: str = 'gru',
        num_layers: int = 1,
        dropout: float = 0.0,
        bidir: bool = False,
        use_attention: bool = False
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        rnn_cls = nn.LSTM if rnn_type.lower() == 'lstm' else nn.GRU
        self.rnn = rnn_cls(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidir,
            batch_first=True
        )
        self.out = nn.Linear(hid_dim, vocab_size)
        self.use_attention = use_attention

    def forward(
        self,
        x: torch.Tensor,
        prev_hidden: torch.Tensor,
        prev_cell: torch.Tensor = None,
        encoder_outputs: torch.Tensor = None
    ):
        emb = self.embedding(x)  # (batch, 1, emb_dim)
        if isinstance(self.rnn, nn.LSTM):
            if prev_hidden is None or prev_cell is None:
                dec_out, (h, c) = self.rnn(emb)
            else:
                dec_out, (h, c) = self.rnn(emb, (prev_hidden, prev_cell))
        else:
            if prev_hidden is None:
                dec_out, h = self.rnn(emb)
                c = None
            else:
                dec_out, h = self.rnn(emb, prev_hidden)
                c = None
        logits = self.out(dec_out)
        return logits, h, c

In [32]:
# ==============================
# SEQ2SEQ CLASS
# ==============================
class Seq2Seq(nn.Module):
    def __init__(
        self,
        encoder: Encoder,
        decoder: Decoder,
        max_tgt_len: int,
        teacher_force_rate: float = 0.5
    ):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.max_tgt_len = max_tgt_len
        self.teacher_force_rate = teacher_force_rate

    def forward(
        self,
        src: torch.Tensor,
        tgt: torch.Tensor = None,
        teacher_forcing: bool = True,
        training: bool = True
    ):
        # Encode source sequence
        enc_outputs, h_enc, c_enc = self.encoder(src)

        # Align hidden/cell to decoder's layers
        rnn = self.decoder.rnn
        dirs = 2 if rnn.bidirectional else 1
        exp_layers = rnn.num_layers * dirs
        L_enc, B, H = h_enc.size()
        if L_enc != exp_layers:
            if L_enc > exp_layers:
                h = h_enc[:exp_layers]
                c = c_enc[:exp_layers] if c_enc is not None else None
            else:
                pad = exp_layers - L_enc
                h = torch.cat([h_enc, h_enc.new_zeros(pad, B, H)], 0)
                if c_enc is not None:
                    c = torch.cat([c_enc, c_enc.new_zeros(pad, B, H)], 0)
                else:
                    c = None
        else:
            h, c = h_enc, c_enc

        # Prepare initial decoder input (<sow>=1)
        dec_in = torch.full((B,1), 1, dtype=torch.long, device=src.device)
        outputs = torch.zeros(self.max_tgt_len, B, self.decoder.out.out_features, device=src.device)

        for t in range(self.max_tgt_len):
            logits, h, c = self.decoder(dec_in, h, c, enc_outputs)
            outputs[t] = logits.squeeze(1)
            if training and teacher_forcing and random.random() < self.teacher_force_rate:
                dec_in = tgt[:,t].unsqueeze(1)
            else:
                dec_in = logits.argmax(dim=2)

        return outputs, enc_outputs


In [33]:
import wandb
import numpy as np
from types import SimpleNamespace
import random

In [34]:
wandb.login(key='1df7a902fa4a610500b8e79e21818419d5facdbb')#

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [35]:
# ================================================
# SWEEP CONFIGURATION (Bayesian, no attention)
# ================================================
sweep_config = {
    'method': 'bayes',
    'name' : 'sweep - no attention',
    'metric': {
      'goal': 'maximize',
      'name': 'validation_accuracy'
    },
    'parameters':{
        'input_embedding_size': {'values': [64, 128]},
        'enc_layers':            {'values': [1, 2, 3]},
        'dec_layers':            {'values': [1, 2, 3]},
        'hidden_size':           {'values': [64, 128, 256]},
        'cell_type':             {'values': ['lstm', 'rnn', 'gru']},
        'bidirectional':         {'values': [True]},
        'dropout':               {'values': [0.1, 0.2, 0.3]},
        'beam_size':             {'values': [1, 3, 5]}
    }
}

sweep_id = wandb.sweep(
    sweep    = sweep_config,
    entity   = "ma23m018-indian-institute-of-technology-madras",
    project  = "MA23M018_assignment3"
)

# ================================================
# AGENT ENTRYPOINT
# ================================================
def main():
    with wandb.init():
        cfg = wandb.config
        # give this run a descriptive name:
        wandb.run.name = (
            f"cell-{cfg.cell_type}_hid-{cfg.hidden_size}"
            f"_emb-{cfg.input_embedding_size}"
            f"_enc-{cfg.enc_layers}_dec-{cfg.dec_layers}"
            f"_drop{cfg.dropout}_beam{cfg.beam_size}"
        )

        # build encoder & decoder from sweep params
        encoder = Encoder(
            vocab_size    = SRC_VOCAB,
            emb_dim       = cfg.input_embedding_size,
            hid_dim       = cfg.hidden_size,
            rnn_type      = cfg.cell_type,
            num_layers    = cfg.enc_layers,
            dropout       = cfg.dropout,
            bidir         = cfg.bidirectional
        )

        decoder = Decoder(
            vocab_size    = TGT_VOCAB,
            emb_dim       = cfg.input_embedding_size,
            hid_dim       = cfg.hidden_size,
            rnn_type      = cfg.cell_type,
            num_layers    = cfg.dec_layers,
            dropout       = cfg.dropout,
            bidir         = cfg.bidirectional,
            use_attention = False
        )

        model = Seq2Seq(
            encoder                = encoder,
            decoder                = decoder,
            max_tgt_len            = max_len_tgt,
            teacher_force_rate     = 0.5
        ).to(device)

        # train & log validation acc
        train1(model, train_loader, val_loader, epochs=15)

# launch 50 sweep jobs
wandb.agent(sweep_id, function=main, count=2)
wandb.finish()

Create sweep with ID: j2kvd7d6
Sweep URL: https://wandb.ai/ma23m018-indian-institute-of-technology-madras/MA23M018_assignment3/sweeps/j2kvd7d6


[34m[1mwandb[0m: Agent Starting Run: 4tv4yhjg with config:
[34m[1mwandb[0m: 	beam_size: 3
[34m[1mwandb[0m: 	bidirectional: True
[34m[1mwandb[0m: 	cell_type: lstm
[34m[1mwandb[0m: 	dec_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	enc_layers: 2
[34m[1mwandb[0m: 	hidden_size: 256
[34m[1mwandb[0m: 	input_embedding_size: 64


Epochs:   0%|          | 0/15 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/tmp/ipykernel_35/3525132711.py", line 73, in main
    train1(model, train_loader, val_loader, epochs=15)
  File "/tmp/ipykernel_35/908659430.py", line 49, in train1
    preds, _ = model(src_batch, tgt_batch, teacher_forcing=True, training=True)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/1047456389.py", line 52, in forward
    logits, h, c = self.decoder(dec_in, h, c, enc_outputs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib

[34m[1mwandb[0m: [32m[41mERROR[0m Run 4tv4yhjg errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/3525132711.py", line 73, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     train1(model, train_loader, val_loader, epochs=15)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/908659430.py", line 49, in train1
[34m[1mwandb[0m: [32m[41mERROR[0m     preds, _ = model(src_batch, tgt_batch, teacher_forcing=True, training=True)
[34m[1mwandb[0m: [32m[41mERROR[0m                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _

Epochs:   0%|          | 0/15 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/tmp/ipykernel_35/3525132711.py", line 73, in main
    train1(model, train_loader, val_loader, epochs=15)
  File "/tmp/ipykernel_35/908659430.py", line 49, in train1
    preds, _ = model(src_batch, tgt_batch, teacher_forcing=True, training=True)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/1047456389.py", line 52, in forward
    logits, h, c = self.decoder(dec_in, h, c, enc_outputs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib

[34m[1mwandb[0m: [32m[41mERROR[0m Run c9oldf36 errored:
[34m[1mwandb[0m: [32m[41mERROR[0m Traceback (most recent call last):
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/wandb/agents/pyagent.py", line 306, in _run_job
[34m[1mwandb[0m: [32m[41mERROR[0m     self._function()
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/3525132711.py", line 73, in main
[34m[1mwandb[0m: [32m[41mERROR[0m     train1(model, train_loader, val_loader, epochs=15)
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/tmp/ipykernel_35/908659430.py", line 49, in train1
[34m[1mwandb[0m: [32m[41mERROR[0m     preds, _ = model(src_batch, tgt_batch, teacher_forcing=True, training=True)
[34m[1mwandb[0m: [32m[41mERROR[0m                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[34m[1mwandb[0m: [32m[41mERROR[0m   File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _