# This notebook was executed on Kaggle:
> https://www.kaggle.com/code/hemanthssr1/roman-to-devanagari-seq2seq

# Roman-to-Devanagari Seq2Seq

This notebook implements a configurable character-level sequence-to-sequence model (PyTorch) to map romanized strings to Devanagari. It includes data loading from the `aksharantar_sampled` folder, preprocessing, a flexible Encoder/Decoder (RNN/LSTM/GRU), training loop, and analytic formulas for compute and parameter counts.

In [10]:
import os
import glob
import random
from collections import Counter
import math

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For reproducibility: set random seeds so results are consistent
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use GPU if available, otherwise fallback to CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device being used:', DEVICE)

# Preset hyperparameters for GPU vs CPU
if DEVICE.type == 'cuda':
    # GPU: bigger model, larger batch, longer sequence
    EMB_DIM = 128
    HIDDEN_DIM = 256
    NUM_LAYERS = 2
    BATCH_SIZE = 128
    MAX_LEN = 48
    RNN_TYPE = 'GRU'
else:
    # CPU: smaller model, smaller batch, shorter sequence
    EMB_DIM = 64
    HIDDEN_DIM = 128
    NUM_LAYERS = 1
    BATCH_SIZE = 32
    MAX_LEN = 32
    RNN_TYPE = 'LSTM'

print(f"Preset: emb_dim={EMB_DIM}, hidden_dim={HIDDEN_DIM}, num_layers={NUM_LAYERS}, batch_size={BATCH_SIZE}, max_len={MAX_LEN}, rnn_type={RNN_TYPE}")

Device being used: cuda
Preset: emb_dim=128, hidden_dim=256, num_layers=2, batch_size=128, max_len=48, rnn_type=GRU


In [11]:
# Load the Aksharantar dataset: we'll grab all *_train.csv files from each language folder
DATA_DIR = '/kaggle/input/aksharantar/aksharantar_sampled'
train_files = glob.glob(os.path.join(DATA_DIR, '*', '*_train.csv'))

pairs = []  # Each pair is (romanized, devanagari)
for f in train_files:
    df = pd.read_csv(f, header=None, names=['latin','dev'])
    # Remove any empty lines or NaNs
    df = df.dropna()
    for _, row in df.iterrows():
        latin = str(row['latin']).strip()
        dev = str(row['dev']).strip()
        if latin and dev:
            pairs.append((latin, dev))

print(f'Loaded {len(pairs)} training pairs from {len(train_files)} files')
# Let's peek at a few examples
for i in range(5):
    print(f'Example {i+1}: {pairs[i][0]} -> {pairs[i][1]}')


Loaded 911513 training pairs from 19 files
Example 1: mwnlwnga -> मोनलोङा
Example 2: ransargra -> रानसारग्रा
Example 3: baohordwngmwn -> बावहरदोंमोन
Example 4: riyel -> रियेल
Example 5: tamkonayari -> थामखनायारि


In [3]:
# Build character-level vocabularies and tokenizers for both scripts
# We'll use special tokens for start-of-sequence, end-of-sequence, and padding
SOS_token = '<s>'
EOS_token = '</s>'
PAD_token = '<pad>'

# Gather all unique characters in source (romanized) and target (devanagari)
src_chars = set()
tgt_chars = set()
for src, tgt in pairs:
    src_chars.update(list(src))
    tgt_chars.update(list(tgt))

src_vocab = [PAD_token, SOS_token, EOS_token] + sorted(src_chars)
tgt_vocab = [PAD_token, SOS_token, EOS_token] + sorted(tgt_chars)

src_char2idx = {c: i for i, c in enumerate(src_vocab)}
src_idx2char = {i: c for c, i in src_char2idx.items()}
tgt_char2idx = {c: i for i, c in enumerate(tgt_vocab)}
tgt_idx2char = {i: c for c, i in tgt_char2idx.items()}

print(f'Source vocab size: {len(src_vocab)}')
print(f'Target vocab size: {len(tgt_vocab)}')
print('First few source vocab chars:', src_vocab[:10])
print('First few target vocab chars:', tgt_vocab[:10])


Source vocab size: 29
Target vocab size: 681
First few source vocab chars: ['<pad>', '<s>', '</s>', 'a', 'b', 'c', 'd', 'e', 'f', 'g']
First few target vocab chars: ['<pad>', '<s>', '</s>', '،', 'ؐ', 'ء', 'آ', 'ؤ', 'ئ', 'ا']


In [4]:
# Let's wrap our data in a PyTorch Dataset for easy batching and training
class TransliterationDataset(Dataset):
    def __init__(self, pairs, src_char2idx, tgt_char2idx, max_len=32):
        self.pairs = pairs
        self.src_char2idx = src_char2idx
        self.tgt_char2idx = tgt_char2idx
        self.max_len = max_len
    def __len__(self):
        return len(self.pairs)
    def encode_seq(self, seq, char2idx):
        # Add SOS and EOS tokens, then pad to max_len
        idxs = [char2idx[SOS_token]] + [char2idx[c] for c in seq] + [char2idx[EOS_token]]
        if len(idxs) < self.max_len:
            idxs += [char2idx[PAD_token]] * (self.max_len - len(idxs))
        else:
            idxs = idxs[:self.max_len]
        return torch.tensor(idxs, dtype=torch.long)
    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        src_encoded = self.encode_seq(src, self.src_char2idx)
        tgt_encoded = self.encode_seq(tgt, self.tgt_char2idx)
        return src_encoded, tgt_encoded

# Example usage: create a dataset and a dataloader
train_dataset = TransliterationDataset(pairs, src_char2idx, tgt_char2idx, max_len=MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

src_batch, tgt_batch = next(iter(train_loader))
print('Source batch shape:', src_batch.shape)
print('Target batch shape:', tgt_batch.shape)


Source batch shape: torch.Size([128, 48])
Target batch shape: torch.Size([128, 48])


In [5]:
# Defining our flexible Seq2Seq model: you can change embedding size, hidden size, RNN type, and layers
class Seq2Seq(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, emb_dim=64, hidden_dim=128, rnn_type='LSTM', num_layers=1):
        super().__init__()
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.rnn_type = rnn_type
        self.num_layers = num_layers
        self.src_embedding = nn.Embedding(src_vocab_size, emb_dim, padding_idx=src_char2idx[PAD_token])
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, emb_dim, padding_idx=tgt_char2idx[PAD_token])
        rnn_cls = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}[rnn_type]
        self.encoder = rnn_cls(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = rnn_cls(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, tgt_vocab_size)
    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        src_emb = self.src_embedding(src)
        # Encoder: process the input sequence
        enc_outputs, enc_hidden = self.encoder(src_emb)
        # Decoder: generate output sequence one token at a time
        tgt_len = tgt.size(1)
        outputs = torch.zeros(batch_size, tgt_len, len(tgt_vocab)).to(src.device)
        input = tgt[:,0]  # Start with SOS token
        hidden = enc_hidden
        for t in range(1, tgt_len):
            input_emb = self.tgt_embedding(input).unsqueeze(1)  # (batch, 1, emb_dim)
            dec_output, hidden = self.decoder(input_emb, hidden)
            out_token = self.fc_out(dec_output.squeeze(1))
            outputs[:,t,:] = out_token
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = out_token.argmax(1)
            input = tgt[:,t] if teacher_force else top1
        return outputs

# Create the model (feel free to change hyperparameters)
model = Seq2Seq(
    src_vocab_size=len(src_vocab),
    tgt_vocab_size=len(tgt_vocab),
    emb_dim=EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    rnn_type=RNN_TYPE,
    num_layers=NUM_LAYERS
).to(DEVICE)
print(model)

Seq2Seq(
  (src_embedding): Embedding(29, 128, padding_idx=0)
  (tgt_embedding): Embedding(681, 128, padding_idx=0)
  (encoder): GRU(128, 256, num_layers=2, batch_first=True)
  (decoder): GRU(128, 256, num_layers=2, batch_first=True)
  (fc_out): Linear(in_features=256, out_features=681, bias=True)
)


In [6]:
# Let's train for a few batches to check everything works
loss_fn = nn.CrossEntropyLoss(ignore_index=tgt_char2idx[PAD_token])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model.train()
for batch_idx, (src_batch, tgt_batch) in enumerate(train_loader):
    src_batch = src_batch.to(DEVICE)
    tgt_batch = tgt_batch.to(DEVICE)
    optimizer.zero_grad()
    outputs = model(src_batch, tgt_batch, teacher_forcing_ratio=0.5)
    # outputs: (batch, tgt_len, tgt_vocab_size)
    # tgt_batch: (batch, tgt_len)
    loss = loss_fn(outputs.view(-1, len(tgt_vocab)), tgt_batch.view(-1))
    loss.backward()
    optimizer.step()
    if batch_idx % 10 == 0:
        print(f'Batch {batch_idx}, Loss: {loss.item():.4f}')
    if batch_idx == 1000:
        print('Concluding Training Process')
        break  # Tested with 20 batches on cpu locally, 1000 here on Kaggle as it has the P100 GPU


Batch 0, Loss: 6.5271
Batch 10, Loss: 5.4728
Batch 20, Loss: 5.3213
Batch 30, Loss: 5.2940
Batch 40, Loss: 5.2104
Batch 50, Loss: 5.1846
Batch 60, Loss: 5.0204
Batch 70, Loss: 5.1410
Batch 80, Loss: 4.7941
Batch 90, Loss: 4.7589
Batch 100, Loss: 4.4605
Batch 110, Loss: 4.5106
Batch 120, Loss: 4.2648
Batch 130, Loss: 3.9539
Batch 140, Loss: 4.2836
Batch 150, Loss: 4.2027
Batch 160, Loss: 3.8458
Batch 170, Loss: 3.8116
Batch 180, Loss: 4.0138
Batch 190, Loss: 3.9015
Batch 200, Loss: 3.5650
Batch 210, Loss: 3.6606
Batch 220, Loss: 3.7727
Batch 230, Loss: 3.6330
Batch 240, Loss: 4.4366
Batch 250, Loss: 3.4200
Batch 260, Loss: 3.5014
Batch 270, Loss: 3.9123
Batch 280, Loss: 3.3753
Batch 290, Loss: 3.3913
Batch 300, Loss: 3.3184
Batch 310, Loss: 3.1361
Batch 320, Loss: 3.3656
Batch 330, Loss: 3.5291
Batch 340, Loss: 3.4821
Batch 350, Loss: 3.3127
Batch 360, Loss: 3.1455
Batch 370, Loss: 3.0067
Batch 380, Loss: 3.7520
Batch 390, Loss: 2.9772
Batch 400, Loss: 2.9951
Batch 410, Loss: 2.9202
Bat

## Observations:

We can gradually notice a decrease in the loss function value, which indicates progress in the model training. Thus, this particular setup is quite effective.

## Analytic Formulas: Computations and Parameters

Let:
- $E$ = embedding size
- $H$ = hidden size
- $L$ = sequence length
- $V$ = vocabulary size (source/target)

**Total Parameters:**
- Embedding layers: $2 \times (V \times E)$
- Encoder RNN (LSTM): $4 \times [(E + H) \times H + H]$ (weights and biases)
- Decoder RNN (LSTM): $4 \times [(E + H) \times H + H]$
- Output FC: $H \times V + V$
- **Total:**
  $$P = 2VE + 8(E+H)H + 8H + HV + V$$

**Total Computations per Forward Pass:**
- For each time step in encoder and decoder (L steps each):
  - LSTM cell: $4 \times [(E + H) \times H]$ multiplies per step
- Output FC: $H \times V$ per output step
- **Total:**
  $$C = L \times [4(E+H)H]_{enc} + L \times [4(E+H)H + HV]_{dec}$$

Assume 1 layer each, $E=64$, $H=128$, $L=16$, $V=50$ for example.

We can change $E$, $H$, $L$, $V$ in the model and recompute these values.

The below python code answers the questions based on the usage of cpu or gpu(cuda).

In [7]:
# --- Analytic calculation of parameters and computations for current model settings ---

def calc_params_and_computations(E, H, L, V):
    # Total parameters
    embedding_params = 2 * V * E
    encoder_params = 4 * ((E + H) * H + H)
    decoder_params = 4 * ((E + H) * H + H)
    output_params = H * V + V
    total_params = embedding_params + encoder_params + decoder_params + output_params

    # Total computations per forward pass (batch size = 1)
    encoder_computations = L * 4 * (E + H) * H
    decoder_computations = L * 4 * (E + H) * H
    output_computations = L * H * V
    total_computations = encoder_computations + decoder_computations + output_computations

    return total_params, total_computations

E = EMB_DIM
H = HIDDEN_DIM
L = MAX_LEN
V = len(src_vocab)  # or len(tgt_vocab), they are the same

params, computations = calc_params_and_computations(E, H, L, V)

print("Model analytic summary for current settings:")
print(f"Embedding size (E): {E}")
print(f"Hidden size (H): {H}")
print(f"Sequence length (L): {L}")
print(f"Vocabulary size (V): {V}")
print(f"Total number of parameters: {params:,}")
print(f"Total number of computations per forward pass: {computations:,}")

# For reference, formulas used:
print("\nFormulas used:")
print("Parameters: 2VE + 8(E+H)H + 8H + HV + V")
print("Computations: 8L(E+H)H + LHV")

Model analytic summary for current settings:
Embedding size (E): 128
Hidden size (H): 256
Sequence length (L): 48
Vocabulary size (V): 29
Total number of parameters: 803,357
Total number of computations per forward pass: 38,105,088

Formulas used:
Parameters: 2VE + 8(E+H)H + 8H + HV + V
Computations: 8L(E+H)H + LHV
