Question Number-1


In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import random

In [5]:
train_path = '/content/hi.translit.sampled.train.tsv'
test_path = '/content/hi.translit.sampled.test.tsv'

# Load data assuming TSV format with columns: target, input, count
train_df = pd.read_csv(train_path, sep='\t', header=None, names=['target', 'input', 'count'])
test_df = pd.read_csv(test_path, sep='\t', header=None, names=['target', 'input', 'count'])

# Optional: create validation split from training data
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [6]:
def build_vocab(sequences):
    # Filter out any non-string values before building the vocabulary
    sequences = [seq for seq in sequences if isinstance(seq, str)]
    chars = set(char for seq in sequences for char in seq)
    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
    vocab.update({char: i+3 for i, char in enumerate(sorted(chars))})
    return vocab

input_vocab = build_vocab(train_df['input'])
target_vocab = build_vocab(train_df['target'])

inv_target_vocab = {v: k for k, v in target_vocab.items()}

In [7]:
class TransliterationDataset(Dataset):
    def __init__(self, data, input_vocab, target_vocab):
        self.data = data
        self.input_vocab = input_vocab
        self.target_vocab = target_vocab

    def encode_seq(self, seq, vocab, add_sos_eos=False):
        # Convert seq to string if it's not already
        if not isinstance(seq, str):
            seq = str(seq)
        ids = [vocab[char] for char in seq]
        if add_sos_eos:
            ids = [vocab['<sos>']] + ids + [vocab['<eos>']]
        return torch.tensor(ids)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        # Ensure input is a string by converting it if necessary
        input_seq = self.encode_seq(str(row['input']), self.input_vocab)
        target_seq = self.encode_seq(str(row['target']), self.target_vocab, add_sos_eos=True)
        return input_seq, target_seq

In [8]:
def collate_fn(batch):
    input_seqs, target_seqs = zip(*batch)
    input_lens = [len(seq) for seq in input_seqs]
    target_lens = [len(seq) for seq in target_seqs]
    input_pad = nn.utils.rnn.pad_sequence(input_seqs, batch_first=True, padding_value=0)
    target_pad = nn.utils.rnn.pad_sequence(target_seqs, batch_first=True, padding_value=0)
    return input_pad, target_pad, input_lens, target_lens

train_ds = TransliterationDataset(train_df, input_vocab, target_vocab)
val_ds = TransliterationDataset(val_df, input_vocab, target_vocab)
test_ds = TransliterationDataset(test_df, input_vocab, target_vocab)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)

In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, input_dim, target_dim, emb_dim, hidden_dim, rnn_type='GRU', num_layers=1):
        super().__init__()
        self.encoder_embed = nn.Embedding(input_dim, emb_dim)
        self.decoder_embed = nn.Embedding(target_dim, emb_dim)

        rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[rnn_type]
        self.encoder = rnn_cls(emb_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = rnn_cls(emb_dim, hidden_dim, num_layers, batch_first=True)

        self.fc_out = nn.Linear(hidden_dim, target_dim)
        self.rnn_type = rnn_type
        self.hidden_dim = hidden_dim

    def forward(self, src, tgt):
        # Encoder
        src_emb = self.encoder_embed(src)
        _, hidden = self.encoder(src_emb)

        # Decoder
        tgt_emb = self.decoder_embed(tgt[:, :-1])
        outputs, _ = self.decoder(tgt_emb, hidden)
        logits = self.fc_out(outputs)
        return logits

In [10]:
def train(model, loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for inputs, targets, _, _ in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        output = model(inputs, targets)
        loss = loss_fn(output.view(-1, output.size(-1)), targets[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

In [11]:
def evaluate(model, loader, loss_fn, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets, _, _ in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            output = model(inputs, targets)
            loss = loss_fn(output.view(-1, output.size(-1)), targets[:, 1:].contiguous().view(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

In [12]:
def predict(model, dataset, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for input_seq, _ in dataset:
            input_seq = input_seq.unsqueeze(0).to(device)
            embedded = model.encoder_embed(input_seq)
            _, hidden = model.encoder(embedded)

            decoder_input = torch.tensor([[target_vocab['<sos>']]], device=device)
            output_seq = []
            for _ in range(30):  # max length
                emb = model.decoder_embed(decoder_input)
                out, hidden = model.decoder(emb, hidden)
                logits = model.fc_out(out.squeeze(1))
                pred_token = logits.argmax(-1).item()
                if pred_token == target_vocab['<eos>']:
                    break
                output_seq.append(inv_target_vocab[pred_token])
                decoder_input = torch.tensor([[pred_token]], device=device)
            predictions.append("".join(output_seq))
    return predictions

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2Seq(
    input_dim=len(input_vocab),
    target_dim=len(target_vocab),
    emb_dim=64,
    hidden_dim=128,
    rnn_type='GRU',
    num_layers=1
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

# Training loop
for epoch in range(1, 21):
    train_loss = train(model, train_loader, optimizer, loss_fn, device)
    val_loss = evaluate(model, val_loader, loss_fn, device)
    print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

# Predict on test set
test_predictions = predict(model, test_ds, device)
for i in range(20):
    print(f"Latin: {test_df.iloc[i]['input']} → Predicted Devanagari: {test_predictions[i]} | Actual: {test_df.iloc[i]['target']}")

Epoch 1: Train Loss = 2.2732, Val Loss = 1.5129
Epoch 2: Train Loss = 1.1726, Val Loss = 0.9525
Epoch 3: Train Loss = 0.8499, Val Loss = 0.7766
Epoch 4: Train Loss = 0.7058, Val Loss = 0.6769
Epoch 5: Train Loss = 0.6185, Val Loss = 0.6289
Epoch 6: Train Loss = 0.5589, Val Loss = 0.5780
Epoch 7: Train Loss = 0.5123, Val Loss = 0.5589
Epoch 8: Train Loss = 0.4771, Val Loss = 0.5399
Epoch 9: Train Loss = 0.4480, Val Loss = 0.5199
Epoch 10: Train Loss = 0.4225, Val Loss = 0.5058
Epoch 11: Train Loss = 0.4008, Val Loss = 0.4941
Epoch 12: Train Loss = 0.3819, Val Loss = 0.4852
Epoch 13: Train Loss = 0.3641, Val Loss = 0.4883
Epoch 14: Train Loss = 0.3503, Val Loss = 0.4728
Epoch 15: Train Loss = 0.3365, Val Loss = 0.4721
Epoch 16: Train Loss = 0.3234, Val Loss = 0.4708
Epoch 17: Train Loss = 0.3109, Val Loss = 0.4643
Epoch 18: Train Loss = 0.2992, Val Loss = 0.4666
Epoch 19: Train Loss = 0.2906, Val Loss = 0.4686
Epoch 20: Train Loss = 0.2812, Val Loss = 0.4647
Latin: ank → Predicted Devana

In [16]:
# =================== Accuracy ====================
def compute_accuracy(predictions, targets):
    correct = 0
    total = len(predictions)
    for pred, actual in zip(predictions, targets):
        if pred.strip() == actual.strip():
            correct += 1
    return correct / total

# Get actual Devanagari targets
actual_targets = list(test_df['target'])

# Calculate accuracy
accuracy = compute_accuracy(test_predictions, actual_targets)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 23.92%
