<a href="https://colab.research.google.com/github/shruti0731/MiniProject2/blob/main/MP2Own_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

uploaded = files.upload()  # upload both your .sa and .en files


Saving gitasopanam.en to gitasopanam.en
Saving gitasopanam.sa to gitasopanam.sa


In [2]:
# Replace with the actual filenames you upload
with open("gitasopanam.sa", "r", encoding="utf-8") as f:
    sanskrit_lines = f.readlines()

with open("gitasopanam.en", "r", encoding="utf-8") as f:
    english_lines = f.readlines()


In [3]:
# 1. Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from typing import List

In [4]:
# 2. Parameters
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32
EMBED_SIZE = 256
NUM_HEADS = 8
NUM_ENCODER_LAYERS = 4
NUM_DECODER_LAYERS = 4
FFN_HIDDEN = 512
MAX_LEN = 100
EPOCHS = 20

In [5]:
# 3. Tokenization and Vocab
class Vocab:
    def __init__(self, texts, min_freq=1):
        self.pad_token = "<pad>"
        self.sos_token = "<sos>"
        self.eos_token = "<eos>"
        self.unk_token = "<unk>"

        tokens = [word for line in texts for word in line.strip().split()]
        counter = Counter(tokens)
        self.itos = [self.pad_token, self.sos_token, self.eos_token, self.unk_token] + [w for w, c in counter.items() if c >= min_freq]
        self.stoi = {w: i for i, w in enumerate(self.itos)}

    def encode(self, text):
        return [self.stoi.get(w, self.stoi[self.unk_token]) for w in text.strip().split()] + [self.stoi[self.eos_token]]

    def decode(self, tokens):
        words = [self.itos[token] for token in tokens if token != self.stoi[self.eos_token]]
        return " ".join(words)

    def __len__(self):
        return len(self.itos)

In [6]:
# 4. Dataset Preparation
class TranslationDataset(Dataset):
    def __init__(self, src_lines, tgt_lines, src_vocab, tgt_vocab, max_len=MAX_LEN):
        self.data = list(zip(src_lines, tgt_lines))
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src, tgt = self.data[idx]
        src_ids = self.src_vocab.encode(src)[:self.max_len]
        tgt_ids = [self.tgt_vocab.stoi[self.tgt_vocab.sos_token]] + self.tgt_vocab.encode(tgt)[:self.max_len]
        return torch.tensor(src_ids), torch.tensor(tgt_ids)


In [7]:
# 5. Collate Function
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_pad = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=src_vocab.stoi[src_vocab.pad_token])
    tgt_pad = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=tgt_vocab.stoi[tgt_vocab.pad_token])
    return src_pad.to(DEVICE), tgt_pad.to(DEVICE)


In [8]:
# Build vocabs
src_vocab = Vocab(sanskrit_lines)
tgt_vocab = Vocab(english_lines)


In [9]:
# Create dataset and dataloader
dataset = TranslationDataset(sanskrit_lines, english_lines, src_vocab, tgt_vocab)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [10]:
#  6. Transformer Model Definition
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size):
        super().__init__()
        self.src_tok_emb = nn.Embedding(src_vocab_size, EMBED_SIZE)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, EMBED_SIZE)
        self.positional_encoding = nn.Parameter(torch.rand(MAX_LEN, EMBED_SIZE))

        self.transformer = nn.Transformer(
            d_model=EMBED_SIZE,
            nhead=NUM_HEADS,
            num_encoder_layers=NUM_ENCODER_LAYERS,
            num_decoder_layers=NUM_DECODER_LAYERS,
            dim_feedforward=FFN_HIDDEN,
            dropout=0.1,
            batch_first=True
        )

        self.fc_out = nn.Linear(EMBED_SIZE, tgt_vocab_size)

    def forward(self, src, tgt):
        src_mask = None
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(DEVICE)

        src_emb = self.src_tok_emb(src) + self.positional_encoding[:src.size(1), :]
        tgt_emb = self.tgt_tok_emb(tgt) + self.positional_encoding[:tgt.size(1), :]

        output = self.transformer(src_emb, tgt_emb, src_mask=src_mask, tgt_mask=tgt_mask)
        return self.fc_out(output)

In [11]:
# Instantiate model
model = Transformer(len(src_vocab), len(tgt_vocab)).to(DEVICE)


In [12]:
# 7. Training Setup
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.stoi[tgt_vocab.pad_token])


In [35]:
# 8. Training Loop
# def train():
#     model.train()
#     for epoch in range(EPOCHS):
#         total_loss = 0
#         for src, tgt in dataloader:
#             tgt_input = tgt[:, :-1]
#             tgt_output = tgt[:, 1:]

#             output = model(src, tgt_input)
#             output = output.reshape(-1, output.shape[-1])
#             tgt_output = tgt_output.reshape(-1)

#             loss = criterion(output, tgt_output)
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#             total_loss += loss.item()
#         print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")


# train and acc
import torch.nn as nn

# Define the loss function
loss_fn = nn.CrossEntropyLoss(ignore_index=tgt_vocab.stoi[tgt_vocab.pad_token])

def train():
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        correct_tokens = 0
        total_tokens = 0

        for src, tgt in dataloader:
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            optimizer.zero_grad()
            output = model(src, tgt_input)
            loss = loss_fn(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # Accuracy calculation
            preds = output.argmax(dim=-1)
            mask = tgt_output != tgt_vocab.stoi[tgt_vocab.pad_token]
            correct_tokens += (preds == tgt_output).masked_select(mask).sum().item()
            total_tokens += mask.sum().item()

        train_accuracy = correct_tokens / total_tokens
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}, Train Accuracy: {train_accuracy * 100:.2f}%")



In [36]:
# 9. Inference
def translate(sentence):
    model.eval()
    with torch.no_grad():
        src = torch.tensor(src_vocab.encode(sentence)).unsqueeze(0).to(DEVICE)
        tgt = torch.tensor([tgt_vocab.stoi[tgt_vocab.sos_token]]).unsqueeze(0).to(DEVICE)

        for _ in range(MAX_LEN):
            output = model(src, tgt)
            next_token = output[:, -1, :].argmax(dim=-1).unsqueeze(1)
            tgt = torch.cat((tgt, next_token), dim=1)
            if next_token.item() == tgt_vocab.stoi[tgt_vocab.eos_token]:
                break

        return tgt_vocab.decode(tgt.squeeze().tolist()[1:])


In [37]:
# 10. Run Training
train()


Epoch 1, Loss: 1.6665, Train Accuracy: 68.19%
Epoch 2, Loss: 1.5574, Train Accuracy: 70.21%
Epoch 3, Loss: 1.4481, Train Accuracy: 72.44%
Epoch 4, Loss: 1.3463, Train Accuracy: 74.63%
Epoch 5, Loss: 1.2457, Train Accuracy: 76.52%
Epoch 6, Loss: 1.1566, Train Accuracy: 78.49%
Epoch 7, Loss: 1.0622, Train Accuracy: 80.60%
Epoch 8, Loss: 0.9726, Train Accuracy: 82.71%
Epoch 9, Loss: 0.8853, Train Accuracy: 84.71%
Epoch 10, Loss: 0.8125, Train Accuracy: 86.16%
Epoch 11, Loss: 0.7349, Train Accuracy: 88.09%
Epoch 12, Loss: 0.6650, Train Accuracy: 89.47%
Epoch 13, Loss: 0.6003, Train Accuracy: 90.95%
Epoch 14, Loss: 0.5436, Train Accuracy: 92.17%
Epoch 15, Loss: 0.4868, Train Accuracy: 93.23%
Epoch 16, Loss: 0.4357, Train Accuracy: 94.18%
Epoch 17, Loss: 0.3935, Train Accuracy: 94.80%
Epoch 18, Loss: 0.3584, Train Accuracy: 95.50%
Epoch 19, Loss: 0.3147, Train Accuracy: 96.19%
Epoch 20, Loss: 0.2826, Train Accuracy: 96.93%


In [38]:
def compute_accuracy(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for src, tgt in dataloader:
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            output = model(src, tgt_input)
            preds = output.argmax(dim=-1)

            mask = tgt_output != tgt_vocab.stoi[tgt_vocab.pad_token]
            correct += (preds == tgt_output).masked_select(mask).sum().item()
            total += mask.sum().item()

    acc = correct / total
    print(f"Token-level Accuracy: {acc:.4f}")


In [39]:
compute_accuracy(model, dataloader)

Token-level Accuracy: 0.9953


In [40]:
def compute_sequence_accuracy(model, src_lines, tgt_lines, max_samples=100):
    model.eval()
    correct = 0
    total = min(len(src_lines), max_samples)

    for i in range(total):
        pred = translate(src_lines[i]).strip().lower()
        ref = tgt_lines[i].strip().lower()
        if pred == ref:
            correct += 1

    acc = correct / total
    print(f"Sequence-Level Accuracy: {acc * 100:.2f}% on {total} samples")


In [41]:
compute_sequence_accuracy(model, sanskrit_lines, english_lines)


Sequence-Level Accuracy: 89.00% on 100 samples


In [42]:
# ✅ Save the trained model
torch.save(model.state_dict(), "sanskrit_translator.pth")

# ✅ Later, or in a new session, reload like this:
model.load_state_dict(torch.load("sanskrit_translator.pth"))
model.eval()


Transformer(
  (src_tok_emb): Embedding(6517, 256)
  (tgt_tok_emb): Embedding(5661, 256)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-3)

In [43]:
print(translate("पिता कार्यालयात् गृहम् आगच्छति ।"))
print(translate("अहं गच्छामि ।"))

Father comes home from office.
I am a devotee.


In [45]:
print(translate("कर्मण्येवाधिकारस्ते मा फलेषु कदाचन।"))
print(translate("धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः"))

In which river there is giving other two studying to you .
Have fondness in you .
