In [18]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import re


In [19]:
# Load the parallel corpus, drop every row that has a missing value, and keep
# the first 100,000 remaining pairs. The path is a raw string so the Windows
# backslash stays literal: "\h" is not a valid escape sequence and newer
# Python versions warn about it.
data = pd.read_csv(r"D:\hindi_english_parallel.csv").dropna().head(100000)

def clean_hindi(text):
    """Normalise a Hindi sentence before tokenisation.

    Every character that is neither whitespace nor inside the Unicode range
    U+0900-U+097F is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw sentence. Any value that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned sentence, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        devanagari_only = re.sub(r"[^\u0900-\u097F\s]", "", text)
        single_spaced = re.sub(r"\s+", " ", devanagari_only)
        return single_spaced.strip()
    return ""

def clean_english(text):
    """Normalise an English sentence before tokenisation.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw sentence. Any value that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned sentence, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        without_punctuation = re.sub(r"[^\w\s]", "", lowered)
        return re.sub(r"\s+", " ", without_punctuation).strip()
    return ""

# Normalise the Hindi column with its cleaner.
data["hindi"] = data["hindi"].apply(clean_hindi)

# Clean each English sentence and wrap it in the <sos>/<eos> markers in a
# single pass over the column.
data["english"] = data["english"].apply(
    lambda sentence: "<sos> " + clean_english(sentence) + " <eos>"
)


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sentence pairs for validation; the fixed random_state
# keeps the split the same from run to run.
train_data, val_data = train_test_split(
    data,
    test_size=0.1,
    random_state=42,
)


In [21]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(texts):
    """Lazily produce the whitespace-separated tokens of each sentence.

    Args:
        texts: An iterable of sentence strings.

    Yields:
        One list of tokens per sentence, in input order.
    """
    yield from (sentence.split() for sentence in texts)

# Source-side (Hindi) vocabulary, built from the training split only.
# Tokens missing from it resolve to the <unk> index.
hindi_vocab = build_vocab_from_iterator(
    yield_tokens(train_data["hindi"]),
    specials=["<pad>", "<unk>"],
)
hindi_vocab.set_default_index(hindi_vocab["<unk>"])

# Target-side (English) vocabulary; it additionally reserves the sentence
# boundary markers <sos> and <eos>.
english_vocab = build_vocab_from_iterator(
    yield_tokens(train_data["english"]),
    specials=["<pad>", "<unk>", "<sos>", "<eos>"],
)
english_vocab.set_default_index(english_vocab["<unk>"])


In [22]:
class TranslationDataset(torch.utils.data.Dataset):
    """Map-style dataset of (Hindi, English) sentence pairs as index tensors.

    Args:
        data: DataFrame with string columns ``"hindi"`` and ``"english"``.
        src_vocab: Mapping from Hindi token to integer index.
        tgt_vocab: Mapping from English token to integer index.
    """

    def __init__(self, data, src_vocab, tgt_vocab):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        # Pull both columns out once. Going through ``DataFrame.iloc`` for
        # every item builds a row Series per lookup, which is needlessly slow
        # when every row is visited each epoch.
        self._src_sentences = data["hindi"].tolist()
        self._tgt_sentences = data["english"].tolist()

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self._src_sentences)

    def __getitem__(self, idx):
        """Return the pair at position ``idx`` as two 1-D int64 tensors.

        Sentences are split on whitespace and each token is looked up in the
        matching vocabulary. ``idx`` is positional, not a DataFrame label.
        """
        src_ids = [self.src_vocab[token] for token in self._src_sentences[idx].split()]
        tgt_ids = [self.tgt_vocab[token] for token in self._tgt_sentences[idx].split()]

        src_tensor = torch.tensor(src_ids, dtype=torch.long)
        tgt_tensor = torch.tensor(tgt_ids, dtype=torch.long)

        return src_tensor, tgt_tensor


In [23]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Pad a list of (src, tgt) index tensors into two batch-first tensors.

    Args:
        batch: Sequence of ``(src_tensor, tgt_tensor)`` pairs of 1-D tensors.

    Returns:
        ``(src_batch, tgt_batch)``. Each is padded to the longest sequence in
        the batch, using the ``<pad>`` index of the Hindi and English
        vocabulary respectively.
    """
    sources, targets = zip(*batch)
    src_pad = hindi_vocab["<pad>"]
    tgt_pad = english_vocab["<pad>"]
    return (
        pad_sequence(sources, batch_first=True, padding_value=src_pad),
        pad_sequence(targets, batch_first=True, padding_value=tgt_pad),
    )


In [24]:
# Both splits are indexed with the same pair of vocabularies.
train_dataset = TranslationDataset(data=train_data, src_vocab=hindi_vocab, tgt_vocab=english_vocab)
val_dataset = TranslationDataset(data=val_data, src_vocab=hindi_vocab, tgt_vocab=english_vocab)

# Only the training loader shuffles; validation keeps the row order.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)


In [25]:
class Encoder(nn.Module):
    """LSTM encoder that reduces a source sequence to its final LSTM state.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        emb_dim: Embedding width.
        hidden_dim: LSTM hidden-state width.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers. ``nn.LSTM``
            applies dropout only between layers, so it is not forwarded when
            ``n_layers`` is 1.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers=1, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # A non-zero dropout on a single-layer nn.LSTM has no effect and makes
        # PyTorch emit a UserWarning, so only pass it on for stacked LSTMs.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            emb_dim,
            hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )

    def forward(self, src):
        """Encode a batch of token indices.

        Args:
            src: Integer tensor of shape [batch, src_len].

        Returns:
            ``(hidden, cell)``: the LSTM's final hidden and cell states. The
            per-step outputs are discarded. Every position of ``src`` is fed
            through the LSTM; no masking or packing is applied here.
        """
        embedded = self.embedding(src)  # [batch, src_len, emb_dim]
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


In [26]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Embedding width.
        hidden_dim: LSTM hidden-state width.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers. ``nn.LSTM``
            applies dropout only between layers, so it is not forwarded when
            ``n_layers`` is 1.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers=1, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # A non-zero dropout on a single-layer nn.LSTM has no effect and makes
        # PyTorch emit a UserWarning, so only pass it on for stacked LSTMs.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            emb_dim,
            hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: 1-D integer tensor of shape [batch] holding the current
                token of every sequence; the time dimension is added here.
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` has shape
            [batch, output_dim] and the states are the updated LSTM states.
        """
        input = input.unsqueeze(1)  # [batch, 1]
        embedded = self.embedding(input)  # [batch, 1, emb_dim]
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))  # [batch, output_dim]
        return prediction, hidden, cell


In [27]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module mapping ``src`` to an initial ``(hidden, cell)`` state.
        decoder: Module with an ``fc`` output layer whose ``forward(input,
            hidden, cell)`` returns ``(scores, hidden, cell)``.
        device: Device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Score every target position after the first.

        Args:
            src: Source batch passed straight to the encoder.
            tgt: Target indices of shape [batch, tgt_len]; column 0 is used as
                the first decoder input.
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the decoder's own argmax as the next input.

        Returns:
            Tensor of shape [batch, tgt_len, vocab_size]. Position 0 is never
            written and stays all-zero.
        """
        n_sentences, n_steps = tgt.shape
        vocab_size = self.decoder.fc.out_features
        scores = torch.zeros(n_sentences, n_steps, vocab_size, device=self.device)

        hidden, cell = self.encoder(src)
        step_input = tgt[:, 0]  # <sos>

        for step in range(1, n_steps):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            scores[:, step] = step_scores
            if torch.rand(1).item() < teacher_forcing_ratio:
                step_input = tgt[:, step]
            else:
                step_input = step_scores.argmax(1)

        return scores


In [28]:
import torch

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU,
# and report which one was picked.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using device:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using device:", "CPU")


Using device: NVIDIA GeForce GTX 1650


In [29]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocabulary sizes fix the embedding and output-layer dimensions.
INPUT_DIM = len(hindi_vocab)
OUTPUT_DIM = len(english_vocab)

# Model hyperparameters shared by the encoder and the decoder.
EMB_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 1
DROPOUT = 0.3

encoder = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
)
decoder = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=EMB_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
)

# Build the full model and move all of its parameters to the chosen device.
model = Seq2Seq(encoder, decoder, device)
model = model.to(device)




In [30]:
# Target positions holding the <pad> index are excluded from the loss.
PAD_IDX = english_vocab["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Adam over every model parameter.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)


In [31]:
from tqdm import tqdm

def train(model, dataloader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(src, tgt)`` returning scores of shape
            [batch, tgt_len, output_dim].
        dataloader: Iterable of ``(src, tgt)`` batches; it must support
            ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss taking flattened scores and flattened target indices.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    progress = tqdm(dataloader, desc="Training", leave=False)

    for step, (src, tgt) in enumerate(progress, start=1):
        src = src.to(device)
        tgt = tgt.to(device)

        optimizer.zero_grad()
        scores = model(src, tgt)  # [batch, tgt_len, output_dim]

        # Drop position 0 on both sides (the <sos> column of the target) and
        # flatten so the criterion sees one row per predicted token.
        vocab_size = scores.shape[-1]
        flat_scores = scores[:, 1:].reshape(-1, vocab_size)
        flat_targets = tgt[:, 1:].reshape(-1)

        batch_loss = criterion(flat_scores, flat_targets)
        batch_loss.backward()
        optimizer.step()

        loss_value = batch_loss.item()
        running_loss += loss_value

        # Show the running batch number and its loss on the progress bar.
        progress.set_postfix(batch=step, loss=loss_value)

    return running_loss / len(dataloader)


In [32]:
# Ten passes over the training set, printing the mean batch loss of each one.
for epoch in range(10):
    train_loss = train(
        model=model,
        dataloader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f}")


                                                                                    

Epoch 1 | Train Loss: 3.8295


                                                                                     

Epoch 2 | Train Loss: 1.5059


                                                                                     

Epoch 3 | Train Loss: 0.8499


                                                                                     

Epoch 4 | Train Loss: 0.6275


                                                                                     

Epoch 5 | Train Loss: 0.5204


                                                                                      

Epoch 6 | Train Loss: 0.4582


                                                                                      

Epoch 7 | Train Loss: 0.4224


                                                                                      

Epoch 8 | Train Loss: 0.4027


                                                                                      

Epoch 9 | Train Loss: 0.3839


                                                                                      

Epoch 10 | Train Loss: 0.3648




In [None]:
def translate_sentence(sentence, model, hindi_vocab, english_vocab, max_len=50):
    """Greedily translate one Hindi sentence into English.

    Args:
        sentence: Source sentence. It is lower-cased and split on whitespace,
            so it should already be cleaned the same way as the text the
            vocabulary was built from.
        model: Module exposing ``encoder(src)`` -> ``(hidden, cell)`` and
            ``decoder(input, hidden, cell)`` -> ``(scores, hidden, cell)``,
            where ``input`` is a 1-D batch of token indices and ``scores`` has
            shape [batch, vocab_size].
        hindi_vocab: Source vocabulary; indexing by token returns its index.
        english_vocab: Target vocabulary; must contain ``<sos>`` and ``<eos>``
            and support ``lookup_tokens``.
        max_len: Upper bound on both the number of source tokens used and the
            number of target tokens generated.

    Returns:
        The predicted English tokens joined by single spaces, without the
        ``<eos>`` marker; ``""`` when the sentence contains no tokens.
    """
    model.eval()

    # Tokenize and truncate the Hindi sentence.
    tokens = sentence.lower().split()[:max_len]
    if not tokens:
        # Nothing to encode.
        return ""

    # Run on whatever device the model lives on instead of a global variable.
    model_device = next(model.parameters()).device

    # Shape [1, src_len]. A single sentence needs no padding, and padding would
    # push <pad> tokens through the encoder before its final state is read.
    input_ids = [hindi_vocab[token] for token in tokens]
    src_tensor = torch.tensor([input_ids], dtype=torch.long, device=model_device)

    sos_idx = english_vocab["<sos>"]
    eos_idx = english_vocab["<eos>"]

    predicted_ids = []
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)
        # The decoder adds the time dimension itself, so it is fed a 1-D
        # batch of shape [1], starting with <sos>.
        next_input = torch.tensor([sos_idx], dtype=torch.long, device=model_device)
        for _ in range(max_len):
            scores, hidden, cell = model.decoder(next_input, hidden, cell)  # [1, vocab_size]
            pred_token = scores.argmax(1).item()

            if pred_token == eos_idx:
                break
            predicted_ids.append(pred_token)

            # Feed the prediction back in as the next input.
            next_input = torch.tensor([pred_token], dtype=torch.long, device=model_device)

    # Convert predicted indices back to words.
    return " ".join(english_vocab.lookup_tokens(predicted_ids))


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

def evaluate_bleu(model, val_df, hindi_vocab, english_vocab, num_samples=100):
    """Average sentence-level BLEU over the first rows of ``val_df``.

    Each Hindi sentence is translated with ``translate_sentence`` and scored
    against its English reference using NLTK's ``sentence_bleu`` with
    ``SmoothingFunction().method4``. Every tenth example is printed.

    Args:
        model: Translation model passed through to ``translate_sentence``.
        val_df: DataFrame with ``"hindi"`` and ``"english"`` columns.
        hindi_vocab: Source vocabulary.
        english_vocab: Target vocabulary.
        num_samples: Maximum number of leading rows to evaluate.

    Returns:
        The mean of the per-sentence BLEU scores, or ``0.0`` when no row was
        evaluated.
    """
    smooth = SmoothingFunction().method4
    # Sentence boundary markers are training scaffolding, not part of the
    # reference translation.
    boundary_tokens = {"<sos>", "<eos>"}
    scores = []

    for i in range(min(num_samples, len(val_df))):
        row = val_df.iloc[i]
        src = row["hindi"]
        tgt = row["english"]

        # Leaving <sos>/<eos> in the reference would lengthen it with tokens
        # the prediction does not carry and distort the n-gram statistics.
        ref = [token for token in tgt.lower().split() if token not in boundary_tokens]
        pred = translate_sentence(src, model, hindi_vocab, english_vocab).split()

        bleu = sentence_bleu([ref], pred, smoothing_function=smooth)
        scores.append(bleu)

        # Show every 10th example
        if i % 10 == 0:
            print(f"[{i}]")
            print("Hindi:    ", src)
            print("Target:   ", tgt)
            print("Predicted:", ' '.join(pred))
            print("BLEU:     ", round(bleu, 4), "\n")

    if not scores:
        # Avoid np.mean([]) (NaN plus a RuntimeWarning) on empty input.
        print("\nNo samples were evaluated.")
        return 0.0

    avg_bleu = np.mean(scores)
    # Report the number of rows actually scored, which can be smaller than
    # num_samples when val_df is short.
    print(f"\n✅ Average BLEU score on {len(scores)} samples: {round(avg_bleu, 4)}")
    return avg_bleu


In [None]:
# Score the first 100 validation pairs.
avg_bleu = evaluate_bleu(
    model,
    val_data,
    hindi_vocab,
    english_vocab,
    num_samples=100,
)
