In [166]:
import pandas as pd
data = pd.read_csv("D:\hindi_english_parallel.csv")
data.head(10)

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default
5,अवधि को हाइलाइट रकें,Highlight duration
6,पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्स...,The duration of the highlight box when selecti...
7,सीमांत (बोर्डर) के रंग को हाइलाइट करें,Highlight border color
8,हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता।,The color and opacity of the highlight border.
9,भराई के रंग को हाइलाइट करें,Highlight fill color


In [167]:
data.shape

(1561841, 2)

In [168]:
# Shuffle (optional but recommended for randomness)
data = data.sample(frac=1).reset_index(drop=True)

# Take first 10,000 rows
data = data.iloc[:30000]

# Save it for training
data.to_csv("subset_10k.csv", index=False)
print("data shape : ",data.shape)

data shape :  (30000, 2)


In [169]:
import re

def clean_hindi(text):
    # Handle non-string values (e.g., NaN, None, float)
    if not isinstance(text, str):
        return ""
    
    # Remove characters that are not Hindi or spaces
    text = re.sub(r"[^\u0900-\u097F\s]", "", text)
    
    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text)
    
    # Trim leading/trailing spaces
    return text.strip()


In [170]:
data["hindi"] = data["hindi"].apply(clean_hindi)


In [171]:
data.hindi.head(10)

0                 बेशक तू तो बड़ा इज्ज़त वाला सरदार है
1    आगामी विभिन्न आयोजनों प्रदर्शनियों तथा व्यापार...
2                                                     
3    तो यह सेकंड के हर घंटे के लिए छोटी इकाइयों सेक...
4                                        किशनगढ़ अजमेर
5                                            नामस्थानः
6    मेरे पिताजी के वसीयत दार के बारे में मुझे पता ...
7    समसामयिक साहित्य मनुष्य के अन्तर्मन और उसके वि...
8    यह भी प्राख्यान किया गया है कि बौद्धिक संपदा अ...
9    जैसा कि पीछे कहा जा चुका है उत्तराखंड में दो क...
Name: hindi, dtype: object

In [172]:
import re

def clean_english(text):
    if not isinstance(text, str):
        return ""
    
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation and special characters (except spaces)
    text = re.sub(r"[^\w\s]", "", text)
    
    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text)
    
    return text.strip()


In [173]:
data["english"] = data["english"].apply(clean_english)

In [174]:
data.english.head(10)

0         taste this you are a person mighty and noble
1    click on the links below to know about various...
2                       that s what the y intercept is
3    the smaller units are seconds so it s 3600 sec...
4                                           kishangarh
5                                            namespace
6               my father s legatee is not known to me
7    contemporary literature probes deeper into the...
8    it is also asserted that in another proceeding...
9    this area as mentioned earlier consists of two...
Name: english, dtype: object

In [175]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

Adding Special Tokens

In [176]:
train_data['english'] = train_data['english'].apply(lambda x: '<sos> ' + x + ' <eos>')
val_data['english'] = val_data['english'].apply(lambda x: '<sos> ' + x + ' <eos>')

In [177]:
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset

def yield_tokens(texts):
    for sent in texts:
        yield sent.split()

In [178]:
hindi_vocab = build_vocab_from_iterator(yield_tokens(train_data['hindi']), specials=["<pad>", "<sos>", "<eos>", "<unk>"])
english_vocab = build_vocab_from_iterator(yield_tokens(train_data['english']), specials=["<pad>", "<sos>", "<eos>", "<unk>"])

hindi_vocab.set_default_index(hindi_vocab["<unk>"])
english_vocab.set_default_index(english_vocab["<unk>"])

Numericalization


In [179]:
def numericalize(sentence, vocab):
    tokens = sentence.split()
    return [vocab["<sos>"]] + [vocab.get(token, vocab["<unk>"]) for token in tokens] + [vocab["<eos>"]]


Custom Dataset for English → Hindi


In [None]:
from torch.utils.data import Dataset
import torch

class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, src_vocab, tgt_vocab):
        self.source = source_sentences  # English
        self.target = target_sentences  # Hindi
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.source)
    def __getitem__(self, idx):
        src_sentence = str(self.source[idx])
        tgt_sentence = str(self.target[idx])
    
        src = numericalize(src_sentence, self.src_vocab)
        tgt = numericalize(tgt_sentence, self.tgt_vocab)
    
        return torch.tensor(src), torch.tensor(tgt)


Padding

In [181]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)

    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=english_vocab["<pad>"])
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=hindi_vocab["<pad>"])

    return src_batch, tgt_batch


DataLoader

In [182]:
from torch.utils.data import DataLoader

train_dataset = TranslationDataset(
    source_sentences=train_data['english'],
    target_sentences=train_data['hindi'],
    src_vocab=english_vocab,
    tgt_vocab=hindi_vocab
)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [183]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x141380cf430>

In [184]:
import torch.nn as nn

Encoder

In [185]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers, dropout):
        super(Encoder, self).__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src: [batch_size, src_len]
        embedded = self.dropout(self.embedding(src))  # [batch_size, src_len, emb_dim]
        outputs, (hidden, cell) = self.rnn(embedded)  # hidden/cell: [num_layers, batch_size, hidden_dim]
        return hidden, cell


Decoder

In [186]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers, dropout):
        super(Decoder, self).__init__()
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)
        
        embedded = self.dropout(self.embedding(input))
        
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        
        prediction = self.fc_out(output.squeeze(1))
        
        return prediction, hidden, cell


Define Seq2Seq model

In [187]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input = trg[:, t] if random.random() < teacher_forcing_ratio else top1

        return outputs


Initialize hyperparameters

In [188]:
INPUT_DIM = len(hindi_vocab)
OUTPUT_DIM = len(english_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

In [189]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Initialize model objects

In [190]:
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM, NUM_LAYERS, ENC_DROPOUT)
decoder = Encoder(OUTPUT_DIM,DEC_EMB_DIM,HIDDEN_DIM,NUM_LAYERS,DEC_DROPOUT)

model = Seq2Seq(encoder,decoder,device).to(device)

Define Loss Funtion and optimizer

In [191]:
import torch.optim as optim

PAD_IDX = english_vocab['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.001)


Define Training Function

In [192]:
def train(model, dataloader, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0

    for src, trg in dataloader:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        
        output = model(src, trg)  # output: [batch_size, trg_len, output_dim]
        output_dim = output.shape[-1]
        
        # reshape for loss: flatten trg and output
        output = output[:, 1:].reshape(-1, output_dim)  # skip <sos>
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


Define evaluation Function

In [193]:
def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, trg in dataloader:
            src = src.to(device)
            trg = trg.to(device)

            output = model(src, trg, 0)  # turn off teacher forcing
            output_dim = output.shape[-1]

            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


Training loop

In [194]:
N_EPOCHS = 10
CLIP = 1

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    val_loss = evaluate(model, train_loader, criterion)  # using train_loader as placeholder

    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {train_loss:.3f}")
    print(f"\t Val. Loss: {val_loss:.3f}")


AttributeError: 'Vocab' object has no attribute 'get'