In [3]:
from google.colab import drive
import os
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount('/content/drive')

# Directory on Drive that the rest of the notebook reads its training files
# from and writes its tokenizer files to. Create it only when it is missing.
data_folder = "/content/drive/MyDrive/data"
if not os.path.isdir(data_folder):
    os.makedirs(data_folder, exist_ok=True)


Mounted at /content/drive


In [13]:
!pip install torch torchvision torchaudio sentencepiece




In [14]:
data_folder = "/content/drive/MyDrive/data"


def read_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` without line endings."""
    with open(path, encoding="utf-8") as f:
        return f.read().splitlines()


# English sentences and their Kannada translations; line i of one file is
# paired with line i of the other when the dataset is built.
english_sentences = read_lines(f"{data_folder}/en.train.fixed.txt")
kannada_sentences = read_lines(f"{data_folder}/kn.train.fixed.txt")

# The pairing is purely positional, so a different number of lines means the
# sentence pairs no longer match up. Fail here instead of training on
# mismatched pairs.
if len(english_sentences) != len(kannada_sentences):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(english_sentences)} English lines "
        f"vs {len(kannada_sentences)} Kannada lines"
    )

print("Loaded dataset:")
print("Total sentences:", len(english_sentences))
if english_sentences:  # an empty corpus has no sample to show
    print("Sample English:", english_sentences[0])
    print("Sample Kannada:", kannada_sentences[0])


Loaded dataset:
Total sentences: 2432
Sample English: Birth: 15-09-1931 Uru: Managuli village of Basavan Bavewadi Taluk in Bijapur District. Education: BA Honors degree. 25-04-1955: Gives Spiritual Repentance. 
Sample Kannada: ಜನನ : 15-09-1931 ಉರು: ಬಿಜಾಪುರ ಜಿಲ್ಲೆಯ ಬಸವನ ಬಾಗೇವಾಡಿ ತಾಲ್ಲೋಕಿನ ಮನುಗೂಳಿ ಗ್ರಾಮ. ವಿದ್ಯಾಭ್ಯಾಸ: ಬಿ.ಎ. ಆನರ್್ಸ ಪದವಿ. 


In [15]:
import sentencepiece as spm
import os

data_folder = "/content/drive/MyDrive/data"
VOCAB_SIZE = 8000  # SentencePiece vocabulary size used for both languages

# Re-read the corpus so this cell also works when it is run on its own.
with open(f"{data_folder}/en.train.fixed.txt", encoding="utf-8") as f:
    english_sentences = f.read().splitlines()

with open(f"{data_folder}/kn.train.fixed.txt", encoding="utf-8") as f:
    kannada_sentences = f.read().splitlines()


def load_or_train_tokenizer(sentences, lang, vocab_size=VOCAB_SIZE):
    """Return the SentencePiece processor for ``lang``, training it if needed.

    The model is stored as ``{data_folder}/spm_{lang}.model``. Training is
    skipped when that file already exists, so re-running the cell is cheap.
    Delete the file to force retraining (for example after the training data
    or ``vocab_size`` changed).

    Args:
        sentences: list of training sentences, one string per sentence.
        lang: short language tag used in the file names (e.g. ``"en"``).
        vocab_size: vocabulary size passed to the SentencePiece trainer.

    Returns:
        A ``spm.SentencePieceProcessor`` loaded from the model file.
    """
    model_prefix = f"{data_folder}/spm_{lang}"
    model_path = f"{model_prefix}.model"

    if not os.path.exists(model_path):
        # The trainer reads its input from a file, one sentence per line.
        corpus_path = f"{data_folder}/corpus.{lang}"
        with open(corpus_path, "w", encoding="utf-8") as f:
            f.write("\n".join(sentences))
        spm.SentencePieceTrainer.train(
            input=corpus_path, model_prefix=model_prefix, vocab_size=vocab_size
        )

    return spm.SentencePieceProcessor(model_file=model_path)


sp_en = load_or_train_tokenizer(english_sentences, "en")
sp_kn = load_or_train_tokenizer(kannada_sentences, "kn")

print("Tokenizers ready!")


Tokenizers ready!


In [16]:
import torch.nn as nn

class Encoder(nn.Module):
    """Bidirectional GRU encoder for batch-first sequences of source token ids.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        emb_dim: embedding dimension.
        hid_dim: GRU hidden size per direction.
        n_layers: number of stacked GRU layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Projects the concatenated forward/backward state down to hid_dim.
        self.fc = nn.Linear(in_features=2 * hid_dim, out_features=hid_dim)

    def forward(self, src):
        """Encode ``src`` and build the initial decoder state.

        Args:
            src: token ids, [batch, src_len].

        Returns:
            outputs: GRU outputs for every position, [batch, src_len, hid_dim*2].
            hidden: tanh(fc(...)) of the last two slices of the GRU's final
                hidden state concatenated together, [1, batch, hid_dim].
        """
        token_vectors = self.embedding(src)
        outputs, final_states = self.rnn(token_vectors)

        # The last two slices of the final state are merged into one vector
        # per sequence so the (unidirectional) decoder can start from it.
        merged = torch.cat([final_states[-2], final_states[-1]], dim=1)
        bridged = torch.tanh(self.fc(merged))
        return outputs, bridged.unsqueeze(0)


In [17]:
class LuongAttention(nn.Module):
    """Attention weights over encoder positions for one decoder step.

    The score of a position is obtained by concatenating the decoder state
    with that position's encoder output, applying a linear layer followed by
    tanh, and summing the resulting hid_dim features. Scores are normalised
    with a softmax over the source positions.

    Args:
        hid_dim: decoder hidden size; encoder outputs are expected to have
            twice this many features.
    """

    def __init__(self, hid_dim):
        super().__init__()
        # Input: decoder state (hid_dim) + one encoder output (hid_dim*2).
        self.attn = nn.Linear(3 * hid_dim, hid_dim)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape [batch, src_len].

        Args:
            hidden: decoder state, [1, batch, hid_dim].
            encoder_outputs: [batch, src_len, hid_dim*2].
        """
        src_len = encoder_outputs.size(1)

        # Copy the decoder state along the time axis so it lines up with
        # every encoder position: [batch, src_len, hid_dim].
        query = hidden.transpose(0, 1).repeat(1, src_len, 1)

        features = torch.cat([query, encoder_outputs], dim=2)
        energy = torch.tanh(self.attn(features))  # [batch, src_len, hid_dim]
        scores = energy.sum(dim=2)                # [batch, src_len]
        return torch.softmax(scores, dim=1)


In [18]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding dimension.
        hid_dim: GRU hidden size; encoder outputs carry hid_dim*2 features.
        attention: module called as ``attention(hidden, encoder_outputs)``
            that returns weights of shape [batch, src_len].
        n_layers: number of stacked GRU layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, attention, n_layers=1):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The GRU sees the token embedding together with the attention context.
        self.rnn = nn.GRU(emb_dim + 2 * hid_dim, hid_dim, n_layers, batch_first=True)
        # The output layer sees GRU output, attention context and embedding.
        self.fc_out = nn.Linear(hid_dim + 2 * hid_dim + emb_dim, output_dim)

    def forward(self, input, hidden, encoder_outputs):
        """Predict the next target token for every sequence in the batch.

        Args:
            input: previous target token ids, [batch].
            hidden: current decoder state, [1, batch, hid_dim].
            encoder_outputs: [batch, src_len, hid_dim*2].

        Returns:
            prediction: unnormalised scores over the vocabulary, [batch, output_dim].
            hidden: updated decoder state.
        """
        token_emb = self.embedding(input.unsqueeze(1))  # [batch, 1, emb_dim]

        # Weighted sum of encoder outputs: [batch, 1, hid_dim*2].
        attn_weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        context = torch.bmm(attn_weights, encoder_outputs)

        rnn_out, hidden = self.rnn(torch.cat([token_emb, context], dim=2), hidden)

        # Drop the length-1 time axis before the final projection.
        features = torch.cat(
            [rnn_out.squeeze(1), context.squeeze(1), token_emb.squeeze(1)], dim=1
        )
        return self.fc_out(features), hidden


In [19]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder into one trainable model.

    Args:
        encoder: module returning ``(encoder_outputs, hidden)`` for a source batch.
        decoder: module with an ``output_dim`` attribute, called once per
            target position as ``decoder(token, hidden, encoder_outputs)``.
        device: device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` position by position, optionally with teacher forcing.

        Args:
            src: source token ids, [batch, src_len].
            trg: target token ids, [batch, trg_len]; column 0 is fed to the
                decoder as the first input token.
            teacher_forcing_ratio: probability, drawn anew at every step, of
                feeding the ground-truth token instead of the model's own
                prediction as the next input.

        Returns:
            Scores of shape [batch, trg_len, vocab]. Position 0 is never
            predicted and stays all zeros.
        """
        batch_size, trg_len = trg.shape[:2]
        logits = torch.zeros(
            batch_size, trg_len, self.decoder.output_dim, device=self.device
        )

        encoder_outputs, hidden = self.encoder(src)
        prev_token = trg[:, 0]

        for step in range(1, trg_len):
            step_logits, hidden = self.decoder(prev_token, hidden, encoder_outputs)
            logits[:, step] = step_logits

            # One coin flip per step decides the next input for the whole batch.
            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            if use_teacher:
                prev_token = trg[:, step]
            else:
                prev_token = step_logits.argmax(1)

        return logits


In [25]:
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils.rnn as rnn_utils

# Custom dataset class
class ParallelDataset(Dataset):
    """Parallel corpus that yields (source ids, target ids) tensor pairs.

    Every sentence is tokenised on access and wrapped as
    ``[BOS] + ids + [EOS]``. Sentences that are too long are cut *before* the
    markers are added, so every example still ends with EOS and the model can
    learn where a sentence stops.

    Args:
        src_sentences: list of source sentences.
        trg_sentences: list of target sentences, aligned with ``src_sentences``.
        sp_src: source tokenizer providing ``EncodeAsIds``, ``bos_id`` and ``eos_id``.
        sp_trg: target tokenizer with the same interface.
        max_len: maximum length of an example including BOS and EOS. BOS and
            EOS are always kept, so examples have at least two tokens.

    Raises:
        ValueError: if the two sentence lists differ in length.
    """

    def __init__(self, src_sentences, trg_sentences, sp_src, sp_trg, max_len=50):
        if len(src_sentences) != len(trg_sentences):
            raise ValueError(
                f"source and target must have the same number of sentences, "
                f"got {len(src_sentences)} and {len(trg_sentences)}"
            )
        self.src_sentences = src_sentences
        self.trg_sentences = trg_sentences
        self.sp_src = sp_src
        self.sp_trg = sp_trg
        self.max_len = max_len

    def __len__(self):
        return len(self.src_sentences)

    def _encode(self, tokenizer, sentence):
        """Tokenise ``sentence`` and return ``[BOS] + ids + [EOS]`` as a tensor."""
        # Reserve two slots for BOS/EOS so truncation never drops the EOS.
        budget = max(self.max_len - 2, 0)
        ids = tokenizer.EncodeAsIds(sentence)[:budget]
        return torch.tensor([tokenizer.bos_id()] + ids + [tokenizer.eos_id()])

    def __getitem__(self, idx):
        src = self._encode(self.sp_src, self.src_sentences[idx])
        trg = self._encode(self.sp_trg, self.trg_sentences[idx])
        return src, trg

# Pair every English sentence with its Kannada translation.
dataset = ParallelDataset(
    src_sentences=english_sentences,
    trg_sentences=kannada_sentences,
    sp_src=sp_en,
    sp_trg=sp_kn,
)


def identity_collate(batch):
    """Return the sampled (src, trg) pairs as-is; the training loop pads them."""
    return batch


# NOTE: BATCH_SIZE is defined in the hyperparameter cell further down, which
# has to be executed before this one.
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=identity_collate,
)

print("✅ DataLoader is ready!")


✅ DataLoader is ready!


In [26]:
# NOTE: HID_DIM, EMB_DIM, INPUT_DIM, OUTPUT_DIM and DEVICE are defined in the
# hyperparameter cell further down, which has to be executed before this one.

# Seed PyTorch's RNG so that weight initialisation, and the batch shuffling and
# teacher-forcing draws that follow, start from the same state on every run.
SEED = 42
torch.manual_seed(SEED)

# Attention
attn = LuongAttention(HID_DIM)

# Encoder & Decoder (attn becomes a sub-module of dec, so .to(DEVICE) moves it too)
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM).to(DEVICE)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, attn).to(DEVICE)

# Seq2Seq
model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

# Optimizer & Loss
optimizer = torch.optim.Adam(model.parameters())
# Target positions holding index 0 (the value the training loop pads with) do
# not contribute to the loss.
# NOTE(review): SentencePiece's default <unk> id is also 0, so unknown target
# tokens would be ignored as well - confirm, and consider a dedicated pad id.
criterion = nn.CrossEntropyLoss(ignore_index=0)  # ignore padding


In [27]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Vocabulary sizes are read from the loaded SentencePiece models.
INPUT_DIM, OUTPUT_DIM = len(sp_en), len(sp_kn)  # English (source), Kannada (target)

# Model width: embedding size and GRU hidden size.
EMB_DIM = HID_DIM = 256

# Training schedule.
EPOCHS, BATCH_SIZE = 20, 32


In [28]:
import torch.nn.utils.rnn as rnn_utils

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0

    for pairs in dataloader:
        # A batch arrives as a list of (src, trg) tensors of different
        # lengths; pad each side with 0 up to the longest sequence.
        src_seqs, trg_seqs = zip(*pairs)
        src = rnn_utils.pad_sequence(src_seqs, batch_first=True).to(DEVICE)
        trg = rnn_utils.pad_sequence(trg_seqs, batch_first=True).to(DEVICE)

        optimizer.zero_grad()
        logits = model(src, trg)  # [batch, trg_len, vocab_size]

        # Position 0 is the <sos> input and is never predicted, so both the
        # scores and the targets are taken from position 1 onwards and then
        # flattened into one row per token for the loss.
        vocab_size = logits.size(-1)
        flat_logits = logits[:, 1:, :].reshape(-1, vocab_size)
        flat_targets = trg[:, 1:].reshape(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {epoch_loss/len(dataloader):.4f}")


Epoch 1/20, Loss: 7.0969
Epoch 2/20, Loss: 5.9858
Epoch 3/20, Loss: 5.4051
Epoch 4/20, Loss: 4.8945
Epoch 5/20, Loss: 4.3425
Epoch 6/20, Loss: 3.8701
Epoch 7/20, Loss: 3.4001
Epoch 8/20, Loss: 3.0456
Epoch 9/20, Loss: 2.7187
Epoch 10/20, Loss: 2.4415
Epoch 11/20, Loss: 2.2458
Epoch 12/20, Loss: 2.0095
Epoch 13/20, Loss: 1.8493
Epoch 14/20, Loss: 1.6637
Epoch 15/20, Loss: 1.5116
Epoch 16/20, Loss: 1.3641
Epoch 17/20, Loss: 1.1976
Epoch 18/20, Loss: 1.1083
Epoch 19/20, Loss: 0.9724
Epoch 20/20, Loss: 0.8254


In [29]:
def translate_sentence(sentence, model, sp_en, sp_kn, max_len=50):
    """Greedily translate one English sentence to Kannada.

    Args:
        sentence: English input text.
        model: trained model exposing ``encoder`` and ``decoder`` sub-modules.
        sp_en: source tokenizer (``encode``, ``bos_id``, ``eos_id``).
        sp_kn: target tokenizer (``decode``, ``bos_id``, ``eos_id``).
        max_len: maximum number of target tokens to generate.

    Returns:
        The decoded Kannada string (empty if EOS is predicted first).
    """
    model.eval()
    # Run on whatever device the model's weights live on instead of relying
    # on a notebook-level global.
    device = next(model.parameters()).device

    # Ask the tokenizers for their special ids, as the training dataset does,
    # rather than hard-coding them.
    src_ids = [sp_en.bos_id()] + sp_en.encode(sentence, out_type=int) + [sp_en.eos_id()]
    src_tensor = torch.tensor(src_ids, device=device).unsqueeze(0)  # [1, src_len]
    eos_id = sp_kn.eos_id()

    predicted_ids = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)
        input_tok = torch.tensor([sp_kn.bos_id()], device=device)  # <sos>

        for _ in range(max_len):
            scores, hidden = model.decoder(input_tok, hidden, encoder_outputs)
            pred_token = scores.argmax(1).item()
            if pred_token == eos_id:  # <eos>
                break
            predicted_ids.append(pred_token)
            input_tok = torch.tensor([pred_token], device=device)

    return sp_kn.decode(predicted_ids)


In [None]:
# Simple interactive demo: keep translating until the user types "quit".
while True:
    # Strip surrounding whitespace so that e.g. "quit " still ends the loop.
    english_text = input("Enter English sentence (or 'quit' to stop): ").strip()
    if english_text.lower() == "quit":
        break
    if not english_text:
        # Nothing to translate; prompt again instead of decoding an empty source.
        continue
    kannada_translation = translate_sentence(english_text, model, sp_en, sp_kn)
    print("Kannada:", kannada_translation)


Enter English sentence (or 'quit' to stop): Birth: 15-09-1931 Uru: Managuli village of Basavan Bavewadi Taluk in Bijapur District. Education: BA Honors degree. 25-04-1955: Gives Spiritual Repentance.
Kannada: ಜನನ : ಬಿ ಜನನ ಜಿಲ್ಲೆಯ ಬಸವನ ಬಾಗೇವಾಡಿ ತಾಲ್ಲೋಕಿನ ಮನುಗೂಳಿ ಗ್ರಾಮ. ವಿದ್ಯಾಭ್ಯಾಸ: ಬಿಎ. ಆನರ್್ಸ ಪದವಿರ್.ಸ ಪದವಿ. ಪದವಿರ್ ಪ್ಯಾರ್ಸ ಪದವಿ. ಪದವಿ.ಸ ಪದವಿ.ಸ ಪದವಿ. ಪದವಿ.ಸ ಪದವಿ
