In [1]:
import torch
import torch.nn as nn
import numpy as np

# Hyperparameters (deliberately tiny: the dataset below has only nine sentence pairs)
d_model = 16  # Embedding / hidden width (reduced from 64)
n_heads = 2   # Attention heads (reduced from 4); per-head width is d_model // n_heads
d_ff = 32     # Hidden width of the position-wise feed-forward network (reduced from 128)
max_seq_len = 10  # Number of rows in the positional-encoding table
vocab_size_en = 12  # Entries in the English vocabulary dict
vocab_size_ka = 12  # Entries in the Kannada vocabulary dict
learning_rate = 0.001  # Initial Adam lr; the scheduler overwrites it on every scheduler.step()
warmup_steps = 400  # Steps of linear lr warm-up before inverse-sqrt decay
epochs = 1000  # Full passes over the dataset
label_smoothing = 0.1  # Probability mass moved from the gold token to the other classes

# Scaled Dot-Product Attention
class ScaledDotProductAttention(nn.Module):
    """Compute softmax(Q K^T / sqrt(d_k)) V, with optional masking.

    ``forward`` returns a tuple ``(output, attention_weights)``. Positions
    where ``mask == 0`` receive a score of -1e9 before the softmax, so they
    get (numerically) zero attention weight.
    """

    def __init__(self):
        super().__init__()

    def forward(self, Q, K, V, mask=None):
        # d_k is read from the last axis of Q.
        scale = torch.sqrt(torch.tensor(Q.size(-1), dtype=torch.float32))
        logits = torch.matmul(Q, K.transpose(-2, -1)) / scale
        if mask is not None:
            logits = logits.masked_fill(mask == 0, -1e9)
        weights = torch.softmax(logits, dim=-1)
        return torch.matmul(weights, V), weights

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head attention with learned Q/K/V/output projections.

    Args:
        d_model: Width of the input and output features.
        n_heads: Number of attention heads; must be positive and divide
            ``d_model`` evenly (each head gets ``d_model // n_heads`` features).

    Raises:
        ValueError: If ``n_heads`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        # Without this check a non-divisible d_model makes the head-splitting
        # view() either fail with an opaque shape error or, for some sequence
        # lengths, silently mix features across positions.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.attention = ScaledDotProductAttention()
        # Xavier initialization (weights only; biases keep nn.Linear's default)
        nn.init.xavier_uniform_(self.W_q.weight)
        nn.init.xavier_uniform_(self.W_k.weight)
        nn.init.xavier_uniform_(self.W_v.weight)
        nn.init.xavier_uniform_(self.W_o.weight)

    def _split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, n_heads, seq, d_k)."""
        return x.view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, merge heads and project out.

        ``mask`` is passed unchanged to the attention module; entries equal
        to 0 are masked out there. Returns a tensor of shape
        (batch, query_len, d_model).
        """
        batch_size = Q.size(0)
        Q = self._split_heads(self.W_q(Q), batch_size)
        K = self._split_heads(self.W_k(K), batch_size)
        V = self._split_heads(self.W_v(V), batch_size)
        # The per-head attention weights are not needed by callers.
        output, _ = self.attention(Q, K, V, mask)
        # Undo the head split: (batch, n_heads, seq, d_k) -> (batch, seq, d_model)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)

# Feed-Forward Network
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model).

    Both weight matrices are re-initialised with Xavier-uniform; biases keep
    the ``nn.Linear`` defaults.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        # Initialise in declaration order so RNG consumption is unchanged.
        for layer in (self.linear1, self.linear2):
            nn.init.xavier_uniform_(layer.weight)
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden)

# Positional Encoding
def positional_encoding(max_seq_len, d_model):
    """Build the sinusoidal positional-encoding table.

    Even columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)``, where ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        max_seq_len: Number of positions (rows).
        d_model: Encoding width (columns); may be odd.

    Returns:
        A float32 tensor of shape ``(max_seq_len, d_model)``.
    """
    pe = torch.zeros(max_seq_len, d_model)
    position = torch.arange(0, max_seq_len, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-np.log(10000.0) / d_model))
    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd column than even columns, so
    # trim the angles to fit; for even d_model the slice is a no-op.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return pe

# Layer Normalization
class LayerNorm(nn.Module):
    """Normalise the last axis, then apply a learned scale and shift.

    Uses ``Tensor.std`` with its default settings and adds ``epsilon`` to the
    standard deviation (not to the variance) before dividing.
    """

    def __init__(self, d_model, epsilon=1e-6):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))   # learned scale
        self.beta = nn.Parameter(torch.zeros(d_model))   # learned shift
        self.epsilon = epsilon

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.epsilon
        return self.gamma * centered / spread + self.beta

# Transformer Encoder Layer
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then feed-forward.

    Each sub-layer is wrapped as ``norm(x + sublayer(x))``, i.e. the
    normalisation is applied after the residual addition.
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, n_heads)
        self.ffn = FeedForward(d_model, d_ff)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

    def forward(self, x, mask=None):
        # Self-attention: queries, keys and values all come from x.
        attended = self.norm1(x + self.mha(x, x, x, mask))
        return self.norm2(attended + self.ffn(attended))

# Transformer Decoder Layer
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each sub-layer is wrapped as ``norm(x + sublayer(x))``.
    """

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, n_heads)  # self-attention over the target
        self.mha2 = MultiHeadAttention(d_model, n_heads)  # attention over the encoder output
        self.ffn = FeedForward(d_model, d_ff)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        # tgt_mask restricts the self-attention; src_mask restricts which
        # encoder positions the decoder may read.
        x = self.norm1(x + self.mha1(x, x, x, tgt_mask))
        x = self.norm2(x + self.mha2(x, enc_output, enc_output, src_mask))
        return self.norm3(x + self.ffn(x))

# Transformer Model
class Transformer(nn.Module):
    """Single-layer encoder-decoder Transformer.

    Args:
        d_model: Embedding / hidden width.
        n_heads: Attention heads per attention module.
        d_ff: Hidden width of the feed-forward sub-layers.
        max_seq_len: Number of positions in the positional-encoding table;
            inputs longer than this cannot be encoded.
        vocab_size_en: Size of the SOURCE-side embedding table (``embedding_en``
            is applied to ``src``).
        vocab_size_ka: Size of the TARGET-side embedding table and of the
            output projection (``embedding_ka`` is applied to ``tgt``).
    """

    def __init__(self, d_model, n_heads, d_ff, max_seq_len, vocab_size_en, vocab_size_ka):
        super().__init__()
        self.encoder = EncoderLayer(d_model, n_heads, d_ff)
        self.decoder = DecoderLayer(d_model, n_heads, d_ff)
        # Registered as a non-persistent buffer so it follows the module
        # through .to()/.cuda()/.half() instead of being copied to the input
        # device on every forward call, while staying out of state_dict so
        # existing checkpoints still load.
        self.register_buffer("pos_enc", positional_encoding(max_seq_len, d_model), persistent=False)
        self.embedding_en = nn.Embedding(vocab_size_en, d_model)
        self.embedding_ka = nn.Embedding(vocab_size_ka, d_model)
        self.output_layer = nn.Linear(d_model, vocab_size_ka)
        nn.init.xavier_uniform_(self.embedding_en.weight)
        nn.init.xavier_uniform_(self.embedding_ka.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Return target-vocabulary logits of shape (batch, tgt_len, vocab_size_ka).

        ``src`` and ``tgt`` are integer token-id tensors of shape (batch, seq_len).
        ``src_mask`` is used for encoder self-attention and for the decoder's
        attention over the encoder output; ``tgt_mask`` is used for decoder
        self-attention.
        """
        src_emb = self.embedding_en(src) + self.pos_enc[:src.size(1), :]
        tgt_emb = self.embedding_ka(tgt) + self.pos_enc[:tgt.size(1), :]
        enc_output = self.encoder(src_emb, src_mask)
        dec_output = self.decoder(tgt_emb, enc_output, src_mask, tgt_mask)
        return self.output_layer(dec_output)

# Create Masks
def create_padding_mask(seq, pad_id=0):
    """Return an integer mask that is 1 for real tokens and 0 for ``pad_id``.

    Two singleton axes are inserted after the first axis, so a (batch, seq_len)
    input yields a (batch, 1, 1, seq_len) mask that broadcasts over heads and
    query positions.
    """
    is_token = seq != pad_id
    return is_token[:, None, None].long()

def create_causal_mask(seq_len):
    """Return a (1, 1, seq_len, seq_len) lower-triangular mask of ones.

    Row i has ones in columns 0..i, so position i may attend only to itself
    and earlier positions.
    """
    lower_triangle = torch.ones(seq_len, seq_len).tril()
    return lower_triangle[None, None]

# Synthetic Dataset
# Token ids follow list order: <pad>=0, <sos>=1, <eos>=2, then the words.
vocab_en = {
    word: idx
    for idx, word in enumerate(
        ["<pad>", "<sos>", "<eos>", "i", "go", "to", "school", "read", "book", "we", "eat", "food"]
    )
}
vocab_ka = {
    word: idx
    for idx, word in enumerate(
        ["<pad>", "<sos>", "<eos>", "naanu", "hoguttene", "ge", "shale", "oduttene", "pustaka", "naavu", "tinnuttene", "ahara"]
    )
}

# Each example is (Kannada source ids, English target ids), both wrapped in
# <sos> ... <eos>. The ids are looked up from the sentences so the pairs stay
# readable.
data = [
    (
        [vocab_ka[w] for w in ["<sos>", *ka.split(), "<eos>"]],
        [vocab_en[w] for w in ["<sos>", *en.split(), "<eos>"]],
    )
    for ka, en in [
        ("naanu ge shale hoguttene", "i go to school"),
        ("naanu pustaka oduttene", "i read book"),
        ("naavu ahara tinnuttene", "we eat food"),
        ("naanu ahara tinnuttene", "i eat food"),
        ("naavu ge shale hoguttene", "we go to school"),
        ("naanu ge shale oduttene", "i read to school"),
        ("naavu pustaka oduttene", "we read book"),
        ("naanu pustaka hoguttene", "i go book"),
        ("naavu ge shale oduttene", "we read to school"),
    ]
]


# Cross-Entropy Loss with Label Smoothing
class CrossEntropyLossWithSmoothing(nn.Module):
    """Label-smoothed cross-entropy averaged over non-padding tokens.

    The gold class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is spread evenly over the other ``vocab_size - 1``
    classes. Positions whose target equals ``pad_id`` contribute nothing.

    Args:
        vocab_size: Number of classes (size of the last axis of the logits).
        smoothing: Probability mass taken from the gold class.
        pad_id: Target id treated as padding and excluded from the loss.
    """

    def __init__(self, vocab_size, smoothing=0.1, pad_id=0):
        super().__init__()
        self.vocab_size = vocab_size
        self.smoothing = smoothing
        self.pad_id = pad_id

    def forward(self, output, target):
        """Return the scalar loss.

        ``output`` holds logits whose last axis has size ``vocab_size``;
        ``target`` holds integer class ids with one entry per logit row.
        """
        # reshape (not view) so non-contiguous inputs are accepted too.
        output = output.reshape(-1, self.vocab_size)
        target = target.reshape(-1)
        log_probs = torch.log_softmax(output, dim=-1)
        confidence = 1 - self.smoothing
        label_smoothed = torch.full_like(log_probs, self.smoothing / (self.vocab_size - 1))
        label_smoothed.scatter_(1, target.unsqueeze(1), confidence)
        is_pad = target == self.pad_id
        label_smoothed.masked_fill_(is_pad.unsqueeze(1), 0.0)  # Ignore padding
        per_token = torch.sum(label_smoothed * log_probs, dim=-1)
        # Average over real tokens only. Dividing by every position would
        # shrink the loss as more padding is added. clamp avoids 0/0 when the
        # whole batch is padding (the loss is then 0).
        n_tokens = (~is_pad).sum().clamp(min=1)
        return -per_token.sum() / n_tokens

# Learning Rate Schedule
class TransformerScheduler:
    """Warm-up then inverse-square-root learning-rate schedule.

    On the n-th call to ``step`` the rate is
    ``d_model**-0.5 * min(n**-0.5, n * warmup_steps**-1.5)``.
    It is written into every param group of ``optimizer``.
    """

    def __init__(self, optimizer, d_model, warmup_steps):
        self.optimizer, self.d_model, self.warmup_steps = optimizer, d_model, warmup_steps
        self.step_num = 0

    def step(self):
        """Advance one step, apply the new learning rate and return it."""
        self.step_num += 1
        decay = self.step_num ** -0.5                         # dominates after warm-up
        ramp = self.step_num * self.warmup_steps ** -1.5      # dominates during warm-up
        lr = (self.d_model ** -0.5) * min(decay, ramp)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        return lr

# Training
# The Transformer constructor names its vocabulary arguments for an
# English -> Kannada model (source = "en", target = "ka"). This script
# translates Kannada -> English, so the sizes are passed swapped on purpose:
# the source embedding gets the Kannada vocab, and the target embedding and
# output layer get the English vocab.
model = Transformer(d_model, n_heads, d_ff, max_seq_len, len(vocab_ka), len(vocab_en))
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)
scheduler = TransformerScheduler(optimizer, d_model, warmup_steps)
# The loss is computed over target-side (English) tokens, so its class count
# must match the model's output layer, i.e. the English vocabulary.
criterion = CrossEntropyLossWithSmoothing(len(vocab_en), label_smoothing)

def train(model, data, epochs):
    """Train ``model`` on ``data`` for ``epochs`` passes, one example per step.

    ``data`` is an iterable of ``(source_ids, target_ids)`` pairs of plain
    Python lists. Uses the module-level ``optimizer``, ``scheduler`` and
    ``criterion``. Every 100th epoch (starting with epoch 0) the mean loss
    per example is printed.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0
        for src_ids, tgt_ids in data:
            src = torch.tensor([src_ids], dtype=torch.long)
            # Teacher forcing: the decoder sees the target without its last
            # token and is scored against the target without its first token.
            decoder_in = torch.tensor([tgt_ids[:-1]], dtype=torch.long)
            decoder_gold = torch.tensor([tgt_ids[1:]], dtype=torch.long)
            pad_mask = create_padding_mask(src)
            causal_mask = create_causal_mask(decoder_in.size(1))

            optimizer.zero_grad()
            logits = model(src, decoder_in, pad_mask, causal_mask)
            loss = criterion(logits, decoder_gold)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            running_loss += loss.item()
        if epoch % 100 != 0:
            continue
        print(f"Epoch {epoch}, Loss: {running_loss / len(data):.4f}")

# Run training at import/cell-execution time; progress is printed every 100 epochs.
train(model, data, epochs)



# translate_ka_to_en function
def translate_ka_to_en(model, sentence, vocab_ka, vocab_en, max_len=10):
    """Greedily decode an English translation of a Kannada ``sentence``.

    The sentence is lower-cased, split on whitespace, wrapped in
    ``<sos>``/``<eos>`` and right-padded with id 0 up to ``max_len``. Words
    missing from ``vocab_ka`` also map to id 0. Decoding picks the arg-max
    token at each step and stops at ``<eos>`` or after ``max_len - 1`` steps.

    Puts ``model`` into eval mode as a side effect. Returns the decoded words
    joined by single spaces, without pad/``<sos>``/``<eos>`` tokens.
    ``vocab_en`` must contain ``<sos>`` and ``<eos>``.
    """
    model.eval()
    sos_id = vocab_en["<sos>"]
    eos_id = vocab_en["<eos>"]
    id_to_word = {idx: word for word, idx in vocab_en.items()}

    # Encode and pad the source sentence.
    words = ["<sos>", *sentence.lower().split(), "<eos>"]
    src_ids = [vocab_ka.get(word, 0) for word in words]
    src_ids = src_ids + [0] * (max_len - len(src_ids))
    src = torch.tensor([src_ids], dtype=torch.long)
    src_mask = create_padding_mask(src)

    # The target buffer starts as <sos> followed by padding and is filled in place.
    tgt = torch.tensor([[sos_id] + [0] * (max_len - 1)], dtype=torch.long)
    for step in range(max_len - 1):
        prefix_len = step + 1
        tgt_mask = create_causal_mask(prefix_len)
        with torch.no_grad():
            logits = model(src, tgt[:, :prefix_len], src_mask, tgt_mask)
        predicted = logits[0, step, :].argmax().item()
        tgt[0, prefix_len] = predicted
        if predicted == eos_id:
            break

    skipped = {0, sos_id, eos_id}
    return " ".join(id_to_word[token] for token in tgt[0].tolist() if token not in skipped)

# Test
# Sanity check on sentences that also appear in the training data: this shows
# the model has memorised them, not that it generalises.
test_sentences_ka = [
    "naanu ge shale hoguttene",
    "naavu ahara tinnuttene",
    "naanu pustaka oduttene"
]

# Print each Kannada sentence followed by its greedy English decoding.
for sentence in test_sentences_ka:
    translated = translate_ka_to_en(model, sentence, vocab_ka, vocab_en)
    print(f"Kannada: {sentence}")
    print(f"English: {translated}\n")


Epoch 0, Loss: 2.7710
Epoch 100, Loss: 0.5653
Epoch 200, Loss: 0.5650
Epoch 300, Loss: 0.5650
Epoch 400, Loss: 0.5650
Epoch 500, Loss: 0.5649
Epoch 600, Loss: 0.5649
Epoch 700, Loss: 0.5649
Epoch 800, Loss: 0.5649
Epoch 900, Loss: 0.5649
Kannada: naanu ge shale hoguttene
English: i go to school

Kannada: naavu ahara tinnuttene
English: we eat food

Kannada: naanu pustaka oduttene
English: i read book

