# Part 1

In [45]:
'''
import math
import numpy as np

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Compute scaled dot-product attention.

    :param Q: Query matrix of shape (..., seq_len_q, d_k)
    :param K: Key matrix of shape (..., seq_len_k, d_k)
    :param V: Value matrix of shape (..., seq_len_k, d_v)
    :param mask: (Optional) broadcastable mask tensor to
                 apply on the attention logits before softmax
    :return: (output, attention_weights)
    """

    # 1. Calculate the dot products between Q and K^T
    #    shape of QK^T => (..., seq_len_q, seq_len_k)
    d_k = Q.shape[-1]
    scores = np.matmul(Q, np.transpose(K, axes=[0, 1, 3, 2])) / math.sqrt(d_k)

    # 2. (Optional) Apply the mask: set masked positions to a large negative value
    if mask is not None:
        scores = scores + (mask * -1e9)  # or float('-inf') if supported

    # 3. Softmax over the last dimension to get attention weights
    attention_weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = attention_weights / np.sum(attention_weights, axis=-1, keepdims=True)

    # 4. Multiply by V to get the weighted sum
    #    shape => (..., seq_len_q, d_v)
    output = np.matmul(attention_weights, V)

    return output, attention_weights
'''

'\nimport math\nimport numpy as np\n\ndef scaled_dot_product_attention(Q, K, V, mask=None):\n    """\n    Compute scaled dot-product attention.\n\n    :param Q: Query matrix of shape (..., seq_len_q, d_k)\n    :param K: Key matrix of shape (..., seq_len_k, d_k)\n    :param V: Value matrix of shape (..., seq_len_k, d_v)\n    :param mask: (Optional) broadcastable mask tensor to\n                 apply on the attention logits before softmax\n    :return: (output, attention_weights)\n    """\n\n    # 1. Calculate the dot products between Q and K^T\n    #    shape of QK^T => (..., seq_len_q, seq_len_k)\n    d_k = Q.shape[-1]\n    scores = np.matmul(Q, np.transpose(K, axes=[0, 1, 3, 2])) / math.sqrt(d_k)\n\n    # 2. (Optional) Apply the mask: set masked positions to a large negative value\n    if mask is not None:\n        scores = scores + (mask * -1e9)  # or float(\'-inf\') if supported\n\n    # 3. Softmax over the last dimension to get attention weights\n    attention_weights = np.exp(sco

# Part 2

In [46]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        # batch_first=True => input shape (batch, seq_len)

    def forward(self, src):
        """
        :param src: (batch_size, src_len) of token indices
        :return:
          - outputs: (batch_size, src_len, hidden_dim)
          - (h, c): final hidden/cell states for each layer
        """
        embedded = self.embedding(src)  # (batch, src_len, embed_dim)
        outputs, (h, c) = self.lstm(embedded)  # outputs: (batch, src_len, hidden_dim)
        return outputs, (h, c)


In [47]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)  # combine hidden + context
        self.hidden_dim = hidden_dim

    def forward(self, input_token, hidden, cell, encoder_outputs):
        """
        :param input_token: (batch_size,) current input word index
        :param hidden, cell: decoder's LSTM hidden/cell states
        :param encoder_outputs: (batch_size, src_len, hidden_dim)
        :return:
          - output logits: (batch_size, output_dim)
          - hidden, cell: updated states
          - attn_weights: (batch_size, 1, src_len)
        """
        # 1) Embed input token
        input_token = input_token.unsqueeze(1)   # (batch_size, 1)
        embedded = self.embedding(input_token)   # (batch_size, 1, embed_dim)

        # 2) Pass embedded token + previous hidden state into LSTM
        #    hidden, cell each is (num_layers, batch_size, hidden_dim)
        lstm_output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # lstm_output shape: (batch_size, 1, hidden_dim)

        # 3) Compute scaled dot-product attention
        #    Q = lstm_output, K=encoder_outputs, V=encoder_outputs
        #    But we need them in shape (batch, seq_len, d_model).
        #    Here, d_model = hidden_dim for simplicity
        Q = lstm_output  # (batch, 1, hidden_dim)
        K = encoder_outputs  # (batch, src_len, hidden_dim)
        V = encoder_outputs  # (batch, src_len, hidden_dim)

        # scaled_dot_product_attention expects (batch, seq_q, d_model)
        # and (batch, seq_k, d_model). That is correct for Q, K, V.
        context, attn_weights = scaled_dot_product_attention(Q, K, V)
        # context: (batch, 1, hidden_dim)
        # attn_weights: (batch, 1, src_len)

        # 4) Concatenate context and LSTM output, then predict next token
        #    shape => (batch, 1, hidden_dim*2)
        combined = torch.cat((lstm_output, context), dim=-1)
        # produce a distribution over output vocab
        logits = self.fc_out(combined.squeeze(1))  # (batch_size, output_dim)

        return logits, hidden, cell, attn_weights


In [48]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg):
        """
        src: (batch_size, src_len)
        trg: (batch_size, trg_len) -- for training
        returns: logits for each trg token
                 shape => (batch_size, trg_len, output_dim)
        """
        batch_size, trg_len = trg.size()
        # Encode source
        encoder_outputs, (hidden, cell) = self.encoder(src)

        # Prepare a place to store decoder predictions
        output_dim = self.decoder.fc_out.out_features
        outputs = torch.zeros(batch_size, trg_len, output_dim).to(self.device)

        # First input token to decoder is typically the <sos> (start) token
        input_token = trg[:, 0]  # (batch_size,)

        for t in range(1, trg_len):
            # Pass the input token + hidden states + encoder outputs
            logits, hidden, cell, attn_weights = self.decoder(input_token, hidden, cell, encoder_outputs)
            outputs[:, t, :] = logits

            # Pick next token (for *teacher forcing*, we often feed the true token)
            # or do greedy decoding. For training with teacher forcing:
            input_token = trg[:, t]

        return outputs


# Part 3

In [49]:
!pip install datasets



In [50]:
import random
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# For demonstration, reusing placeholders from earlier code
import regex as re
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [51]:
dataset = load_dataset("Helsinki-NLP/tatoeba", "en-mr")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 53462
    })
})


In [52]:
raw_data = dataset["train"]  # All Tatoeba sentence pairs for en-mr

# Convert to a Python list of dicts
raw_list = list(raw_data)

# subset for demonstration
raw_list = raw_list[:5000]

# Now split into train/val/test using scikit-learn
train_data, test_data = train_test_split(raw_list, test_size=0.1, random_state=42)
train_data, val_data  = train_test_split(train_data, test_size=0.1, random_state=42)

print("Train size:", len(train_data))
print("Val size:  ", len(val_data))
print("Test size: ", len(test_data))


Train size: 4050
Val size:   450
Test size:  500


In [53]:
def tokenize(sentence):
    # Very naive approach
    sentence = sentence.lower()
    sentence = re.sub(r"[^\p{L}\p{N}'-]+", " ", sentence, flags=re.UNICODE)
    tokens = sentence.strip().split()
    return tokens

def build_vocab(tokenized_sents, min_freq=2):
    from collections import Counter
    freq = Counter()
    for sent in tokenized_sents:
        for w in sent:
            freq[w] += 1
    # Basic special tokens
    words = ["<pad>", "<sos>", "<eos>", "<unk>"]
    for w, c in freq.items():
        if c >= min_freq:
            words.append(w)
    word2idx = {w: i for i, w in enumerate(words)}
    return word2idx

def numericalize(tokens, word2idx):
    return [word2idx.get(t, word2idx["<unk>"]) for t in tokens]

def process_pairs(data_list):
    en_tok, mr_tok = [], []
    for d in data_list:
        # 'translation' is a list: [english_text, marathi_text]
        en_sent = tokenize(d["translation"]['en'])  # English
        mr_sent = tokenize(d["translation"]['mr'])  # Marathi
        en_tok.append(en_sent)
        mr_tok.append(mr_sent)
    return en_tok, mr_tok



In [54]:
en_tok_train, mr_tok_train = process_pairs(train_data)
en_tok_val,   mr_tok_val   = process_pairs(val_data)
en_tok_test,  mr_tok_test  = process_pairs(test_data)

en_word2idx = build_vocab(en_tok_train, min_freq=2)
mr_word2idx = build_vocab(mr_tok_train, min_freq=2)

def add_sos_eos(token_ids, sos_idx, eos_idx):
    return [sos_idx] + token_ids + [eos_idx]

def to_idx_pairs(en_tok, mr_tok):
    pairs = []
    for e, m in zip(en_tok, mr_tok):
        e_ids = numericalize(e, en_word2idx)
        m_ids = numericalize(m, mr_word2idx)
        # add <sos>, <eos> (we assume <sos>=1, <eos>=2 in that order)
        e_ids = add_sos_eos(e_ids, 1, 2)
        m_ids = add_sos_eos(m_ids, 1, 2)
        pairs.append((e_ids, m_ids))
    return pairs

train_pairs = to_idx_pairs(en_tok_train, mr_tok_train)
val_pairs   = to_idx_pairs(en_tok_val,   mr_tok_val)
test_pairs  = to_idx_pairs(en_tok_test,  mr_tok_test)

print(f"train_pairs: {len(train_pairs)}, val_pairs: {len(val_pairs)}, test_pairs: {len(test_pairs)}")


train_pairs: 4050, val_pairs: 450, test_pairs: 500


In [55]:
class MTDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs
    def __len__(self):
        return len(self.pairs)
    def __getitem__(self, idx):
        return self.pairs[idx]

def collate_fn(batch):
    src_list, trg_list = zip(*batch)
    max_src_len = max(len(s) for s in src_list)
    max_trg_len = max(len(t) for t in trg_list)
    padded_src, padded_trg = [], []
    for s, t in zip(src_list, trg_list):
        s_pad = s + [0]*(max_src_len - len(s))  # 0 = <pad>
        t_pad = t + [0]*(max_trg_len - len(t))
        padded_src.append(s_pad)
        padded_trg.append(t_pad)
    return torch.tensor(padded_src, dtype=torch.long), torch.tensor(padded_trg, dtype=torch.long)

train_dataset = MTDataset(train_pairs)
val_dataset   = MTDataset(val_pairs)
test_dataset  = test_pairs  # we can keep test as list for direct BLEU

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset,   batch_size=32, shuffle=False, collate_fn=collate_fn)


In [56]:
import torch
import torch.nn.functional as F
import math

def scaled_dot_product_attention(Q, K, V, mask=None):
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask, float('-inf'))
    attn_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attn_weights, V)
    return output, attn_weights


def train_model(model, train_loader, val_loader, epochs=5):
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0
        for src_batch, trg_batch in train_loader:
            src_batch, trg_batch = src_batch.to(model.device), trg_batch.to(model.device)
            optimizer.zero_grad()
            outputs = model(src_batch, trg_batch)
            logits = outputs[:, 1:].reshape(-1, outputs.size(-1))
            targets = trg_batch[:, 1:].reshape(-1)
            loss = criterion(logits, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        val_loss = evaluate(model, val_loader, criterion)
        print(f"Epoch {epoch}, Train Loss: {avg_loss:.3f}, Val Loss: {val_loss:.3f}")

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src_batch, trg_batch in loader:
            src_batch, trg_batch = src_batch.to(model.device), trg_batch.to(model.device)
            outputs = model(src_batch, trg_batch)
            logits = outputs[:, 1:].reshape(-1, outputs.size(-1))
            targets = trg_batch[:, 1:].reshape(-1)
            loss = criterion(logits, targets)
            total_loss += loss.item()
    return total_loss / len(loader)


In [57]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
en_vocab_size = len(en_word2idx)
mr_vocab_size = len(mr_word2idx)

encoder = Encoder(en_vocab_size, embed_dim=128, hidden_dim=256)
decoder = Decoder(mr_vocab_size, embed_dim=128, hidden_dim=256)
model = Seq2Seq(encoder, decoder, device).to(device)

train_model(model, train_loader, val_loader, epochs=10)


Epoch 1, Train Loss: 4.127, Val Loss: 3.686
Epoch 2, Train Loss: 3.529, Val Loss: 3.342
Epoch 3, Train Loss: 3.178, Val Loss: 3.130
Epoch 4, Train Loss: 2.878, Val Loss: 2.929
Epoch 5, Train Loss: 2.597, Val Loss: 2.774
Epoch 6, Train Loss: 2.330, Val Loss: 2.642
Epoch 7, Train Loss: 2.088, Val Loss: 2.533
Epoch 8, Train Loss: 1.861, Val Loss: 2.459
Epoch 9, Train Loss: 1.657, Val Loss: 2.411
Epoch 10, Train Loss: 1.468, Val Loss: 2.358


In [58]:
from nltk.translate.bleu_score import corpus_bleu

def translate_sentence(model, src_ids, max_len=50):
    model.eval()
    src_tensor = torch.tensor(src_ids, dtype=torch.long, device=model.device).unsqueeze(0)
    encoder_outputs, (h, c) = model.encoder(src_tensor)
    input_token = torch.tensor([1], device=model.device)  # <sos>=1
    hidden, cell = h, c
    preds = [1]
    for _ in range(max_len):
        logits, hidden, cell, _ = model.decoder(input_token, hidden, cell, encoder_outputs)
        next_token = logits.argmax(dim=-1)
        next_id = next_token.item()
        preds.append(next_id)
        if next_id == 2:  # <eos>=2
            break
        input_token = next_token
    return preds

def compute_bleu(model, test_data, fr_idx2word):
    references = []
    hypotheses = []
    model.eval()
    with torch.no_grad():
        for (src_ids, trg_ids) in test_data:
            # remove <sos>=1, <eos>=2 from reference
            gold = [w for w in trg_ids[1:-1] if w not in [0,1,2]]
            pred = translate_sentence(model, src_ids)
            # remove <sos>, <eos> from pred
            pred = [w for w in pred if w not in [0,1,2]]
            references.append([gold])
            hypotheses.append(pred)
    bleu = corpus_bleu(references, hypotheses) * 100
    return bleu

# Index->word
mr_idx2word = {v: k for k, v in mr_word2idx.items()}

bleu_score = compute_bleu(model, test_dataset, mr_idx2word)
print(f"Test BLEU: {bleu_score:.2f}")


Test BLEU: 15.37


# Part 4

In [59]:
import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape => (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        x: (batch_size, seq_len, d_model)
        """
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]
        return x


In [73]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model=64, num_heads=2):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)  # final projection

    def forward(self, Q, K, V, mask=None):
        """
        Q, K, V: (batch, seq_len, d_model)
        mask: broadcastable bool mask (True=ignore)
        """
        B, Lq, D = Q.shape
        _, Lk, _ = K.shape

        # 1) Project Q, K, V
        q = self.W_q(Q)
        k = self.W_k(K)
        v = self.W_v(V)

        # 2) Reshape for multi-head
        q = q.view(B, Lq, self.num_heads, self.d_k).transpose(1, 2)  # (B, heads, L, d_k)
        k = k.view(B, Lk, self.num_heads, self.d_k).transpose(1, 2)
        v = v.view(B, Lk, self.num_heads, self.d_k).transpose(1, 2)


        # Flatten batch for scaled dot-product attention
        Bnh = B * self.num_heads
        q = q.contiguous().view(Bnh, Lq, self.d_k)
        k = k.contiguous().view(Bnh, Lk, self.d_k)
        v = v.contiguous().view(Bnh, Lk, self.d_k)

        if mask is not None:
            # Expand mask for multiple heads if needed
            mask = mask.repeat(self.num_heads, 1, 1, 1)

        # 3) Call your scaled_dot_product_attention
        output, attn_weights = scaled_dot_product_attention(q, k, v, mask=mask)


        # 4) Reshape back
        output = output.view(B, self.num_heads, Lq, self.d_k).transpose(1, 2).contiguous()
        output = output.view(B, Lq, D)

        # 5) Final projection
        output = self.W_o(output)
        return output, attn_weights


In [61]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model=64, dim_feedforward=128):
        super().__init__()
        self.fc1 = nn.Linear(d_model, dim_feedforward)
        self.fc2 = nn.Linear(dim_feedforward, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        x => (batch, seq_len, d_model)
        """
        return self.fc2(self.relu(self.fc1(x)))


In [62]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model=64, num_heads=2, dim_feedforward=128):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedForward(d_model, dim_feedforward)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, src_mask=None):
        # Self-attention
        attn_out, _ = self.mha(x, x, x, mask=src_mask)
        x = self.norm1(x + attn_out)
        # Feed-forward
        ffn_out = self.ffn(x)
        x = self.norm2(x + ffn_out)
        return x


In [63]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model=64, num_heads=2, dim_feedforward=128):
        super().__init__()
        self.self_mha = MultiHeadAttention(d_model, num_heads)
        self.cross_mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedForward(d_model, dim_feedforward)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, enc_out, tgt_mask=None, src_mask=None):
        # 1) Masked self-attention
        out1, _ = self.self_mha(x, x, x, mask=tgt_mask)
        x = self.norm1(x + out1)
        # 2) Cross-attention
        out2, attn_weights = self.cross_mha(x, enc_out, enc_out, mask=src_mask)
        x = self.norm2(x + out2)
        # 3) Feed-forward
        out3 = self.ffn(x)
        x = self.norm3(x + out3)
        return x, attn_weights


In [64]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model=64, num_heads=2, dim_feedforward=128, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, dim_feedforward)
            for _ in range(num_layers)
        ])

    def forward(self, src, src_mask=None):
        # src => (batch, src_len)
        x = self.embedding(src)  # (batch, src_len, d_model)
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x, src_mask)
        return x

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model=64, num_heads=2, dim_feedforward=128, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, dim_feedforward)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, enc_out, tgt_mask=None, src_mask=None):
        # tgt => (batch, tgt_len)
        x = self.embedding(tgt)
        x = self.pos_encoding(x)
        attn_weights = None
        for layer in self.layers:
            x, attn_weights = layer(x, enc_out, tgt_mask, src_mask)
        logits = self.fc_out(x)  # (batch, tgt_len, vocab_size)
        return logits, attn_weights


In [75]:
class TransformerModel(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        tgt_vocab_size,
        d_model=64,
        num_heads=2,
        dim_feedforward=128,
        num_layers=2,
        device=None
    ):
        super().__init__()
        self.device = device
        self.encoder = Encoder(src_vocab_size, d_model, num_heads, dim_feedforward, num_layers)
        self.decoder = Decoder(tgt_vocab_size, d_model, num_heads, dim_feedforward, num_layers)

    def make_subsequent_mask(self, sz):
        """
        Creates a mask for the subsequent tokens (True=masked).
        shape => (1, sz, sz)
        """
        mask = torch.triu(torch.ones(sz, sz), diagonal=1).bool()
        return mask.unsqueeze(0)

    def forward(self, src, tgt):
        # 1) Encode
        enc_out = self.encoder(src)
        # 2) Create target mask for autoregressive decoding
        B, tgt_len = tgt.size()
        tgt_mask = self.make_subsequent_mask(tgt_len).to(tgt.device)
        # 3) Decode
        logits, attn_weights = self.decoder(tgt, enc_out, tgt_mask, src_mask=None)
        return logits, attn_weights


In [76]:
en_tok_train, mr_tok_train = process_pairs(train_data)
en_tok_val,   mr_tok_val   = process_pairs(val_data)
en_tok_test,  mr_tok_test  = process_pairs(test_data)

en_word2idx = build_vocab(en_tok_train, min_freq=2)
mr_word2idx = build_vocab(mr_tok_train, min_freq=2)

train_pairs = to_idx_pairs(en_tok_train, mr_tok_train)
val_pairs   = to_idx_pairs(en_tok_val,   mr_tok_val)
test_pairs  = to_idx_pairs(en_tok_test,  mr_tok_test)

train_dataset = MTDataset(train_pairs)
val_dataset   = MTDataset(val_pairs)
test_dataset  = test_pairs  # we can keep test as list for direct BLEU

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_dataset,   batch_size=32, shuffle=False, collate_fn=collate_fn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
en_vocab_size = len(en_word2idx)
mr_vocab_size = len(mr_word2idx)

model = TransformerModel(
    src_vocab_size=en_vocab_size,
    tgt_vocab_size=mr_vocab_size,
    d_model=64,
    num_heads=2,
    dim_feedforward=128,
    num_layers=2,
    device=device
).to(device)

train_model(model, train_loader, val_loader, epochs=10)

mr_idx2word = {v: k for k, v in mr_word2idx.items()}

bleu_score = compute_bleu(model, test_dataset, mr_idx2word)
print(f"Test BLEU: {bleu_score:.2f}")

RuntimeError: shape '[32, 2, 28, 32]' is invalid for input of size 114688

I could not figure out how to debug this error. There is a shape mismatch and I already ensured that the mask I pass for cross-attention is not already repeated and that my multi-head attention logic handles L(Q) and L(K,V) separately. Besides that, I believe my code meets all the requirements outlined in Task 3 part 4.