<a href="https://colab.research.google.com/github/sachin886x/deep-learning-lab/blob/main/Untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Experiment 7: Sequence-to-Sequence Learning with Transformers
English-to-Spanish Neural Machine Translation
"""

import math
import time
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter

# ─────────────────────────────────────────────
# 0. Config
# ─────────────────────────────────────────────
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when CUDA is available, otherwise CPU
DATA_PATH   = "spa.txt"          # tab-separated English\tSpanish file
MAX_SAMPLES = 10_000             # default cap on sentence pairs kept by load_pairs
MAX_LEN     = 50                 # token cap per sentence in TranslationDataset; also the default decode-step limit in translate
MIN_FREQ    = 2                  # default minimum count for a token to enter a Vocab
BATCH_SIZE  = 64                 # batch size passed to every DataLoader
D_MODEL     = 256                # default embedding / hidden width of the Transformer
N_HEADS     = 8                  # default number of attention heads (must divide D_MODEL)
N_LAYERS    = 3                  # default number of encoder layers and of decoder layers
D_FF        = 512                # default inner width of the position-wise feed-forward net
DROPOUT     = 0.1                # default dropout probability used across the model
EPOCHS      = 20                 # number of training epochs run by the main script
LR          = 3e-4               # initial Adam learning rate
CLIP        = 1.0                # max gradient norm used by clip_grad_norm_ in train_epoch

# Special tokens. Vocab puts them first in this order, so their ids are 0..3;
# collate_fn, the embeddings and the masks hard-code PAD=0, SOS=1, EOS=2.
PAD, SOS, EOS, UNK = "<pad>", "<sos>", "<eos>", "<unk>"

# ─────────────────────────────────────────────
# 1. Data
# ─────────────────────────────────────────────
def load_pairs(path, max_samples=MAX_SAMPLES):
    """Read sentence pairs from a tab-separated file.

    Each line is stripped and split on tabs; lines with fewer than two
    columns are skipped and any columns beyond the second are ignored.
    Both sentences are lowercased.

    Args:
        path: Path of the UTF-8 text file to read.
        max_samples: Maximum number of pairs to return.

    Returns:
        A list of ``(source, target)`` string tuples, shuffled with the
        global ``random`` state and truncated to ``max_samples`` entries.
    """
    with open(path, encoding="utf-8") as handle:
        rows = (raw.strip().split("\t") for raw in handle)
        # Consume the generator while the file is still open.
        collected = [(cols[0].lower(), cols[1].lower()) for cols in rows if len(cols) >= 2]
    random.shuffle(collected)
    return collected[:max_samples]

def tokenize(sentence):
    """Split a sentence into word tokens and single punctuation characters.

    Runs of word characters become one token each; every other
    non-whitespace character becomes its own token. Whitespace is dropped.
    """
    import re
    token_pattern = re.compile(r"\w+|[^\w\s]")
    return token_pattern.findall(sentence)

class Vocab:
    """Bidirectional token <-> id mapping built from a flat token stream.

    The four special tokens (PAD, SOS, EOS, UNK) always occupy ids 0..3,
    followed by every token that occurs at least ``min_freq`` times.
    """

    def __init__(self, tokens, min_freq=MIN_FREQ):
        frequencies = Counter(tokens)
        frequent = [tok for tok, count in frequencies.items() if count >= min_freq]
        # itos: id -> token; stoi: token -> id.
        self.itos = [PAD, SOS, EOS, UNK] + frequent
        self.stoi = {tok: idx for idx, tok in enumerate(self.itos)}

    def __len__(self):
        return len(self.itos)

    def encode(self, tokens):
        """Map tokens to ids, sending anything out of vocabulary to the UNK id."""
        lookup = self.stoi.get
        return [lookup(tok, self.stoi[UNK]) for tok in tokens]

def build_vocabs(pairs):
    """Build the source and target vocabularies from sentence pairs.

    Args:
        pairs: Sequence of ``(source_sentence, target_sentence)`` strings.

    Returns:
        ``(src_vocab, tgt_vocab)``, each a ``Vocab`` over the tokenized
        sentences of its side.
    """
    source_stream = []
    for source_sentence, _ in pairs:
        source_stream.extend(tokenize(source_sentence))

    target_stream = []
    for _, target_sentence in pairs:
        target_stream.extend(tokenize(target_sentence))

    return Vocab(source_stream), Vocab(target_stream)

class TranslationDataset(Dataset):
    """In-memory dataset of encoded (source ids, target ids) pairs.

    Each sentence is tokenized, encoded with its vocabulary and truncated
    to ``max_len`` ids. No padding or SOS/EOS markers are added here; that
    is left to the collate function.

    Args:
        pairs: Iterable of ``(source_sentence, target_sentence)`` strings.
        src_vocab: Vocabulary used to encode source sentences.
        tgt_vocab: Vocabulary used to encode target sentences.
        max_len: Maximum number of token ids kept per sentence.
    """

    def __init__(self, pairs, src_vocab, tgt_vocab, max_len=MAX_LEN):
        self.data = [
            (
                src_vocab.encode(tokenize(en))[:max_len],
                tgt_vocab.encode(tokenize(es))[:max_len],
            )
            for en, es in pairs
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

def collate_fn(batch):
    """Turn a list of (src_ids, tgt_ids) pairs into padded LongTensors.

    Returns:
        ``(src, tgt_in, tgt_out)`` where ``tgt_in`` is each target prefixed
        with id 1 (<sos>) and ``tgt_out`` is each target suffixed with id 2
        (<eos>). Every tensor is right-padded with id 0 (<pad>) to the
        longest row in its own group.
    """
    sources, targets = zip(*batch)

    def pad_seqs(seqs):
        # Right-pad every row with the pad id (0) up to the longest row.
        width = max(len(row) for row in seqs)
        padded = [row + [0] * (width - len(row)) for row in seqs]
        return torch.tensor(padded, dtype=torch.long)

    decoder_inputs = [[1] + list(ids) for ids in targets]   # <sos> + tokens
    decoder_labels = [list(ids) + [2] for ids in targets]   # tokens + <eos>
    return pad_seqs(sources), pad_seqs(decoder_inputs), pad_seqs(decoder_labels)

# ─────────────────────────────────────────────
# 2. Positional Encoding
# ─────────────────────────────────────────────
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to embeddings, then apply dropout.

    Even feature columns hold sines and odd columns hold cosines of the
    position scaled by geometrically spaced frequencies. The table is
    stored as a non-trainable buffer named ``pe`` of shape
    ``(1, max_len, d_model)``.

    Args:
        d_model: Feature width of the embeddings. Odd widths are supported.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions precomputed; longer inputs cannot be
            encoded.
    """

    def __init__(self, d_model, dropout=DROPOUT, max_len=512):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(max_len).unsqueeze(1).float()
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        # With an odd d_model there is one fewer odd column than even
        # columns, so trim the frequency vector to the number of odd columns.
        pe[:, 1::2] = torch.cos(pos * div[: d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        """Return ``dropout(x + pe)`` using the first ``x.size(1)`` positions."""
        return self.dropout(x + self.pe[:, :x.size(1)])

# ─────────────────────────────────────────────
# 3. Multi-Head Attention
# ─────────────────────────────────────────────
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over several heads in parallel.

    Args:
        d_model: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, n_heads, dropout=DROPOUT):
        super().__init__()
        assert d_model % n_heads == 0
        self.h = n_heads
        self.d_k = d_model // n_heads
        # Input projections for queries, keys and values, plus the output projection.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def split_heads(self, x):
        """Reshape (B, L, d_model) into (B, h, L, d_k)."""
        batch, length, _ = x.shape
        per_head = x.view(batch, length, self.h, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``; positions where ``mask == 0`` are suppressed."""
        queries = self.split_heads(self.W_q(q))
        keys = self.split_heads(self.W_k(k))
        values = self.split_heads(self.W_v(v))

        scores = queries.matmul(keys.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # A large negative score makes the softmax weight effectively zero.
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.dropout(scores.softmax(dim=-1))

        # (B, h, L, d_k) -> (B, L, h, d_k) -> (B, L, d_model)
        merged = weights.matmul(values).transpose(1, 2).contiguous()
        batch, length = merged.size(0), merged.size(1)
        merged = merged.view(batch, length, -1)
        return self.W_o(merged)

# ─────────────────────────────────────────────
# 4. Feed Forward
# ─────────────────────────────────────────────
class FeedForward(nn.Module):
    """Position-wise feed-forward net: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff, dropout=DROPOUT):
        super().__init__()
        stages = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)

# ─────────────────────────────────────────────
# 5. Encoder Layer & Encoder
# ─────────────────────────────────────────────
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer is followed by dropout, a residual connection and
    LayerNorm (post-norm arrangement).
    """

    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x`` using ``mask`` for self-attention."""
        attended = self.attn(x, x, x, mask)
        x = self.ln1(x + self.drop(attended))
        transformed = self.ff(x)
        return self.ln2(x + self.drop(transformed))

class Encoder(nn.Module):
    """Token embedding, positional encoding and a stack of encoder layers.

    Embeddings are multiplied by ``sqrt(d_model)`` before the positional
    encoding is added. Id 0 is the embedding's padding index.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pe = PositionalEncoding(d_model, dropout)
        self.scale = math.sqrt(d_model)
        stack = [EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        self.layers = nn.ModuleList(stack)

    def forward(self, src, mask):
        """Encode source ids ``src``; ``mask`` is forwarded to every layer."""
        hidden = self.embed(src) * self.scale
        hidden = self.pe(hidden)
        for block in self.layers:
            hidden = block(hidden, mask)
        return hidden

# ─────────────────────────────────────────────
# 6. Decoder Layer & Decoder
# ─────────────────────────────────────────────
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, then feed-forward.

    Each sub-layer is followed by dropout, a residual connection and
    LayerNorm (post-norm arrangement).
    """

    def __init__(self, d_model, n_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.ln3 = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        """Apply the block.

        ``tgt_mask`` restricts self-attention over ``x``; ``src_mask``
        restricts cross-attention over the encoder output ``enc_out``.
        """
        own_context = self.self_attn(x, x, x, tgt_mask)
        x = self.ln1(x + self.drop(own_context))

        source_context = self.cross_attn(x, enc_out, enc_out, src_mask)
        x = self.ln2(x + self.drop(source_context))

        transformed = self.ff(x)
        return self.ln3(x + self.drop(transformed))

class Decoder(nn.Module):
    """Token embedding, positional encoding, decoder layers and a vocabulary projection.

    Embeddings are multiplied by ``sqrt(d_model)`` before the positional
    encoding is added. Id 0 is the embedding's padding index. The final
    linear layer maps each position to ``vocab_size`` logits.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pe = PositionalEncoding(d_model, dropout)
        self.scale = math.sqrt(d_model)
        stack = [DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        self.layers = nn.ModuleList(stack)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, enc_out, src_mask, tgt_mask):
        """Return per-position vocabulary logits for target ids ``tgt``."""
        hidden = self.embed(tgt) * self.scale
        hidden = self.pe(hidden)
        for block in self.layers:
            hidden = block(hidden, enc_out, src_mask, tgt_mask)
        return self.fc_out(hidden)

# ─────────────────────────────────────────────
# 7. Full Transformer
# ─────────────────────────────────────────────
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        d_model: Model width.
        n_heads: Attention heads per layer.
        d_ff: Hidden width of the feed-forward sub-layers.
        n_layers: Number of layers in the encoder and in the decoder.
        dropout: Dropout probability.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size,
                 d_model=D_MODEL, n_heads=N_HEADS, d_ff=D_FF,
                 n_layers=N_LAYERS, dropout=DROPOUT):
        super().__init__()
        shared = (d_model, n_heads, d_ff, n_layers, dropout)
        self.encoder = Encoder(src_vocab_size, *shared)
        self.decoder = Decoder(tgt_vocab_size, *shared)

    def make_src_mask(self, src):
        """Return a (B, 1, 1, L) boolean mask that is True at non-pad (id != 0) positions."""
        not_pad = src != 0
        return not_pad.unsqueeze(1).unsqueeze(2)

    def make_tgt_mask(self, tgt):
        """Return a boolean mask combining padding and causality for ``tgt``.

        The (B, 1, 1, L) padding mask is AND-ed with an (L, L)
        lower-triangular matrix, so a position may only attend to
        non-pad positions at or before itself.
        """
        _, length = tgt.shape
        not_pad = (tgt != 0).unsqueeze(1).unsqueeze(2)            # (B,1,1,L)
        lower_tri = torch.ones(length, length, device=tgt.device)
        lower_tri = torch.tril(lower_tri).bool()                  # (L,L)
        return not_pad & lower_tri

    def forward(self, src, tgt):
        """Encode ``src`` and return decoder logits for the shifted target ``tgt``."""
        src_mask = self.make_src_mask(src)
        tgt_mask = self.make_tgt_mask(tgt)
        memory = self.encoder(src, src_mask)
        return self.decoder(tgt, memory, src_mask, tgt_mask)

# ─────────────────────────────────────────────
# 8. Training & Evaluation Utilities
# ─────────────────────────────────────────────
def train_epoch(model, loader, optimizer, criterion):
    """Run one training pass over ``loader`` and return the mean batch loss.

    For every batch: move tensors to DEVICE, compute logits with teacher
    forcing, back-propagate, clip the gradient norm to CLIP, and step the
    optimizer.
    """
    model.train()
    running = 0
    for batch in loader:
        src, tgt_in, tgt_out = (tensor.to(DEVICE) for tensor in batch)
        optimizer.zero_grad()
        logits = model(src, tgt_in)           # (B, L, vocab)
        vocab_size = logits.size(-1)
        # Flatten batch and time so the criterion sees one row per token.
        loss = criterion(logits.reshape(-1, vocab_size), tgt_out.reshape(-1))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        running += loss.item()
    return running / len(loader)

def evaluate(model, loader, criterion):
    """Return the mean batch loss of ``model`` over ``loader`` without updating it.

    The model is put in eval mode and gradients are disabled for the
    whole pass.
    """
    model.eval()
    running = 0
    with torch.no_grad():
        for batch in loader:
            src, tgt_in, tgt_out = (tensor.to(DEVICE) for tensor in batch)
            logits = model(src, tgt_in)
            vocab_size = logits.size(-1)
            batch_loss = criterion(logits.reshape(-1, vocab_size), tgt_out.reshape(-1))
            running += batch_loss.item()
    return running / len(loader)

# ─────────────────────────────────────────────
# 9. Greedy Decoding
# ─────────────────────────────────────────────
def translate(model, sentence, src_vocab, tgt_vocab, max_len=MAX_LEN):
    """Greedily decode a translation of ``sentence``.

    The sentence is lowercased, tokenized and encoded with ``src_vocab``,
    run through the encoder once, and then the decoder is called
    repeatedly, each time appending the highest-scoring next token.

    Args:
        model: Transformer exposing ``encoder``, ``decoder``,
            ``make_src_mask`` and ``make_tgt_mask``.
        sentence: Source-language text.
        src_vocab: Vocabulary used to encode the source tokens.
        tgt_vocab: Vocabulary used to decode the generated ids.
        max_len: Maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces, without the
        ``<sos>`` and ``<eos>`` markers.
    """
    model.eval()
    sos_id = tgt_vocab.stoi[SOS]
    eos_id = tgt_vocab.stoi[EOS]
    tokens = tokenize(sentence.lower())
    src = torch.tensor([src_vocab.encode(tokens)], dtype=torch.long).to(DEVICE)

    generated = []
    # Inference only: skip autograd bookkeeping for every decoding step.
    with torch.no_grad():
        src_mask = model.make_src_mask(src)
        enc_out = model.encoder(src, src_mask)
        for _ in range(max_len):
            tgt = torch.tensor([[sos_id] + generated], dtype=torch.long).to(DEVICE)
            tgt_mask = model.make_tgt_mask(tgt)
            out = model.decoder(tgt, enc_out, src_mask, tgt_mask)
            next_id = out[0, -1].argmax().item()
            # Stop on <eos> without recording it, so the last real token is
            # kept even when the loop ends by hitting max_len instead.
            if next_id == eos_id:
                break
            generated.append(next_id)
    return " ".join(tgt_vocab.itos[i] for i in generated)

# ─────────────────────────────────────────────
# 10. BLEU Score
# ─────────────────────────────────────────────
def _corpus_bleu(refs, hyps, max_order=4):
    """Compute corpus-level BLEU (0-100) from parallel token lists."""
    hyp_len = sum(len(h) for h in hyps)
    ref_len = sum(len(r) for r in refs)
    if hyp_len == 0 or ref_len == 0:
        # Nothing to compare (empty sample or all-empty sentences).
        return 0.0

    log_precision_sum = 0.0
    for order in range(1, max_order + 1):
        match, total = 0, 0
        for ref, hyp in zip(refs, hyps):
            ref_counts = Counter(tuple(ref[i:i + order]) for i in range(len(ref) - order + 1))
            hyp_counts = Counter(tuple(hyp[i:i + order]) for i in range(len(hyp) - order + 1))
            # Clipped counts: an n-gram is credited at most as often as it
            # occurs in the reference.
            match += sum(min(count, ref_counts[gram]) for gram, count in hyp_counts.items())
            total += sum(hyp_counts.values())
        precision = match / total if total > 0 else 0
        # The small epsilon keeps log() finite when an order has no matches.
        log_precision_sum += math.log(precision + 1e-10)

    # Standard brevity penalty: 1 if the hypotheses are at least as long as
    # the references, otherwise exp(1 - r/c).
    if hyp_len >= ref_len:
        bp = 1.0
    else:
        bp = math.exp(1 - ref_len / hyp_len)
    return bp * math.exp(log_precision_sum / max_order) * 100


def bleu_score(model, pairs, src_vocab, tgt_vocab, n=200):
    """Estimate corpus BLEU-4 (0-100) of ``model`` on a random sample of ``pairs``.

    Args:
        model: Translation model passed through to ``translate``.
        pairs: Sequence of ``(source_sentence, reference_sentence)`` strings.
        src_vocab: Source vocabulary passed to ``translate``.
        tgt_vocab: Target vocabulary passed to ``translate``.
        n: Maximum number of pairs to sample (without replacement).

    Returns:
        The BLEU score as a float in [0, 100]; 0.0 when the sample is empty
        or contains no tokens.
    """
    refs, hyps = [], []
    for en, es in random.sample(pairs, min(n, len(pairs))):
        refs.append(tokenize(es))
        # translate() joins its tokens with single spaces, so splitting on
        # whitespace recovers them exactly; re-tokenizing would break
        # "<unk>" into three separate tokens.
        hyps.append(translate(model, en, src_vocab, tgt_vocab).split())
    return _corpus_bleu(refs, hyps)

# ─────────────────────────────────────────────
# 11. Main
# ─────────────────────────────────────────────
# Script entry point: load the data, train the Transformer, then report test
# loss, BLEU and a few sample translations.
if __name__ == "__main__":
    print(f"Device: {DEVICE}")

    # Load & split data
    print("Loading data...")
    pairs = load_pairs(DATA_PATH)
    # NOTE(review): load_pairs already shuffles before truncating, so this
    # second shuffle only re-orders the kept pairs.
    random.shuffle(pairs)
    n = len(pairs)
    # 80% train / 10% validation / 10% test.
    train_pairs = pairs[:int(0.8*n)]
    val_pairs   = pairs[int(0.8*n):int(0.9*n)]
    test_pairs  = pairs[int(0.9*n):]
    print(f"Train: {len(train_pairs)} | Val: {len(val_pairs)} | Test: {len(test_pairs)}")

    # Build vocabs
    # Only the training split is used, so validation/test words missing from
    # it are encoded as <unk>.
    src_vocab, tgt_vocab = build_vocabs(train_pairs)
    print(f"Src vocab: {len(src_vocab)} | Tgt vocab: {len(tgt_vocab)}")

    # Datasets & loaders
    train_ds = TranslationDataset(train_pairs, src_vocab, tgt_vocab)
    val_ds   = TranslationDataset(val_pairs,   src_vocab, tgt_vocab)
    test_ds  = TranslationDataset(test_pairs,  src_vocab, tgt_vocab)
    # Only the training loader shuffles; collate_fn pads and adds <sos>/<eos>.
    train_loader = DataLoader(train_ds, BATCH_SIZE, shuffle=True,  collate_fn=collate_fn)
    val_loader   = DataLoader(val_ds,   BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
    test_loader  = DataLoader(test_ds,  BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    # Model
    model = Transformer(len(src_vocab), len(tgt_vocab)).to(DEVICE)
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Parameters: {total_params:,}")

    optimizer = torch.optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9)
    # ignore_index=0 excludes <pad> target positions from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    # Multiply the learning rate by 0.5 when validation loss plateaus (patience=2).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, factor=0.5)

    # Training loop
    best_val_loss = float("inf")
    t0 = time.time()
    for epoch in range(1, EPOCHS + 1):
        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        val_loss   = evaluate(model, val_loader, criterion)
        scheduler.step(val_loss)

        # Checkpoint whenever validation loss reaches a new minimum.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_transformer.pt")

        # Perplexity is reported as exp(validation loss).
        print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} "
              f"| Val PPL: {math.exp(val_loss):.2f}")

    train_time = time.time() - t0
    print(f"\nTraining time: {train_time/60:.1f} min")

    # Load best & evaluate on test
    model.load_state_dict(torch.load("best_transformer.pt", map_location=DEVICE))
    test_loss = evaluate(model, test_loader, criterion)
    print(f"Test Loss: {test_loss:.4f} | Test PPL: {math.exp(test_loss):.2f}")

    # bleu_score draws a random sample (default n=200) from the test pairs.
    print("\nCalculating BLEU score (test set sample)...")
    bleu = bleu_score(model, test_pairs, src_vocab, tgt_vocab)
    print(f"BLEU Score: {bleu:.2f}")

    # Sample translations
    print("\n--- Sample Translations ---")
    examples = [
        "Hello.",
        "How are you?",
        "I am fine.",
        "Good morning.",
        "Thank you very much.",
    ]
    for sent in examples:
        print(f"  EN: {sent}")
        print(f"  ES: {translate(model, sent, src_vocab, tgt_vocab)}\n")

Device: cpu
Loading data...
Train: 8000 | Val: 1000 | Test: 1000
Src vocab: 2306 | Tgt vocab: 2927
Parameters: 6,045,551
Epoch 01 | Train Loss: 4.7471 | Val Loss: 3.8603 | Val PPL: 47.48
Epoch 02 | Train Loss: 3.7744 | Val Loss: 3.4181 | Val PPL: 30.51
Epoch 03 | Train Loss: 3.3625 | Val Loss: 3.1682 | Val PPL: 23.77
Epoch 04 | Train Loss: 3.0519 | Val Loss: 2.9708 | Val PPL: 19.51
Epoch 05 | Train Loss: 2.8038 | Val Loss: 2.8496 | Val PPL: 17.28
Epoch 06 | Train Loss: 2.5867 | Val Loss: 2.7268 | Val PPL: 15.28
Epoch 07 | Train Loss: 2.3851 | Val Loss: 2.6348 | Val PPL: 13.94
Epoch 08 | Train Loss: 2.2042 | Val Loss: 2.5797 | Val PPL: 13.19
Epoch 09 | Train Loss: 2.0347 | Val Loss: 2.4851 | Val PPL: 12.00
Epoch 10 | Train Loss: 1.8767 | Val Loss: 2.4253 | Val PPL: 11.31
Epoch 11 | Train Loss: 1.7289 | Val Loss: 2.4030 | Val PPL: 11.06
Epoch 12 | Train Loss: 1.5894 | Val Loss: 2.3660 | Val PPL: 10.65
Epoch 13 | Train Loss: 1.4561 | Val Loss: 2.3615 | Val PPL: 10.61
Epoch 14 | Train Loss