# WMT14 English-German Inference

Production inference notebook for translating with the trained Transformer model.

## Features
- Load trained model and tokenizer
- Single and batch translation
- Greedy and beam search decoding
- BLEU score evaluation on test set

## Requirements
```bash
pip install sacrebleu datasets
```

## 1. Setup

In [1]:
import sys
sys.path.insert(0, '..')

import os
from pathlib import Path
from typing import List, Optional

import torch
import torch.nn.functional as F
from tqdm import tqdm

from src import Transformer
from src.tokenizer import Tokenizer, PAD_ID, BOS_ID, EOS_ID

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Device: cuda
GPU: NVIDIA GeForce RTX 5090


## 2. Load Model and Tokenizer

In [19]:
# Checkpoint directory
CHECKPOINT_DIR = Path("../checkpoints/wmt14_base")

# Choose checkpoint (best_model.pt or final_model.pt)
MODEL_PATH = CHECKPOINT_DIR / "final_model.pt"
TOKENIZER_PATH = CHECKPOINT_DIR / "tokenizer.model"

print(f"Loading from: {CHECKPOINT_DIR}")
print(f"Model: {MODEL_PATH.name}")
print(f"Tokenizer: {TOKENIZER_PATH.name}")

Loading from: ../checkpoints/wmt14_base
Model: final_model.pt
Tokenizer: tokenizer.model


In [20]:
# Load tokenizer
print("Loading tokenizer...")
tokenizer = Tokenizer(model_path=str(TOKENIZER_PATH))
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Special tokens: PAD={tokenizer.pad_id}, BOS={tokenizer.bos_id}, EOS={tokenizer.eos_id}")

Loading tokenizer...
Vocabulary size: 37000
Special tokens: PAD=0, BOS=2, EOS=3


In [21]:
# Load checkpoint
print("Loading checkpoint...")
checkpoint = torch.load(MODEL_PATH, map_location=device, weights_only=False)

config = checkpoint['config']
print(f"\nModel configuration:")
print(f"  d_model: {config['d_model']}")
print(f"  n_heads: {config['n_heads']}")
print(f"  n_layers: {config['n_layers']}")
print(f"  d_ff: {config['d_ff']}")
print(f"  Training step: {checkpoint['step']}")
print(f"  Loss: {checkpoint['loss']:.4f}")

Loading checkpoint...

Model configuration:
  d_model: 512
  n_heads: 8
  n_layers: 6
  d_ff: 2048
  Training step: 100000
  Loss: 0.0000


In [22]:
# Build model
print("\nBuilding model...")
model = Transformer(
    src_vocab_size=tokenizer.vocab_size,
    tgt_vocab_size=tokenizer.vocab_size,
    d_model=config['d_model'],
    n_heads=config['n_heads'],
    n_encoder_layers=config['n_layers'],
    n_decoder_layers=config['n_layers'],
    d_ff=config['d_ff'],
    dropout=0.0,  # No dropout during inference
    max_seq_len=config.get('max_seq_len', 512),
    pad_idx=tokenizer.pad_id,
    share_embeddings=True,
)

# Load weights
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

print(f"Model loaded successfully!")
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")


Building model...
Model loaded successfully!
Parameters: 82,028,680


## 3. Translation Functions

In [23]:
def translate_greedy(
    model: Transformer,
    tokenizer: Tokenizer,
    text: str,
    max_len: int = 128,
    device: str = 'cuda',
) -> str:
    """
    Translate a single sentence using greedy decoding.
    
    Args:
        model: Trained Transformer model
        tokenizer: BPE tokenizer
        text: Source text (English)
        max_len: Maximum output length
        device: Device to run on
    
    Returns:
        Translated text (German)
    """
    model.eval()
    
    # Encode source
    src_ids = tokenizer.encode(text, add_bos=True, add_eos=True)
    src_tensor = torch.tensor([src_ids], device=device)
    
    # Generate
    with torch.no_grad():
        output = model.generate(
            src=src_tensor,
            max_len=max_len,
            start_token=tokenizer.bos_id,
            end_token=tokenizer.eos_id,
        )
    
    # Decode
    translation = tokenizer.decode(output[0].tolist(), skip_special_tokens=True)
    return translation


def translate_beam_search(
    model: Transformer,
    tokenizer: Tokenizer,
    text: str,
    beam_size: int = 4,
    max_len: int = 128,
    length_penalty: float = 0.6,
    device: str = 'cuda',
) -> str:
    """
    Translate using beam search for better quality.
    
    Args:
        model: Trained Transformer model
        tokenizer: BPE tokenizer
        text: Source text (English)
        beam_size: Number of beams
        max_len: Maximum output length
        length_penalty: Length normalization factor
        device: Device to run on
    
    Returns:
        Translated text (German)
    """
    model.eval()
    
    # Encode source
    src_ids = tokenizer.encode(text, add_bos=True, add_eos=True)
    src_tensor = torch.tensor([src_ids], device=device)
    
    with torch.no_grad():
        # Encode source once
        memory = model.encode(src_tensor)
        
        # Initialize beams: (score, token_ids)
        beams = [(0.0, [tokenizer.bos_id])]
        completed = []
        
        for step in range(max_len):
            if not beams:
                break
                
            all_candidates = []
            
            for score, tokens in beams:
                if tokens[-1] == tokenizer.eos_id:
                    # This beam is complete
                    completed.append((score, tokens))
                    continue
                
                # Decode current sequence
                tgt = torch.tensor([tokens], device=device)
                tgt_mask = model._create_tgt_mask(tgt)
                decoder_output = model.decode(tgt, memory, tgt_mask)
                logits = model.output_projection(decoder_output[:, -1, :])
                log_probs = F.log_softmax(logits, dim=-1)
                
                # Get top-k next tokens
                top_log_probs, top_ids = log_probs.topk(beam_size, dim=-1)
                
                for i in range(beam_size):
                    next_token = top_ids[0, i].item()
                    next_score = score + top_log_probs[0, i].item()
                    all_candidates.append((next_score, tokens + [next_token]))
            
            # Select top beams
            all_candidates.sort(key=lambda x: x[0], reverse=True)
            beams = all_candidates[:beam_size]
            
            # Early stopping if all beams completed
            if len(completed) >= beam_size:
                break
        
        # Add remaining beams to completed
        completed.extend(beams)
        
        # Apply length penalty and select best
        def score_with_penalty(item):
            score, tokens = item
            length = len(tokens)
            return score / (length ** length_penalty)
        
        completed.sort(key=score_with_penalty, reverse=True)
        best_tokens = completed[0][1]
    
    # Decode
    translation = tokenizer.decode(best_tokens, skip_special_tokens=True)
    return translation


def translate_batch(
    model: Transformer,
    tokenizer: Tokenizer,
    texts: List[str],
    max_len: int = 128,
    device: str = 'cuda',
) -> List[str]:
    """
    Translate a batch of sentences using greedy decoding.
    
    Args:
        model: Trained Transformer model
        tokenizer: BPE tokenizer
        texts: List of source texts
        max_len: Maximum output length
        device: Device to run on
    
    Returns:
        List of translations
    """
    model.eval()
    
    # Encode all sources
    src_ids_list = [tokenizer.encode(t, add_bos=True, add_eos=True) for t in texts]
    
    # Pad to same length
    max_src_len = max(len(ids) for ids in src_ids_list)
    src_padded = []
    for ids in src_ids_list:
        padded = ids + [tokenizer.pad_id] * (max_src_len - len(ids))
        src_padded.append(padded)
    
    src_tensor = torch.tensor(src_padded, device=device)
    
    # Generate
    with torch.no_grad():
        output = model.generate(
            src=src_tensor,
            max_len=max_len,
            start_token=tokenizer.bos_id,
            end_token=tokenizer.eos_id,
        )
    
    # Decode all
    translations = []
    for i in range(len(texts)):
        translation = tokenizer.decode(output[i].tolist(), skip_special_tokens=True)
        translations.append(translation)
    
    return translations

## 4. Single Sentence Translation

In [24]:
# Test sentences
test_sentences = [
    "The weather is nice today.",
    "I love machine learning and artificial intelligence.",
    "The European Union is an economic and political union of 27 member states.",
    "Scientists have discovered a new species of deep-sea fish.",
    "The quick brown fox jumps over the lazy dog.",
]

print("Single Sentence Translation (Greedy)")
print("="*70)

for sent in test_sentences:
    translation = translate_greedy(model, tokenizer, sent, device=device)
    print(f"\nEN: {sent}")
    print(f"DE: {translation}")

Single Sentence Translation (Greedy)

EN: The weather is nice today.
DE: Heute ist das Wetter schön.

EN: I love machine learning and artificial intelligence.
DE: Ich bin ein Experte für die Technik und die Intelligenz.

EN: The European Union is an economic and political union of 27 member states.
DE: Die Europäische Union ist eine politische Union und wirtschaftliche Staaten.

EN: Scientists have discovered a new species of deep-sea fish.
DE: Wissenschaftler haben eine neue Art von Fisch entdeckt.

EN: The quick brown fox jumps over the lazy dog.
DE: Der Held der Bucht ist schnell und mit einem schwarzen Fleck.


In [25]:
# Compare greedy vs beam search
print("Greedy vs Beam Search Comparison")
print("="*70)

for sent in test_sentences[:3]:
    greedy = translate_greedy(model, tokenizer, sent, device=device)
    beam = translate_beam_search(model, tokenizer, sent, beam_size=4, device=device)
    
    print(f"\nEN: {sent}")
    print(f"Greedy: {greedy}")
    print(f"Beam-4: {beam}")

Greedy vs Beam Search Comparison

EN: The weather is nice today.
Greedy: Heute ist das Wetter schön.
Beam-4: Das Wetter ist heute schön.

EN: I love machine learning and artificial intelligence.
Greedy: Ich bin ein Experte für die Technik und die Intelligenz.
Beam-4: Ich liebe die Technik und die Intelligenz.

EN: The European Union is an economic and political union of 27 member states.
Greedy: Die Europäische Union ist eine politische Union und wirtschaftliche Staaten.
Beam-4: Die Europäische Union ist eine politische und wirtschaftliche Union.


## 5. Batch Translation

In [26]:
# Batch translation
print("Batch Translation")
print("="*70)

batch_sentences = [
    "Hello, how are you?",
    "The meeting starts at 9 AM.",
    "Please send me the report by Friday.",
    "Thank you for your help.",
]

translations = translate_batch(model, tokenizer, batch_sentences, device=device)

for en, de in zip(batch_sentences, translations):
    print(f"\nEN: {en}")
    print(f"DE: {de}")

Batch Translation

EN: Hello, how are you?
DE: Hallo, wie du es willst???

EN: The meeting starts at 9 AM.
DE: Die Konferenz wird am Freitag von 9:00 bis 10:00 Uhr stattfinden.

EN: Please send me the report by Friday.
DE: Bitte geben Sie mir den Bericht an.

EN: Thank you for your help.
DE: Vielen Dank für Ihre Hilfe. ich danke Ihnen für Ihre Hilfe. für Ihre


## 6. BLEU Score Evaluation

In [27]:
try:
    import sacrebleu
    HAS_SACREBLEU = True
except ImportError:
    print("sacrebleu not installed. Run: pip install sacrebleu")
    HAS_SACREBLEU = False

In [28]:
if HAS_SACREBLEU:
    from datasets import load_dataset
    
    # Load test set
    print("Loading WMT14 test set...")
    wmt14_test = load_dataset("wmt14", "de-en", split="test")
    print(f"Test set size: {len(wmt14_test)} examples")
    
    # Extract sentences
    test_en = [ex["translation"]["en"] for ex in wmt14_test]
    test_de = [ex["translation"]["de"] for ex in wmt14_test]  # References

Loading WMT14 test set...
Test set size: 3003 examples


In [30]:
if HAS_SACREBLEU:
    # Evaluate on subset (full test set takes time)
    EVAL_SIZE = 500  # Set to len(test_en) for full evaluation
    
    print(f"Evaluating on {EVAL_SIZE} examples...")
    print("This may take a few minutes...")
    
    # Generate translations
    hypotheses = []
    batch_size = 16
    
    for i in tqdm(range(0, EVAL_SIZE, batch_size)):
        batch = test_en[i:i+batch_size]
        translations = translate_batch(model, tokenizer, batch, device=device)
        hypotheses.extend(translations)
    
    references = test_de[:EVAL_SIZE]
    
    # Compute BLEU
    bleu = sacrebleu.corpus_bleu(hypotheses, [references])
    
    print(f"\nBLEU Score: {bleu.score:.2f}")
    print(f"Details: {bleu}")

Evaluating on 500 examples...
This may take a few minutes...


100%|██████████| 32/32 [00:08<00:00,  3.82it/s]


BLEU Score: 0.67
Details: BLEU = 0.67 12.0/1.4/0.2/0.1 (BP = 1.000 ratio = 2.465 hyp_len = 25492 ref_len = 10343)





In [14]:
if HAS_SACREBLEU:
    # Show example translations with references
    print("\nSample translations vs references:")
    print("="*70)
    
    for i in [0, 10, 50, 100]:
        if i < len(hypotheses):
            print(f"\n[{i}] Source: {test_en[i][:80]}..." if len(test_en[i]) > 80 else f"\n[{i}] Source: {test_en[i]}")
            print(f"    Reference: {references[i][:80]}..." if len(references[i]) > 80 else f"    Reference: {references[i]}")
            print(f"    Generated: {hypotheses[i][:80]}..." if len(hypotheses[i]) > 80 else f"    Generated: {hypotheses[i]}")


Sample translations vs references:

[0] Source: Gutach: Increased safety for pedestrians
    Reference: Gutach: Noch mehr Sicherheit für Fußgänger
    Generated: Höchste Sicherheit: Friede der Fußgängerurlaube.urlauber

[10] Source: "According to current measurements, around 12,000 vehicles travel through the to...
    Reference: "Laut aktuellen Messungen durchfahren auf der B 33 täglich etwa 12 000 Fahrzeuge...
    Generated: "Die Bucht von Kiel ist eine der größten Städte in Europa, die seit 1973 regelmä...

[50] Source: "It is not a matter of something we might choose to do," said Hasan Ikhrata, exe...
    Reference: „Es ist nichts, das wir nur möglicherweise verwenden werden“, sagte Hasan Ikhrat...
    Generated: "Wir sind nicht in der Lage, die verschiedenen Arten von Crew-Sendern zu überwac...

[100] Source: At the Metropolitan Transportation Commission in the San Francisco Bay Area, off...
    Reference: Bei der Metropolitan Transportation Commission für das Gebiet der San Fran

## 7. Interactive Translation

In [15]:
def interactive_translate():
    """Interactive translation mode."""
    print("Interactive EN->DE Translation")
    print("Type 'quit' to exit")
    print("="*50)
    
    while True:
        text = input("\nEN> ").strip()
        if text.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break
        if not text:
            continue
        
        translation = translate_greedy(model, tokenizer, text, device=device)
        print(f"DE> {translation}")

# Uncomment to run interactive mode:
# interactive_translate()

## 8. Model Analysis

In [16]:
# Analyze encoding representations
test_sent = "The Transformer model uses self-attention mechanisms."

src_ids = tokenizer.encode(test_sent, add_bos=True, add_eos=True)
src_tensor = torch.tensor([src_ids], device=device)

with torch.no_grad():
    # Get encoder output
    memory = model.encode(src_tensor)
    
print(f"Input: {test_sent}")
print(f"Tokens: {src_ids}")
print(f"Token pieces: {[tokenizer.id_to_piece(i) for i in src_ids]}")
print(f"\nEncoder output shape: {memory.shape}")
print(f"Encoder output stats:")
print(f"  Mean: {memory.mean().item():.4f}")
print(f"  Std: {memory.std().item():.4f}")
print(f"  Min: {memory.min().item():.4f}")
print(f"  Max: {memory.max().item():.4f}")

Input: The Transformer model uses self-attention mechanisms.
Tokens: [2, 251, 6473, 231, 501, 4727, 14927, 5374, 36786, 2474, 1151, 7327, 36767, 3]
Token pieces: ['<s>', '▁The', '▁Trans', 'for', 'mer', '▁model', '▁uses', '▁self', '-', 'att', 'ention', '▁mechanisms', '.', '</s>']

Encoder output shape: torch.Size([1, 14, 512])
Encoder output stats:
  Mean: 0.0026
  Std: 0.0191
  Min: -0.1250
  Max: 0.1611


In [17]:
# Token probability analysis
with torch.no_grad():
    # Start decoding
    tgt = torch.tensor([[tokenizer.bos_id]], device=device)
    tgt_mask = model._create_tgt_mask(tgt)
    decoder_output = model.decode(tgt, memory, tgt_mask)
    logits = model.output_projection(decoder_output[:, -1, :])
    probs = F.softmax(logits, dim=-1)
    
    # Top predictions for first token
    top_probs, top_ids = probs.topk(10, dim=-1)
    
print(f"\nTop 10 predictions for first output token:")
print(f"{'Rank':<6}{'Token':<20}{'Probability':<12}")
print("-" * 38)
for i in range(10):
    token_id = top_ids[0, i].item()
    prob = top_probs[0, i].item()
    piece = tokenizer.id_to_piece(token_id)
    print(f"{i+1:<6}{piece:<20}{prob:.4f}")


Top 10 predictions for first output token:
Rank  Token               Probability 
--------------------------------------
1     ▁Das                0.3259
2     ▁Der                0.1588
3     ▁Die                0.0499
4     ▁Im                 0.0312
5     ▁Mit                0.0293
6     ▁In                 0.0247
7     ▁Modell             0.0192
8     ▁Sim                0.0183
9     ▁Mod                0.0133
10    ▁Bei                0.0129


## 9. Export for Production

In [18]:
# Create a simple translation function for production use
class Translator:
    """Simple translator class for production use."""
    
    def __init__(self, model_path: str, tokenizer_path: str, device: str = 'cuda'):
        self.device = device
        
        # Load tokenizer
        self.tokenizer = Tokenizer(model_path=tokenizer_path)
        
        # Load model
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
        config = checkpoint['config']
        
        self.model = Transformer(
            src_vocab_size=self.tokenizer.vocab_size,
            tgt_vocab_size=self.tokenizer.vocab_size,
            d_model=config['d_model'],
            n_heads=config['n_heads'],
            n_encoder_layers=config['n_layers'],
            n_decoder_layers=config['n_layers'],
            d_ff=config['d_ff'],
            dropout=0.0,
            pad_idx=self.tokenizer.pad_id,
            share_embeddings=True,
        )
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model = self.model.to(device)
        self.model.eval()
    
    def translate(self, text: str, max_len: int = 128) -> str:
        """Translate a single sentence."""
        src_ids = self.tokenizer.encode(text, add_bos=True, add_eos=True)
        src_tensor = torch.tensor([src_ids], device=self.device)
        
        with torch.no_grad():
            output = self.model.generate(
                src=src_tensor,
                max_len=max_len,
                start_token=self.tokenizer.bos_id,
                end_token=self.tokenizer.eos_id,
            )
        
        return self.tokenizer.decode(output[0].tolist(), skip_special_tokens=True)
    
    def __call__(self, text: str) -> str:
        return self.translate(text)


# Example usage:
# translator = Translator(
#     model_path="checkpoints/wmt14_base/best_model.pt",
#     tokenizer_path="checkpoints/wmt14_base/tokenizer.model",
# )
# print(translator("Hello world"))

## Summary

This notebook demonstrated:

1. **Loading** trained model and tokenizer
2. **Greedy decoding** for fast translation
3. **Beam search** for higher quality translations
4. **Batch translation** for efficiency
5. **BLEU evaluation** on WMT14 test set
6. **Model analysis** and token probabilities

### Expected Results

| Training Steps | BLEU (newstest2014) |
|---------------|--------------------|
| 50K           | ~20                |
| 100K          | ~24                |
| 300K (paper)  | ~27.3              |

### Tips for Better Results

1. Train longer (300K+ steps)
2. Use beam search with length penalty
3. Implement checkpoint averaging
4. Use larger batch sizes if GPU memory allows