<a href="https://colab.research.google.com/github/profliuhao/CSIT599/blob/main/CSIT599_module5_exercise2_transformer_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 2: Transformer Model for Neural Machine Translation

English to German Translation

Student Name: ____________________


Learning Objectives:
1. Understand positional encoding
2. Implement multi-head self-attention
3. Build complete Transformer architecture
4. Track both loss and accuracy metrics
5. Compare with Seq2Seq models

Instructions:
- Fill in the blanks marked with `___BLANK___`
- Each blank is a simple parameter, function name, or dimension
- Run the code to train the Transformer
- Compare results with Exercise 1



In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import urllib.request
import zipfile
import os
import re
from tqdm.notebook import tqdm

# Fix the NumPy and TensorFlow random seeds so repeated runs start from the same state.
np.random.seed(42)
tf.random.set_seed(42)

# Report the TensorFlow version and whether TensorFlow can see at least one GPU device.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {len(tf.config.list_physical_devices('GPU')) > 0}")

TensorFlow version: 2.19.0
GPU Available: True


In [None]:
# ==============================================================================
# HYPERPARAMETERS
# ==============================================================================

# Hyperparameters for the Transformer model and the training process.
# All of these values are intended to be tuned.
BATCH_SIZE = 128
D_MODEL = 256          # Model dimension (embedding size): length of the vector that represents each token.
NUM_HEADS = 8          # Number of attention heads. Must divide D_MODEL evenly (MultiHeadAttention asserts this).
NUM_LAYERS = 2         # Number of stacked encoder layers and of stacked decoder layers (depth of the network).
D_FF = 256             # Inner dimension of the position-wise feed-forward network. NOTE: equal to D_MODEL here, so the FFN does not widen the representation.
DROPOUT_RATE = 0.1     # Dropout rate for regularization, applied to embeddings, attention, and FFN outputs.
MAX_LENGTH = 20        # Maximum sequence length. load_dataset() drops pairs whose whitespace-split length exceeds this; pad_sequences then pads token sequences to exactly this length.
EPOCHS = 10            # Number of training epochs.


In [None]:
# ==============================================================================
# DATA LOADING (Same as Exercise 1)
# ==============================================================================

def download_data():
    """Download and extract the English-German dataset unless `deu.txt` already exists.

    The archive is fetched from manythings.org into the current working
    directory, extracted there, and the zip file is removed afterwards.

    The temporary zip file is removed even when the download or the
    extraction fails, so a failed attempt does not leave a partial archive
    behind. The network request uses a timeout so a stalled connection raises
    instead of hanging the notebook indefinitely.

    Returns:
        None. Progress messages are printed to stdout.

    Raises:
        urllib.error.URLError / OSError: if the download fails or times out.
        zipfile.BadZipFile: if the downloaded archive is corrupt.
    """
    url = "http://www.manythings.org/anki/deu-eng.zip"
    filename = "deu-eng.zip"

    # Nothing to do when the extracted corpus is already present.
    if os.path.exists("deu.txt"):
        print("Dataset already exists.")
        return

    print("Downloading dataset...")
    # Add User-Agent header to avoid 406 Not Acceptable error from some servers.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    req = urllib.request.Request(url, headers=headers)

    try:
        # The timeout (seconds) bounds each blocking socket operation.
        with urllib.request.urlopen(req, timeout=60) as response, open(filename, 'wb') as out_file:
            out_file.write(response.read())

        # Extract the dataset from the downloaded zip file.
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall()
    finally:
        # Always remove the zip file, whether or not the steps above succeeded.
        if os.path.exists(filename):
            os.remove(filename)

    print("Download complete!")

def preprocess_sentence(sentence):
    """Normalize one sentence and wrap it in `<start>` / `<end>` markers.

    Steps: lowercase and trim, put a space on both sides of each of `? . ! ,`,
    collapse runs of spaces into a single space, trim again, then add the
    start/end tokens.

    Args:
        sentence: Raw sentence string.

    Returns:
        The cleaned sentence in the form "<start> ... <end>".
    """
    text = sentence.lower().strip()

    # Isolate punctuation so each mark becomes its own whitespace-separated token.
    text = re.sub(r"([?.!,])", r" \1 ", text)

    # Collapse runs of spaces. Note: the character class also contains the
    # double-quote character, so runs of `"` are replaced by a space as well.
    text = re.sub(r'[" "]+', " ", text).strip()

    return f"<start> {text} <end>"

def load_dataset(num_examples=10000):
    """Read `deu.txt`, preprocess sentence pairs, and keep those within MAX_LENGTH.

    Only the first `num_examples` lines of the file are considered. Each line
    is expected to be tab-separated with English in the first field and
    German in the second; lines with fewer than two fields are skipped.

    Args:
        num_examples: Number of leading lines of the file to consider.

    Returns:
        A zip iterator over two tuples: (english_sentences, german_sentences).
        If no pair survives filtering the iterator is empty, so unpacking it
        into two names will fail.
    """
    download_data()

    with open('deu.txt', 'r', encoding='utf-8') as corpus:
        raw_lines = corpus.read().strip().split('\n')

    kept_pairs = []
    for raw_line in raw_lines[:num_examples]:
        fields = raw_line.split('\t')  # Sentences are tab-separated.
        if len(fields) < 2:
            continue

        english = preprocess_sentence(fields[0])
        german = preprocess_sentence(fields[1])

        # Drop the pair when either side has more than MAX_LENGTH whitespace-separated tokens.
        if len(english.split()) > MAX_LENGTH or len(german.split()) > MAX_LENGTH:
            continue

        kept_pairs.append([english, german])

    print(f"Loaded {len(kept_pairs)} sentence pairs")
    # Transpose the list of pairs into (all English, all German).
    return zip(*kept_pairs)

# Load the preprocessed sentence pairs (first 15000 lines of the corpus).
input_texts, target_texts = load_dataset(num_examples=15000)
input_texts = list(input_texts) # Convert zip object to list.
target_texts = list(target_texts) # Convert zip object to list.

# Initialize tokenizers for English (input) and German (target) languages.
# oov_token='<UNK>' handles out-of-vocabulary words.
# The filter set deliberately omits '<' and '>' so the '<start>' / '<end>' markers
# added by preprocess_sentence() survive tokenization. It does include '.', ',',
# '?' and '!', so the punctuation that preprocess_sentence() spaced out is removed here.
input_tokenizer = keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
    oov_token='<UNK>'
)
target_tokenizer = keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
    oov_token='<UNK>'
)


# Fit tokenizers on the respective texts to build word indices.
input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

# Convert text sequences to integer sequences using the fitted tokenizers.
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

# Pad sequences to MAX_LENGTH. 'post' padding adds zeros at the end,
# so every row has exactly MAX_LENGTH entries and 0 marks padding.
input_sequences = keras.preprocessing.sequence.pad_sequences(
    input_sequences, maxlen=MAX_LENGTH, padding='post')
target_sequences = keras.preprocessing.sequence.pad_sequences(
    target_sequences, maxlen=MAX_LENGTH, padding='post')

# Calculate vocabulary sizes. Add 1 for the reserved 0 (padding) token.
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

print(f"\nVocabulary sizes:")
print(f"English: {input_vocab_size}")
print(f"German: {target_vocab_size}")

# Split data into training and validation sets (80/20 split).
# NOTE: the split is positional (first 80% / last 20% in file order); no shuffling
# is done here, so the validation set is the tail of the loaded examples.
split_idx = int(0.8 * len(input_sequences))
train_input = input_sequences[:split_idx]
train_target = target_sequences[:split_idx]
val_input = input_sequences[split_idx:]
val_target = target_sequences[split_idx:]

print(f"\nTraining samples: {len(train_input)}")
print(f"Validation samples: {len(val_input)}")

Dataset already exists.
Loaded 15000 sentence pairs

Vocabulary sizes:
English: 2909
German: 4715

Training samples: 12000
Validation samples: 3000


## PART 1: POSITIONAL ENCODING

In [None]:
# ==============================================================================
# PART 1: POSITIONAL ENCODING
# ==============================================================================

def get_positional_encoding(seq_len, d_model):
    """
    Create the sinusoidal positional encoding matrix.

    The Transformer has no recurrence or convolution, so position information
    is injected by adding this matrix to the token embeddings. The values are
    computed once with NumPy and returned as a constant float32 tensor.

    Formula:
    PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    Args:
        seq_len: Maximum sequence length (number of positions to encode).
        d_model: Model dimension (embedding size).
                 NOTE(review): the even/odd column assignment below presumably
                 requires an even d_model so both halves have the same width
                 -- confirm once the blanks are filled in.

    Returns:
        pos_encoding: A float32 Tensor of shape [1, seq_len, d_model] containing the positional encodings.
    """
    # Create position indices: [0, 1, 2, ..., seq_len-1].
    # np.newaxis adds a new dimension, making its shape [seq_len, 1]
    # This allows for broadcasting when multiplied with div_term.
    position = np.arange(seq_len)[:, np.newaxis]  # shape: [seq_len, 1]

    # Create dimension indices for the divisor term: [0, 2, 4, ..., d_model-2].
    # These correspond to the '2i' in the formula.
    div_term = np.arange(0, ___BLANK___, ___BLANK___)  # ___BLANK___ step size (we want even indices for 2i)

    # Calculate the scaling factor: 1 / 10000^(2i/d_model).
    # This is equivalent to exp(-2i * log(10000) / d_model).
    div_term = 1 / np.___BLANK___(10000, ___BLANK___ / d_model) # ___BLANK___

    # Initialize the positional encoding matrix with zeros.
    pos_encoding = np.zeros((seq_len, d_model))

    # Apply sine function to even indices (0, 2, 4, ...).
    # pos_encoding[:, 0::2] selects all rows and even-indexed columns.
    # `position * div_term` broadcasts [seq_len, 1] against the 1-D div_term.
    pos_encoding[:, 0::2] = np.___BLANK___(position * div_term)  # ___BLANK___

    # Apply cosine function to odd indices (1, 3, 5, ...).
    # pos_encoding[:, 1::2] selects all rows and odd-indexed columns.
    pos_encoding[:, 1::2] = np.___BLANK___(position * div_term)  # ___BLANK___

    # Add a batch dimension at the beginning, resulting in shape [1, seq_len, d_model].
    # This allows it to be broadcasted across batch samples when added to embeddings.
    pos_encoding = pos_encoding[np.newaxis, ...]  # Shape: [1, seq_len, d_model]

    # Convert the NumPy array to a float32 TensorFlow tensor.
    return tf.cast(pos_encoding, dtype=tf.float32)

## PART 2: SCALED DOT-PRODUCT ATTENTION

In [None]:
# ==============================================================================
# PART 2: SCALED DOT-PRODUCT ATTENTION
# ==============================================================================

def scaled_dot_product_attention(query, key, value, mask=None):
    """
    Calculates attention weights and applies them to values.
    This is the core mechanism of the Transformer, determining how much focus
    each part of the input sequence should receive when processing another part.

    Formula:
    Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V

    Args:
        query: Tensor of shape [batch_size, num_heads, seq_len_q, depth]. Represents what we are looking for.
        key:   Tensor of shape [batch_size, num_heads, seq_len_k, depth]. Represents what is available.
        value: Tensor of shape [batch_size, num_heads, seq_len_v, depth]. The information to extract, corresponding to keys.
               seq_len_v must equal seq_len_k for the final matmul with the attention weights.
        mask:  Tensor, optional. Used to hide future tokens (look-ahead mask) or padding tokens.
               Shape: [batch_size, 1, 1, seq_len_k] (for padding mask) or [batch_size, 1, seq_len_q, seq_len_k] (for look-ahead mask).
               Convention: a value of 1 marks a position to hide and 0 a position to keep,
               because `mask * -1e9` is added to the logits before the softmax.

    Returns:
        output: Tensor of shape [batch_size, num_heads, seq_len_q, depth]. The weighted sum of values.
        attention_weights: Tensor of shape [batch_size, num_heads, seq_len_q, seq_len_k]. The attention scores.
    """
    # Calculate the dot product of Query and Key, transposing the last two dimensions of Key.
    # This operation `Q @ K^T` computes the similarity between query and key vectors.
    # The resulting shape is [batch, heads, seq_len_q, seq_len_k].
    matmul_qk = tf.matmul(___BLANK___, ___BLANK___, transpose_b=___BLANK___)  # ___BLANK___ (True because we need K^T)

    # Scale the dot products by the square root of the depth (dimension of key vectors).
    # This scaling prevents the dot products from becoming too large, which can lead to
    # extremely small gradients during softmax, hindering effective learning.
    depth = tf.cast(tf.shape(key)[-1], tf.float32) # Get the last dimension of key (d_k)

    # Apply square root for scaling. `tf.math.sqrt` is the TensorFlow function for this.
    scaled_attention_logits = matmul_qk / tf.math.sqrt(depth)

    # Add the mask (if provided) to the scaled attention logits.
    # Masked positions (typically padding or future tokens) are set to a very large negative number.
    # When softmax is applied, these positions will become approximately zero, effectively ignoring them.
    # The mask is broadcast against the logits, so its size-1 axes expand over heads / query positions.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  # (Common choice for a large negative number)

    # Apply softmax to get attention weights.
    # Softmax is applied along the last axis (seq_len_k), ensuring that the weights
    # for each query position sum to 1 across all key positions.
    attention_weights = tf.nn.___BLANK___(scaled_attention_logits, axis=___BLANK___)  # ___BLANK___ (Apply softmax across the key sequence length dimension, -1)

    # Multiply the attention weights by the Value tensor.
    # This operation performs a weighted sum of the value vectors, where the weights
    # are the attention scores, giving more importance to relevant information.
    output = tf.matmul(attention_weights, value)

    return output, attention_weights

## PART 3: MULTI-HEAD ATTENTION

In [None]:
# ==============================================================================
# PART 3: MULTI-HEAD ATTENTION
# ==============================================================================

class MultiHeadAttention(layers.Layer):
    """
    Multi-Head Attention layer.

    Query, key and value are each passed through their own Dense projection,
    split into `num_heads` heads of size `d_model // num_heads`, attended with
    `scaled_dot_product_attention`, merged back into a single `d_model`-wide
    tensor, and passed through a final Dense projection.
    """
    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: Model dimension; must be divisible by `num_heads`.
            num_heads: Number of attention heads.
        """
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # Each head gets an equal slice of d_model, so the division must be exact.
        heads_divide_evenly = (d_model % num_heads == 0)
        assert heads_divide_evenly

        # Width of a single head: d_model = num_heads * depth.
        self.depth = d_model // num_heads

        # Input projections for Query, Key and Value.
        self.wq = layers.Dense(d_model)
        self.wk = layers.Dense(d_model)
        self.wv = layers.Dense(d_model)

        # Output projection applied after the heads are merged.
        self.dense = layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """
        Reshape `x` so that every attention head has its own axis.

        Args:
            x: Tensor of shape [batch_size, seq_len, d_model].
            batch_size: Batch size of `x`.

        Returns:
            Tensor of shape [batch_size, num_heads, seq_len, depth].
        """
        # [batch, seq_len, d_model] -> [batch, seq_len, num_heads, depth];
        # the -1 lets TensorFlow infer seq_len.
        per_head_shape = (batch_size, -1, self.num_heads, self.depth)
        per_head = tf.reshape(x, per_head_shape)

        # Swap the seq_len and num_heads axes -> [batch, num_heads, seq_len, depth].
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, query, key, value, mask=None):
        """
        Run multi-head attention.

        Args:
            query, key, value: Tensors of shape [batch_size, seq_len, d_model].
            mask: Optional attention mask forwarded to `scaled_dot_product_attention`.

        Returns:
            output: Tensor of shape [batch_size, seq_len_q, d_model].
            attention_weights: Tensor of shape [batch_size, num_heads, seq_len_q, seq_len_k].
        """
        batch_size = tf.shape(query)[0]

        # Project each input, then break the projection into heads:
        # [batch, seq_len, d_model] -> [batch, num_heads, seq_len, depth].
        q_heads = self.split_heads(self.wq(query), batch_size)
        k_heads = self.split_heads(self.wk(key), batch_size)
        v_heads = self.split_heads(self.wv(value), batch_size)

        # Attention for all heads at once.
        context, attention_weights = scaled_dot_product_attention(
            q_heads, k_heads, v_heads, mask
        )

        # Undo the head split: move heads next to depth again, then fold the
        # two axes back into d_model -> [batch, seq_len, d_model].
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (batch_size, -1, self.d_model))

        # Final learned projection over the merged heads.
        return self.dense(merged), attention_weights

## PART 4: FEED-FORWARD NETWORK

In [None]:
# ==============================================================================
# PART 4: FEED-FORWARD NETWORK
# ==============================================================================

class FeedForwardNetwork(layers.Layer):
    """
    Position-wise Feed-Forward Network.

    Two Dense layers applied to the last axis of the input, i.e. independently
    at every sequence position: FFN(x) = max(0, xW1 + b1)W2 + b2.
    Dropout is applied between the two layers.
    """
    def __init__(self, d_model, d_ff, dropout_rate=0.1):
        """
        Args:
            d_model: Output width; matches the input width so the result can
                be used in a residual connection.
            d_ff: Width of the hidden (ReLU) layer.
            dropout_rate: Dropout rate applied to the hidden activations.
        """
        super().__init__()

        # d_model -> d_ff with ReLU non-linearity.
        self.dense1 = layers.Dense(d_ff, activation='relu')
        # d_ff -> d_model, no activation.
        self.dense2 = layers.Dense(d_model)
        # Regularization on the hidden activations.
        self.dropout = layers.Dropout(dropout_rate)

    def call(self, x, training=False):
        """
        Apply the feed-forward network.

        Args:
            x: Tensor of shape [batch_size, seq_len, d_model].
            training: Whether dropout is active.

        Returns:
            Tensor of shape [batch_size, seq_len, d_model].
        """
        hidden = self.dropout(self.dense1(x), training=training)
        return self.dense2(hidden)

## PART 5: ENCODER LAYER

In [None]:
# ==============================================================================
# PART 5: ENCODER LAYER
# ==============================================================================

class EncoderLayer(layers.Layer):
    """
    Single Transformer Encoder Layer.

    Each encoder layer consists of two main sub-layers:
    1. A Multi-Head Self-Attention mechanism.
    2. A Position-wise Feed-Forward Network.

    Each sub-layer has a residual connection followed by layer normalization
    (post-norm: LayerNorm is applied to `input + sublayer_output`).
    """
    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        """
        Args:
            d_model: Model dimension, passed to the attention and feed-forward sub-layers.
            num_heads: Number of attention heads.
            d_ff: Hidden width of the feed-forward network.
            dropout_rate: Dropout rate used after each sub-layer (and inside the FFN).
        """
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads) # Multi-head self-attention sub-layer.
        self.ffn = FeedForwardNetwork(d_model, d_ff, dropout_rate) # Position-wise feed-forward network sub-layer.

        # Layer Normalization layers. `epsilon` is added for numerical stability.
        # Layer normalization normalizes across the feature dimension, rather than batch dimension.
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)  # (LayerNormalization is used in Transformers)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = layers.Dropout(dropout_rate) # Dropout for attention output.
        self.dropout2 = layers.Dropout(dropout_rate) # Dropout for FFN output.

    def call(self, x, mask, training=False):
        """
        Performs a single encoder layer operation.

        Args:
            x: Input tensor to the encoder layer, shape [batch_size, seq_len, d_model].
            mask: Padding mask for self-attention.
            training: Boolean indicating whether the model is in training mode.

        Returns:
            Tensor of shape [batch_size, seq_len, d_model]. The output of the encoder layer.
        """
        # Multi-head self-attention sub-layer.
        # Query, Key, and Value are all derived from the same input `x` in self-attention.
        # The attention weights returned by the layer are not needed here and are discarded.
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)

        # Add & Norm: Add residual connection and apply layer normalization.
        # `x + attn_output` implements the residual connection, helping with gradient flow.
        out1 = self.layernorm1(___BLANK___)  # ___BLANK___ (Add residual connection then LayerNorm)

        # Feed-forward network sub-layer.
        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)

        # Add & Norm: Add another residual connection and apply layer normalization.
        # `out1 + ffn_output` is the second residual connection.
        out2 = self.layernorm2(___BLANK___)  # ___BLANK___ (Add residual connection then LayerNorm)

        return out2

## PART 6: DECODER LAYER

In [None]:
# ==============================================================================
# PART 6: DECODER LAYER
# ==============================================================================

class DecoderLayer(layers.Layer):
    """
    Single Transformer Decoder Layer.

    Three sub-layers, each wrapped as LayerNorm(input + Dropout(sublayer(input))):
    1. Masked multi-head self-attention over the target sequence.
    2. Multi-head cross-attention: queries from the decoder, keys/values from
       the encoder output.
    3. Position-wise feed-forward network.
    """
    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        """
        Args:
            d_model: Model dimension.
            num_heads: Number of attention heads in both attention sub-layers.
            d_ff: Hidden width of the feed-forward network.
            dropout_rate: Dropout rate used after each sub-layer (and inside the FFN).
        """
        super().__init__()

        # Sub-layers.
        self.mha1 = MultiHeadAttention(d_model, num_heads)  # Masked self-attention.
        self.mha2 = MultiHeadAttention(d_model, num_heads)  # Encoder-decoder cross-attention.
        self.ffn = FeedForwardNetwork(d_model, d_ff, dropout_rate)

        # One LayerNorm per sub-layer.
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)

        # One Dropout per sub-layer output.
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
        self.dropout3 = layers.Dropout(dropout_rate)

    def call(self, x, encoder_output, look_ahead_mask, padding_mask, training=False):
        """
        Run one decoder layer.

        Args:
            x: Decoder-side input, shape [batch_size, target_seq_len, d_model].
            encoder_output: Encoder output, shape [batch_size, input_seq_len, d_model].
            look_ahead_mask: Mask for the self-attention sub-layer (hides future target tokens).
            padding_mask: Mask for the cross-attention sub-layer (hides source padding).
            training: Whether dropout is active.

        Returns:
            Tensor of shape [batch_size, target_seq_len, d_model].
        """
        # 1) Masked self-attention + Add & Norm. Attention weights are discarded.
        self_attn, _ = self.mha1(x, x, x, look_ahead_mask)
        out1 = self.layernorm1(x + self.dropout1(self_attn, training=training))

        # 2) Cross-attention + Add & Norm: query = out1, key = value = encoder_output.
        cross_attn, _ = self.mha2(out1, encoder_output, encoder_output, padding_mask)
        out2 = self.layernorm2(out1 + self.dropout2(cross_attn, training=training))

        # 3) Feed-forward + Add & Norm.
        ffn_output = self.dropout3(self.ffn(out2, training=training), training=training)
        return self.layernorm3(out2 + ffn_output)

## PART 7: ENCODER AND DECODER

In [None]:
# ==============================================================================
# PART 7: ENCODER AND DECODER
# ==============================================================================

class Encoder(layers.Layer):
    """Transformer Encoder: token embedding + positional encoding + `num_layers` EncoderLayer blocks."""
    def __init__(self, num_layers, d_model, num_heads, d_ff, vocab_size,
                 max_len, dropout_rate=0.1):
        """
        Args:
            num_layers: Number of stacked EncoderLayer instances.
            d_model: Model dimension (embedding size).
            num_heads: Number of attention heads per layer.
            d_ff: Hidden width of each layer's feed-forward network.
            vocab_size: Size of the source vocabulary (embedding rows).
            max_len: Number of positions covered by the positional encoding.
            dropout_rate: Dropout rate for the embeddings and for each layer.
        """
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # Token embeddings and the fixed positional encoding table.
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = get_positional_encoding(max_len, d_model)

        # The stack of encoder layers.
        layer_stack = [
            EncoderLayer(d_model, num_heads, d_ff, dropout_rate)
            for _ in range(num_layers)
        ]
        self.enc_layers = layer_stack

        # Dropout applied to (embedding + positional encoding).
        self.dropout = layers.Dropout(dropout_rate)

    def call(self, x, mask, training=False):
        """
        Forward pass through the encoder.

        Args:
            x: Source token IDs, shape [batch_size, seq_len].
            mask: Padding mask passed to every encoder layer.
            training: Whether dropout is active.

        Returns:
            Tensor of shape [batch_size, seq_len, d_model].
        """
        seq_len = tf.shape(x)[1]

        # Embed the tokens and scale by sqrt(d_model), as in the Transformer
        # paper, so the positional encoding does not dominate the embeddings.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        hidden = self.embedding(x) * scale

        # Add the positional encoding for the first seq_len positions.
        hidden = hidden + self.pos_encoding[:, :seq_len, :]
        hidden = self.dropout(hidden, training=training)

        # Run the stack of encoder layers in order.
        for layer in self.enc_layers:
            hidden = layer(hidden, mask, training=training)

        return hidden

class Decoder(layers.Layer):
    """Transformer Decoder: token embedding + positional encoding + `num_layers` DecoderLayer blocks."""
    def __init__(self, num_layers, d_model, num_heads, d_ff, vocab_size,
                 max_len, dropout_rate=0.1):
        """
        Args:
            num_layers: Number of stacked DecoderLayer instances.
            d_model: Model dimension (embedding size).
            num_heads: Number of attention heads per layer.
            d_ff: Hidden width of each layer's feed-forward network.
            vocab_size: Size of the target vocabulary (embedding rows).
            max_len: Number of positions covered by the positional encoding.
            dropout_rate: Dropout rate for the embeddings and for each layer.
        """
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # Token embeddings and the fixed positional encoding table.
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = get_positional_encoding(max_len, d_model)

        # The stack of decoder layers.
        layer_stack = [
            DecoderLayer(d_model, num_heads, d_ff, dropout_rate)
            for _ in range(num_layers)
        ]
        self.dec_layers = layer_stack

        # Dropout applied to (embedding + positional encoding).
        self.dropout = layers.Dropout(dropout_rate)

    def call(self, x, encoder_output, look_ahead_mask, padding_mask, training=False):
        """
        Forward pass through the decoder.

        Args:
            x: Target token IDs, shape [batch_size, target_seq_len]. They are
               embedded inside this method.
            encoder_output: Encoder output, shape [batch_size, input_seq_len, d_model].
            look_ahead_mask: Mask for decoder self-attention (hides future target tokens).
            padding_mask: Mask for cross-attention (hides padding in the encoder output).
            training: Whether dropout is active.

        Returns:
            Tensor of shape [batch_size, target_seq_len, d_model].
        """
        seq_len = tf.shape(x)[1]

        # Embed the tokens and scale by sqrt(d_model).
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        hidden = self.embedding(x) * scale

        # Add the positional encoding for the first seq_len positions.
        hidden = hidden + self.pos_encoding[:, :seq_len, :]
        hidden = self.dropout(hidden, training=training)

        # Run the stack of decoder layers in order.
        for layer in self.dec_layers:
            hidden = layer(hidden, encoder_output, look_ahead_mask, padding_mask, training=training)

        return hidden

## PART 8: TRANSFORMER MODEL

In [None]:
# ==============================================================================
# PART 8: TRANSFORMER MODEL
# ==============================================================================

class Transformer(keras.Model):
    """Encoder-decoder Transformer that turns source/target token IDs into target-vocabulary logits."""

    def __init__(self, num_layers, d_model, num_heads, d_ff,
                 input_vocab_size, target_vocab_size, max_len, dropout_rate=0.1):
        super().__init__()

        # Source-side stack.
        self.encoder = Encoder(
            num_layers, d_model, num_heads, d_ff,
            input_vocab_size, max_len, dropout_rate,
        )

        # Target-side stack.
        self.decoder = Decoder(
            num_layers, d_model, num_heads, d_ff,
            target_vocab_size, max_len, dropout_rate,
        )

        # Projects each decoder output vector onto the target vocabulary (raw logits, no softmax).
        self.final_layer = layers.Dense(target_vocab_size)

    def call(self, inputs, training=False):
        """
        Runs encoder, decoder and the output projection.

        Args:
            inputs: A pair (encoder_input, decoder_input).
                    encoder_input: Source sequence token IDs.
                    decoder_input: Target sequence token IDs (shifted right).
            training: Boolean flag forwarded to the encoder and the decoder.

        Returns:
            Logits with the target vocabulary on the last axis, one row per
            target position: [batch_size, target_seq_len, target_vocab_size].
        """
        source_ids, target_ids = inputs

        # One source padding mask serves both the encoder self-attention and
        # the decoder cross-attention, since both attend over source positions.
        source_pad_mask = self.create_padding_mask(source_ids)

        # Decoder self-attention must hide both padded and future target positions;
        # the element-wise maximum masks a position if either mask flags it.
        target_pad_mask = self.create_padding_mask(target_ids)
        causal_mask = self.create_look_ahead_mask(tf.shape(target_ids)[1])
        decoder_self_mask = tf.maximum(target_pad_mask, causal_mask)

        memory = self.encoder(source_ids, source_pad_mask, training=training)
        decoded = self.decoder(
            target_ids, memory, decoder_self_mask, source_pad_mask, training=training
        )

        return self.final_layer(decoded)

    def create_padding_mask(self, seq):
        """
        Builds a mask that flags padding positions (token ID 0).

        Args:
            seq: Token ID tensor, e.g. [batch_size, seq_len].

        Returns:
            A float32 tensor of shape [batch_size, 1, 1, seq_len] holding 1.0
            where `seq` is 0 and 0.0 elsewhere.
        """
        is_padding = tf.math.equal(seq, 0)
        pad_flags = tf.cast(is_padding, tf.float32)

        # Two singleton axes are inserted so the mask can broadcast against attention logits.
        return pad_flags[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, size):
        """
        Builds a mask that flags future positions for causal self-attention.

        Args:
            size: Sequence length of the square mask.

        Returns:
            A float32 tensor of shape [size, size] with 1s strictly above the
            diagonal (future positions) and 0s on and below it.
        """
        # band_part(..., -1, 0) keeps the lower triangle including the diagonal;
        # its complement is exactly the set of future positions.
        lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return 1 - lower_triangle

## PART 9: TRAINING SETUP

In [None]:
# ==============================================================================
# PART 9: TRAINING SETUP
# ==============================================================================

# Builds the model, optimizer and loss as notebook-level globals
# (`transformer`, `optimizer`, `loss_fn`) that the training, evaluation and
# translation code below reads directly.
print("\n" + "="*70)
print("BUILDING TRANSFORMER MODEL")
print("="*70)

# Instantiate the Transformer model with defined hyperparameters.
# `max_len` bounds the positional-encoding table, so sequences fed to the model
# must not be longer than MAX_LENGTH.
transformer = Transformer(
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    d_ff=D_FF,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    max_len=MAX_LENGTH,
    dropout_rate=DROPOUT_RATE
)

print("✓ Transformer model created")

# Define the learning rate.
# The original Transformer paper uses a more complex learning rate schedule with warmup,
# but for simplicity, a fixed learning rate is used here. A lower rate is often chosen for stability.
# Exercise placeholder: this cell raises NameError until the blank is replaced with a number.
learning_rate = ___BLANK___  # ___BLANK___ (A fixed learning rate, typically 0.001 is used)

# Adam optimizer with parameters from the Transformer paper.
optimizer = keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# Loss function: SparseCategoricalCrossentropy is suitable for integer labels.
# `from_logits=True` means the model outputs raw logits (before softmax).
# NOTE(review): the default reduction is used here. When a padding mask is passed
# as `sample_weight`, padded positions presumably still count in the averaging
# denominator, so the reported loss would be lower than the true per-token
# cross-entropy -- confirm against the Keras loss-reduction docs.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

def accuracy_function(real, pred):
    """
    Computes token-level accuracy over the non-padding positions of a batch.

    Args:
        real: Integer tensor/array of ground-truth token IDs. ID 0 is treated as
              padding and excluded from the metric. Any integer dtype is accepted.
        pred: Logits whose last axis indexes the vocabulary; the argmax over that
              axis is taken as the predicted token ID.

    Returns:
        Scalar float32 tensor: the fraction of non-padding positions where the
        predicted ID equals the real ID. Returns 0.0 (rather than NaN) when
        `real` contains only padding.
    """
    # Normalise the label dtype so the comparison below cannot fail on int64 labels.
    real = tf.cast(real, tf.int32)

    # Predicted token ID at every position.
    predictions = tf.cast(tf.argmax(pred, axis=-1), dtype=tf.int32)

    # 1.0 where the position holds a real token, 0.0 where it is padding.
    mask = tf.cast(tf.math.not_equal(real, 0), dtype=tf.float32)

    # 1.0 where the prediction is correct; padding positions are zeroed out.
    accuracies = tf.cast(tf.equal(predictions, real), dtype=tf.float32) * mask

    # divide_no_nan yields 0 instead of NaN when there are no real tokens at all.
    return tf.math.divide_no_nan(tf.reduce_sum(accuracies), tf.reduce_sum(mask))

## PART 10: TRAINING

In [None]:
# ==============================================================================
# PART 10: TRAINING
# ==============================================================================

def train_step(inp, tar):
    """
    Runs one optimisation step on a single batch.

    Uses the notebook-level `transformer`, `loss_fn`, `optimizer` and
    `accuracy_function`.

    Args:
        inp: Batch of source token IDs.
        tar: Batch of target token IDs; axis 1 is the sequence axis.

    Returns:
        (loss, accuracy) for this batch, as returned by `loss_fn` and
        `accuracy_function`.
    """
    # Teacher forcing: the decoder sees every target token except the last and
    # must predict the sequence shifted left by one.
    # e.g. tar = [<start>, word1, word2, <end>]
    #      decoder input = [<start>, word1, word2]
    #      expected      = [word1, word2, <end>]
    decoder_in = tar[:, :-1]
    expected = tar[:, 1:]

    # True for real tokens, False for padding (ID 0); used as per-position loss weights.
    non_padding = tf.math.not_equal(expected, 0)

    with tf.GradientTape() as tape:
        logits = transformer([inp, decoder_in], training=True)
        loss = loss_fn(expected, logits, sample_weight=non_padding)

    # The metric needs no gradients, so it is computed outside the tape.
    accuracy = accuracy_function(expected, logits)

    # Back-propagate and update every trainable weight of the model.
    weights = transformer.trainable_variables
    grads = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    return loss, accuracy

def evaluate_model(input_data, target_data, batch_size=None):
    """
    Evaluates the model on a dataset and returns its average loss and accuracy.

    Every example is evaluated, including a trailing batch smaller than
    `batch_size`. Per-batch metrics are weighted by the number of examples in
    the batch, so a short final batch does not skew the average. When the
    dataset size is a multiple of `batch_size` this equals the plain mean of
    the per-batch metrics.

    Uses the notebook-level `transformer`, `loss_fn` and `accuracy_function`.

    Args:
        input_data: Source token IDs, indexable by slice along axis 0.
        target_data: Target token IDs aligned with `input_data`; axis 1 is the
                     sequence axis.
        batch_size: Examples per forward pass. Defaults to the notebook-level
                    BATCH_SIZE when omitted.

    Returns:
        (average_loss, average_accuracy) over the whole dataset.

    Raises:
        ValueError: If `input_data` is empty.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    num_examples = len(input_data)
    if num_examples == 0:
        raise ValueError("evaluate_model needs at least one example to evaluate.")

    total_loss = 0
    total_accuracy = 0

    for start_idx in range(0, num_examples, batch_size):
        end_idx = start_idx + batch_size

        # Slicing past the end is safe: the last batch is simply shorter.
        inp = input_data[start_idx:end_idx]
        tar = target_data[start_idx:end_idx]
        examples_in_batch = len(inp)

        # Same shift as in training: feed all but the last token, predict all but the first.
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]

        # Forward pass only; dropout is disabled and no weights are updated.
        predictions = transformer([inp, tar_inp], training=False)

        # Padding positions (ID 0) get zero weight in the loss.
        mask = tf.math.not_equal(tar_real, 0)
        loss = loss_fn(tar_real, predictions, sample_weight=mask)
        accuracy = accuracy_function(tar_real, predictions)

        # Weight by batch size so a short final batch contributes proportionally.
        total_loss += loss * examples_in_batch
        total_accuracy += accuracy * examples_in_batch

    return total_loss / num_examples, total_accuracy / num_examples

print("\n" + "="*70)
print("TRAINING TRANSFORMER")
print("="*70)

# The two "best" metrics are tracked independently, so each value reported at
# the end is the true optimum over all epochs (they may come from different epochs).
best_val_loss = float('inf') # Lowest validation loss seen so far.
best_val_accuracy = 0.0 # Highest validation accuracy seen so far.

# Main training loop.
for epoch in tqdm(range(EPOCHS)):
    # Shuffle training data at the beginning of each epoch to ensure batches are diverse.
    indices = np.random.permutation(len(train_input))
    train_input_shuffled = train_input[indices]
    train_target_shuffled = train_target[indices]

    # Training phase for the current epoch.
    # Only full batches are used; a trailing partial batch is skipped for this epoch.
    num_batches = len(train_input) // BATCH_SIZE
    total_train_loss = 0
    total_train_accuracy = 0

    for i in tqdm(range(num_batches)):
        start_idx = i * BATCH_SIZE
        end_idx = start_idx + BATCH_SIZE

        # Get a batch of shuffled training data.
        inp = train_input_shuffled[start_idx:end_idx]
        tar = train_target_shuffled[start_idx:end_idx]

        # Perform one training step.
        loss, accuracy = train_step(inp, tar)
        total_train_loss += loss
        total_train_accuracy += accuracy

        # Print progress periodically.
        if (i + 1) % 50 == 0:
            print(f'  Batch {i+1}/{num_batches} - Loss: {loss:.4f} - Accuracy: {accuracy:.4f}')

    # Calculate average training loss and accuracy for the epoch.
    train_loss = total_train_loss / num_batches
    train_accuracy = total_train_accuracy / num_batches

    # Evaluate the model on the validation set.
    val_loss, val_accuracy = evaluate_model(val_input, val_target)

    print(f'\nEpoch {epoch+1}/{EPOCHS}')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')
    print(f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

    # Record the lowest validation loss on its own. Validation loss can start
    # rising while accuracy still improves, so tying it to the best-accuracy
    # epoch would report a value that is not actually the best loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss

    # Validation accuracy is the criterion for the "best model".
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        print("*** New best model! (based on validation accuracy) ***") # In a real scenario, you'd save model weights here.

print("\n" + "="*70)
print("RESULTS")
print("="*70)
print(f"Best Validation Loss: {best_val_loss:.4f}")
print(f"Best Validation Accuracy: {best_val_accuracy:.4f}")


BUILDING TRANSFORMER MODEL
✓ Transformer model created

TRAINING TRANSFORMER


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.9991 - Accuracy: 0.3671

Epoch 1/10
Train Loss: 1.0658, Train Accuracy: 0.3593
Val Loss: 1.0357, Val Accuracy: 0.4302
*** New best model! (based on validation accuracy) ***


  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.7392 - Accuracy: 0.5108

Epoch 2/10
Train Loss: 0.7267, Train Accuracy: 0.5155
Val Loss: 0.8673, Val Accuracy: 0.4984
*** New best model! (based on validation accuracy) ***


  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.5821 - Accuracy: 0.5659

Epoch 3/10
Train Loss: 0.5632, Train Accuracy: 0.5893
Val Loss: 0.7646, Val Accuracy: 0.5543
*** New best model! (based on validation accuracy) ***


  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.4320 - Accuracy: 0.6515

Epoch 4/10
Train Loss: 0.4339, Train Accuracy: 0.6562
Val Loss: 0.7070, Val Accuracy: 0.5945
*** New best model! (based on validation accuracy) ***


  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.3170 - Accuracy: 0.7245

Epoch 5/10
Train Loss: 0.3391, Train Accuracy: 0.7108
Val Loss: 0.6805, Val Accuracy: 0.6168
*** New best model! (based on validation accuracy) ***


  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.2676 - Accuracy: 0.7679

Epoch 6/10
Train Loss: 0.2723, Train Accuracy: 0.7539
Val Loss: 0.6490, Val Accuracy: 0.6290
*** New best model! (based on validation accuracy) ***


  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.2195 - Accuracy: 0.7798

Epoch 7/10
Train Loss: 0.2227, Train Accuracy: 0.7819
Val Loss: 0.6339, Val Accuracy: 0.6373
*** New best model! (based on validation accuracy) ***


  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.1851 - Accuracy: 0.7971

Epoch 8/10
Train Loss: 0.1870, Train Accuracy: 0.8016
Val Loss: 0.6506, Val Accuracy: 0.6409
*** New best model! (based on validation accuracy) ***


  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.1530 - Accuracy: 0.8207

Epoch 9/10
Train Loss: 0.1600, Train Accuracy: 0.8206
Val Loss: 0.6656, Val Accuracy: 0.6449
*** New best model! (based on validation accuracy) ***


  0%|          | 0/93 [00:00<?, ?it/s]

  Batch 50/93 - Loss: 0.1357 - Accuracy: 0.8369

Epoch 10/10
Train Loss: 0.1403, Train Accuracy: 0.8337
Val Loss: 0.6684, Val Accuracy: 0.6501
*** New best model! (based on validation accuracy) ***

RESULTS
Best Validation Loss: 0.6684
Best Validation Accuracy: 0.6501


## PART 11: TRANSLATION

In [None]:
# ==============================================================================
# PART 11: TRANSLATION
# ==============================================================================

def translate_transformer(sentence):
    """
    Translates one sentence with the trained Transformer using greedy decoding.

    The sentence is preprocessed, tokenized and padded to MAX_LENGTH. The
    decoder then starts from `<start>` and repeatedly appends the
    highest-scoring next token until `<end>` is predicted or MAX_LENGTH tokens
    have been generated.

    Uses the notebook-level `transformer`, `input_tokenizer`,
    `target_tokenizer`, `preprocess_sentence` and `MAX_LENGTH`.

    Args:
        sentence: Raw source-language sentence.

    Returns:
        The generated target words joined by single spaces. Token IDs with no
        entry in the target vocabulary are omitted.
    """
    cleaned = preprocess_sentence(sentence)

    # Source side: a batch of one, padded at the end to the fixed model length.
    source_ids = input_tokenizer.texts_to_sequences([cleaned])
    source_ids = keras.preprocessing.sequence.pad_sequences(
        source_ids, maxlen=MAX_LENGTH, padding='post')
    encoder_input = tf.convert_to_tensor(source_ids)

    # Special token IDs are looked up once, before decoding starts.
    start_id = target_tokenizer.word_index['<start>']
    end_id = target_tokenizer.word_index.get('<end>', 0)

    # Target side starts as a [1, 1] tensor holding only <start>.
    output = tf.expand_dims([start_id], 0)

    for _ in range(MAX_LENGTH):
        logits = transformer([encoder_input, output], training=False)

        # Only the last position predicts the next token; keep the axis so the
        # result stays [1, 1] and can be concatenated directly.
        next_id = tf.argmax(logits[:, -1:, :], axis=-1)

        if next_id == end_id:
            break

        # argmax yields int64 while `output` is int32, hence the cast.
        output = tf.concat([output, tf.cast(next_id, tf.int32)], axis=-1)

    # Drop the leading <start> token, then map the remaining IDs back to words.
    generated_ids = output.numpy()[0][1:]
    words = []
    for token_id in generated_ids:
        word = target_tokenizer.index_word.get(token_id, '')
        if word and word != '<end>':
            words.append(word)

    return ' '.join(words)

# Qualitative check: translate a few short sentences and print each model
# output next to a hand-written reference translation.
print("\n" + "="*70)
print("TRANSLATION EXAMPLES")
print("="*70)

# Updated test cases with ground truth for better comparison
# Each entry pairs an English input with a reference German translation.
# The references are for visual comparison only; nothing is scored automatically.
test_cases = [
    {"english": "I am a student.", "german_ground_truth": "ich bin ein student"},
    {"english": "How are you?", "german_ground_truth": "wie geht es dir"},
    {"english": "Good morning.", "german_ground_truth": "guten morgen"},
    {"english": "Thank you.", "german_ground_truth": "danke"},
    {"english": "Where is the station?", "german_ground_truth": "wo ist der bahnhof"},
    {"english": "He is running.", "german_ground_truth": "er rennt"},
    {"english": "I love you.", "german_ground_truth": "ich liebe dich"},
]

# Translate every example with the trained model and print input, reference and prediction.
for case in test_cases:
    english_sentence = case["english"]
    ground_truth = case["german_ground_truth"]
    model_prediction = translate_transformer(english_sentence)
    print(f"\nEnglish: {english_sentence}")
    print(f"Ground Truth:   {ground_truth}")
    print(f"Model Prediction: {model_prediction}")

# Closing banner for the notebook run.
print("\n" + "="*70)
print("TRAINING COMPLETE!")
print("="*70)


TRANSLATION EXAMPLES

English: I am a student.
Ground Truth:   ich bin ein student
Model Prediction: ich bin student

English: How are you?
Ground Truth:   wie geht es dir
Model Prediction: wie geht es dir

English: Good morning.
Ground Truth:   guten morgen
Model Prediction: gut macht es sich

English: Thank you.
Ground Truth:   danke
Model Prediction: danke

English: Where is the station?
Ground Truth:   wo ist der bahnhof
Model Prediction: wo ist die datei

English: He is running.
Ground Truth:   er rennt
Model Prediction: er rennt

English: I love you.
Ground Truth:   ich liebe dich
Model Prediction: ich liebe dich

TRAINING COMPLETE!
