<a href="https://colab.research.google.com/github/shahzadahmad3/Natural-Language-Processing/blob/main/Transformer_Arch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [136]:
#Import necessary libraries
import tensorflow as tf
import numpy as np

Since Transformers don't process input sequentially, **positional encodings** are added to provide information about the position of each word in the sequence.

Input = Embeddings + Positional Encoding


In [142]:
class TransformerEncoderBlock(tf.keras.layers.Layer):
    """
    Represents a single encoder block in the Transformer architecture.

    The block runs multi-head self-attention followed by a two-layer
    feed-forward network. Each sub-layer output is added back to its input
    (residual connection) and passed through layer normalization.

    Args:
        embed_dim: Size of the last axis of the block's output; also passed
            as ``key_dim`` to the attention layer.
        num_heads: Number of attention heads.
        dff: Number of units in the hidden (ReLU) layer of the feed-forward
            network.
        seq_length: Number of positions in the optional positional-encoding
            table. Only used when ``add_pos_encoding`` is True.
        add_pos_encoding: When True, positional encodings are added to the
            block input before attention. Defaults to False: positional
            information is meant to be injected once, on the embeddings that
            feed the first block. Adding it inside the block re-injects the
            same encoding at every depth when blocks are stacked.
    """
    def __init__(self, embed_dim, num_heads, dff, seq_length, *, add_pos_encoding=False):
        super(TransformerEncoderBlock, self).__init__()

        # None means "inputs already carry positional information".
        self.pos_encoding = (
            PositionalEncoding(seq_length, embed_dim) if add_pos_encoding else None
        )
        self.multi_head_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(embed_dim)
        ])

        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        """
        Processes the input through the encoder block.

        Args:
            inputs: Tensor whose last axis should have size ``embed_dim`` so
                the residual addition with the feed-forward output lines up
                (presumably shaped (batch, seq, embed_dim) - confirm with
                the caller).

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        x = inputs
        if self.pos_encoding is not None:
            x = self.pos_encoding(x)  # Opt-in positional encoding

        # Multi-Head Self-Attention: query and value are both x.
        attn_output = self.multi_head_attention(x, x)
        x = self.layer_norm1(x + attn_output)  # Add Residual Connection & LayerNorm

        # Feed-Forward Network
        ffn_output = self.ffn(x)
        x = self.layer_norm2(x + ffn_output)  # Add Residual Connection & LayerNorm

        return x

In [143]:
class PositionalEncoding(tf.keras.layers.Layer):
    """
    Creates positional encodings to inject sequence order information into the model.

    The sinusoidal table is computed once at construction time and stored as
    a constant float32 tensor of shape (seq_length, d_model); the layer has
    no trainable weights.

    Args:
        seq_length: Number of positions (rows) in the table.
        d_model: Number of encoding dimensions (columns) in the table.
    """
    def __init__(self, seq_length, d_model):
        super(PositionalEncoding, self).__init__()
        self.pos_encoding = self.create_positional_encoding(seq_length, d_model)

    def create_positional_encoding(self, seq_length, d_model):
        """
        Calculates positional encodings using sine and cosine functions.

        Column ``i`` of row ``pos`` holds ``sin(pos / 10000 ** (2 * (i // 2) / d_model))``
        when ``i`` is even and the cosine of the same angle when ``i`` is odd.

        Returns:
            A float32 tensor of shape (seq_length, d_model).
        """
        # Broadcast a (seq_length, 1) column of positions against a
        # (1, d_model) row of per-dimension divisors instead of filling the
        # table one element at a time in Python.
        positions = np.arange(seq_length, dtype=np.float64)[:, np.newaxis]
        dims = np.arange(d_model)[np.newaxis, :]
        divisors = np.power(10000, (2 * (dims // 2)) / d_model)
        angles = positions / divisors

        pos_encoding = np.zeros((seq_length, d_model))
        pos_encoding[:, 0::2] = np.sin(angles[:, 0::2])  # Even indices
        pos_encoding[:, 1::2] = np.cos(angles[:, 1::2])  # Odd indices

        # Convert to TensorFlow tensor
        return tf.convert_to_tensor(pos_encoding, dtype=tf.float32)

    def call(self, x):
        """
        Adds positional encodings to the input.

        Args:
            x: Tensor whose axis 1 indexes positions and whose last axis has
                size ``d_model``. Inputs with more than ``seq_length``
                positions are not supported: the slice below yields at most
                ``seq_length`` rows, so the addition cannot broadcast.

        Returns:
            ``x`` plus the first ``tf.shape(x)[1]`` rows of the table, in the
            dtype of ``x``.
        """
        encoding = self.pos_encoding[:tf.shape(x)[1], :]
        # The table is float32; cast so the addition also works when the
        # input arrives in another float dtype (e.g. float16).
        return x + tf.cast(encoding, x.dtype)

In [144]:
class TransformerDecoderBlock(tf.keras.layers.Layer):
    """
    Represents a single decoder block in the Transformer architecture.

    The block runs causally masked self-attention, cross-attention over the
    encoder outputs, and a two-layer feed-forward network. Each sub-layer
    output is added back to its input (residual connection) and passed
    through layer normalization.

    Args:
        embed_dim: Size of the last axis of the block's output; also passed
            as ``key_dim`` to both attention layers.
        num_heads: Number of attention heads.
        dff: Number of units in the hidden (ReLU) layer of the feed-forward
            network.
        seq_length: Number of positions in the optional positional-encoding
            table. Only used when ``add_pos_encoding`` is True.
        add_pos_encoding: When True, positional encodings are added to the
            block input before attention. Defaults to False: positional
            information is meant to be injected once, on the embeddings that
            feed the first block. Adding it inside the block re-injects the
            same encoding at every depth when blocks are stacked.
    """
    def __init__(self, embed_dim, num_heads, dff, seq_length, *, add_pos_encoding=False):
        super(TransformerDecoderBlock, self).__init__()

        # None means "inputs already carry positional information".
        self.pos_encoding = (
            PositionalEncoding(seq_length, embed_dim) if add_pos_encoding else None
        )
        self.masked_multi_head_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.cross_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(embed_dim)
        ])

        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, encoder_outputs):
        """
        Processes the input through the decoder block.

        Args:
            inputs: Decoder-side tensor whose axis 1 indexes target positions
                and whose last axis should have size ``embed_dim`` so the
                residual additions line up (presumably (batch, target_len,
                embed_dim) - confirm with the caller).
            encoder_outputs: Tensor attended to by the cross-attention layer
                (used as its value, and therefore also as its key).

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        x = inputs
        if self.pos_encoding is not None:
            x = self.pos_encoding(x)  # Opt-in positional encoding

        # Masked Multi-Head Self-Attention
        attn_mask = self.create_causal_mask(tf.shape(x)[1])
        attn_output = self.masked_multi_head_attention(x, x, attention_mask=attn_mask)
        x = self.layer_norm1(x + attn_output)  # Add Residual Connection & LayerNorm

        # Cross-Attention (Encoder-Decoder Attention): queries come from the
        # decoder state, values/keys from the encoder outputs.
        cross_attn_output = self.cross_attention(x, encoder_outputs)
        x = self.layer_norm2(x + cross_attn_output)  # Add Residual Connection & LayerNorm

        # Feed-Forward Network
        ffn_output = self.ffn(x)
        x = self.layer_norm3(x + ffn_output)  # Add Residual Connection & LayerNorm

        return x

    def create_causal_mask(self, seq_length):
        """
        Creates a causal mask to prevent the decoder from attending to future tokens.

        Args:
            seq_length: Number of positions; may be a Python int or a scalar
                integer tensor.

        Returns:
            A float32 (seq_length, seq_length) lower-triangular matrix of
            ones: entry [i, j] is 1 when j <= i and 0 otherwise.
        """
        ones = tf.ones((seq_length, seq_length), dtype=tf.float32)
        # num_lower=-1 keeps every sub-diagonal, num_upper=0 drops everything
        # above the main diagonal.
        return tf.linalg.band_part(ones, -1, 0)

In [145]:
class Transformer(tf.keras.Model):
    """
    Encoder-decoder Transformer built from stacked encoder and decoder blocks.

    A single embedding layer and a single positional-encoding layer are used
    for both the source and the target token ids. The output is produced by a
    final dense projection to ``vocab_size`` units with no activation.
    """
    def __init__(self, embed_dim, num_heads, dff, num_layers, seq_length, vocab_size):
        super().__init__()

        # Token embedding and positional encoding used for inputs and targets.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(seq_length, embed_dim)

        # Every block in both stacks is constructed with the same arguments.
        block_args = (embed_dim, num_heads, dff, seq_length)
        self.encoder_stack = [
            TransformerEncoderBlock(*block_args) for _ in range(num_layers)
        ]
        self.decoder_stack = [
            TransformerDecoderBlock(*block_args) for _ in range(num_layers)
        ]

        # Projects decoder features to one score per vocabulary entry.
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def _embed_with_positions(self, token_ids):
        """
        Embeds token ids and adds the positional encoding.
        """
        return self.pos_encoding(self.embedding(token_ids))

    def call(self, inputs, targets):
        """
        Runs the forward pass and returns the final dense layer's output.

        ``inputs`` are the token ids fed to the encoder stack; ``targets`` are
        the token ids fed to the decoder stack.
        """
        # Embed both sequences up front.
        source = self._embed_with_positions(inputs)
        target = self._embed_with_positions(targets)

        # Encoder: thread the source representation through every block.
        memory = source
        for enc_block in self.encoder_stack:
            memory = enc_block(memory)

        # Decoder: each block sees the running target state and the final
        # encoder output.
        decoded = target
        for dec_block in self.decoder_stack:
            decoded = dec_block(decoded, memory)

        # Vocabulary projection.
        return self.final_layer(decoded)

In [146]:
# Example usage: build a small Transformer and run one forward pass on random token ids.
embed_dim, num_heads, dff = 64, 8, 256
num_layers, seq_length = 4, 10
vocab_size = 5000  # Example vocabulary size

transformer_model = Transformer(
    embed_dim=embed_dim,
    num_heads=num_heads,
    dff=dff,
    num_layers=num_layers,
    seq_length=seq_length,
    vocab_size=vocab_size,
)

# Two dummy batches of token ids (Batch size = 2, Sequence length = 10),
# drawn in order: encoder input first, then decoder target.
dummy_input, dummy_target = (
    tf.random.uniform((2, seq_length), minval=0, maxval=vocab_size, dtype=tf.int32)
    for _ in range(2)
)

# The model takes exactly two arguments: inputs and targets.
output = transformer_model(dummy_input, dummy_target)

# Expected: (Batch size, Sequence length, vocab_size)
print("Output shape:", output.shape)

Output shape: (2, 10, 5000)
