## --------------------------------Transformer Block(GPT2)---------------------------------------------

![image.png](attachment:7e522913-9f9d-4a12-92c4-99c7b32982d5.png)

## 1. Components of the transformer block

In [26]:
# Defining the GPT configuration settings

# Hyper-parameters for the 124M GPT configuration, consumed by the blocks below.
GPT_Config_124M = dict(
    vocab_size=50257,      # not read by any block shown in this section
    context_length=1024,   # side length of the causal mask built in the attention layer
    emb_dim=768,           # width of the token representation (LayerNorm / Linear sizes)
    n_heads=12,            # number of attention heads; must divide emb_dim
    n_layers=12,           # not read by any block shown in this section
    dropout_rate=0.0,      # probability handed to every nn.Dropout
    qkv_bias=False,        # whether the key/query/value projections carry a bias term
)

In [31]:
import torch 
from torch import nn

# 1. Activation function
class GELUActivation(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise. The module has no parameters and no state.
    """

    def forward(self, x):
        """Apply the approximate GELU element-wise.

        Args:
            x: Input tensor of any shape.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # sqrt(2 / pi) is a plain Python float: no tensor is allocated per call
        # and the result stays on the device and dtype of ``x``.
        scale = (2.0 / torch.pi) ** 0.5
        # The cubic coefficient is 0.044715 (not 0.44715).
        inner = scale * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
        
# 2. Layer normalization
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Each vector along the last axis is normalized to zero mean and unit
    (biased) variance, then multiplied by ``scale`` and offset by ``shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the size of
            the ``scale`` and ``shift`` parameters.
        eps: Small constant added to the variance to avoid division by zero.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Trainable affine parameters, initialized to the identity transform.
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.scale = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension.

        Args:
            x: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # Layer normalization uses the biased (population) variance, i.e. it
        # divides by N rather than N - 1. The default of ``Tensor.var`` is the
        # unbiased estimator, which would under-scale the output.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(variance + self.eps)
        return self.scale * norm_x + self.shift
        
# 3. Feed forward        
class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x the embedding width, apply GELU, project back.

    Args:
        cfg: Configuration mapping; only ``cfg["emb_dim"]`` is read.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = width * 4
        # Built in the same order as they are applied, so parameter
        # initialization consumes the RNG in a fixed sequence.
        expand = nn.Linear(width, hidden)
        activation = GELUActivation()
        contract = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> contract stack."""
        return self.layers(x)
# 4. Attention Mechanism
# Creating the multi-head attention compact class
import torch
from torch import nn

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to keys, queries and values of width ``d_out``,
    split into ``num_heads`` heads of width ``d_out // num_heads``, attended
    with an upper-triangular (future-masking) mask, merged back to ``d_out``
    and passed through a final linear projection.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
        context_length: Side length of the pre-built causal mask; inputs may
            contain at most this many tokens.
        dropout_rate: Dropout probability applied to the attention weights.
        bias_units: Whether the key/query/value projections use a bias.
    """

    def __init__(self, d_in, d_out, num_heads, context_length, dropout_rate, bias_units=False):
        super().__init__()
        assert d_out % num_heads == 0, "dimensions out must be divisible by number of heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        # Key / query / value projections, each mapping d_in -> d_out.
        self.w_key = nn.Linear(d_in, d_out, bias=bias_units)
        self.w_query = nn.Linear(d_in, d_out, bias=bias_units)
        self.w_value = nn.Linear(d_in, d_out, bias=bias_units)
        # Output projection applied to the merged heads, d_out -> d_out.
        self.out_proj = nn.Linear(d_out, d_out)
        # Ones strictly above the diagonal mark the "future" positions.
        # Stored as a buffer so it follows the module across devices.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1),
        )
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Compute causal self-attention.

        Args:
            x: Tensor of shape ``(batch, num_tokens, d_in)``.

        Returns:
            Tensor of shape ``(batch, num_tokens, d_out)``.
        """
        # The last input dimension is d_in, which may differ from d_out, so it
        # must not be used for reshaping the attention output.
        b, num_tokens, _ = x.shape

        # Projections: (b, num_tokens, d_out)
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)

        # Split the last dimension into heads and move the head axis forward:
        # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        # Attention scores: (b, num_heads, num_tokens, num_tokens)
        attention_scores = queries @ keys.transpose(2, 3)
        # Hide future positions; -inf becomes a weight of 0 after the softmax.
        attention_scores.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens],
            -torch.inf,
        )
        # Scale by sqrt(head_dim) before the softmax.
        attention_scores = attention_scores / self.head_dim ** 0.5
        attention_weights = torch.softmax(attention_scores, dim=-1)
        attention_weights = self.dropout(attention_weights)

        # Weighted sum of values: (b, num_heads, num_tokens, head_dim)
        context_vector = attention_weights @ values
        # Merge heads back: (b, num_tokens, num_heads * head_dim == d_out)
        context_vector = context_vector.transpose(1, 2)
        context_vector = context_vector.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(context_vector)

## 2. Constructing the transformer block

1. Shortcut (residual) connection around the attention sub-block
2. Shortcut (residual) connection around the feed-forward sub-block
3. In both cases, add the sub-block's input back to its output

In [32]:
# NB: Input = [b, num_tokens, 768]

class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward, each with a residual.

    Both sub-blocks follow the same pattern: layer-normalize the input, apply
    the sub-layer, apply dropout, then add the sub-block's input back.

    Args:
        cfg: Configuration mapping. Reads ``emb_dim``, ``n_heads``,
            ``context_length``, ``dropout_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        # One normalization layer in front of each sub-block.
        self.layerNorm1 = LayerNorm(cfg["emb_dim"])
        self.layerNorm2 = LayerNorm(cfg["emb_dim"])
        # Dropout applied to each sub-block's output before the residual add.
        self.drop_shortcut = nn.Dropout(cfg["dropout_rate"])
        # Multi-head attention with d_in == d_out == emb_dim.
        self.attention = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            num_heads=cfg["n_heads"],
            context_length=cfg["context_length"],
            dropout_rate=cfg["dropout_rate"],
            bias_units=cfg["qkv_bias"],
        )
        # Position-wise feed-forward network.
        self.feed_forward = FeedForward(cfg)

    def forward(self, x):
        """Apply the attention and feed-forward sub-blocks with residuals.

        Args:
            x: Tensor whose last dimension is ``emb_dim``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Attention sub-block with shortcut connection.
        shortcut = x
        x = self.layerNorm1(x)
        x = self.attention(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        # Feed-forward sub-block with shortcut connection. The shortcut must
        # capture the post-attention activations; assigning the other way
        # round would discard the attention output entirely.
        shortcut = x
        x = self.layerNorm2(x)
        x = self.feed_forward(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x

## Testing with an example

In [36]:
# Smoke test: a transformer block must return a tensor with the same shape
# as its input, here (2, 4, 768).

# Fix the seed so the random input and the weight initialization are reproducible.
torch.manual_seed(42)
# Random input whose last dimension matches GPT_Config_124M["emb_dim"].
sample_input = torch.rand(2, 4, 768)
print(sample_input.shape)
transformer_block = TransformerBlock(GPT_Config_124M)
output = transformer_block(sample_input)
# Last expression of the cell: the notebook displays the output shape.
output.shape

torch.Size([2, 4, 768])


torch.Size([2, 4, 768])