In [10]:
import torch
import torch.nn as nn
from Layers import MultiHeadCausalAttention

In [11]:
# Hyperparameters for the 124M-parameter GPT-2 configuration.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # total number of unique tokens (GPT-2 BPE vocabulary size)
    "context_length": 1024,  # sequence length / context length
    "emb_dim": 768,          # embedding dim, hidden dim, d_model
    "n_heads": 12,           # number of attention heads in MHA
    "n_layers": 12,          # number of transformer layers
    "drop_rate": 0.1,        # dropout rate
    "qkv_bias": False,       # whether the query/key/value projections use a bias term
}

In [12]:
# `cc` is a second name bound to the same dict object as GPT_CONFIG_124M
# (plain assignment, no copy), so mutating one is visible through the other.
cc = GPT_CONFIG_124M
# Bare last expression: the notebook displays the configured embedding size.
cc["emb_dim"]

768

In [13]:
class FeedForwardBlock(nn.Module):
    """Feed-forward block of the transformer block (expansion -> GELU -> contraction).

    A linear layer widens the last dimension from ``emb_dim`` to ``4 * emb_dim``,
    GELU is applied, and a second linear layer projects back to ``emb_dim``.

    Args:
        cfg: configuration mapping; only ``cfg["emb_dim"]`` is read here.
        *args, **kwargs: forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, cfg: dict, *args, **kwargs):
        super().__init__(*args, **kwargs)

        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim  # width of the expanded intermediate representation

        # The attribute name `layers` and the module order are kept so that
        # state_dict keys (layers.0.*, layers.2.*) stay the same.
        self.layers = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),  # expansion
            nn.GELU(),                       # activation
            nn.Linear(hidden_dim, emb_dim),  # contraction
        )

    def forward(self, X):
        """Run ``X`` through the block.

        Args:
            X: tensor whose last dimension equals ``emb_dim``.

        Returns:
            Tensor with the same shape as ``X``.
        """
        return self.layers(X)

In [14]:
class TransformerBlock(nn.Module):
    """One transformer block with layer norm applied before each sub-layer.

    The forward pass runs two residual sub-layers in order:

    1. LayerNorm -> causal multi-head attention -> dropout -> add shortcut
    2. LayerNorm -> feed-forward block          -> dropout -> add shortcut

    Each sub-layer has its own ``nn.LayerNorm`` so the two normalisations learn
    independent scale/shift parameters (as GPT-2 does with ``ln_1``/``ln_2``).

    Args:
        cfg: configuration mapping; reads ``emb_dim``, ``context_length``,
            ``n_heads``, ``drop_rate`` and ``qkv_bias``.
        *args, **kwargs: forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, cfg, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.causal_multi_head_attention = MultiHeadCausalAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.feed_forward_block = FeedForwardBlock(cfg)
        # nn.Dropout has no parameters, so one instance can serve both sub-layers.
        self.dropout_layer = nn.Dropout(cfg["drop_rate"])
        # `layer_norm` keeps its original name (existing attribute access and
        # state_dict keys stay valid) and now normalises only the attention input.
        self.layer_norm = nn.LayerNorm(cfg["emb_dim"])
        # Separate norm for the feed-forward input; reusing `layer_norm` here
        # would tie the learnable weight/bias of the two normalisations.
        self.layer_norm_ffn = nn.LayerNorm(cfg["emb_dim"])

    def forward(self, X):
        """Apply the attention and feed-forward sub-layers with residual connections.

        Args:
            X: input tensor whose last dimension equals ``cfg["emb_dim"]``.

        Returns:
            Tensor of the same shape as ``X``.
        """
        # Attention sub-layer.
        shortcut = X
        X = self.layer_norm(X)
        X = self.causal_multi_head_attention(X)
        X = self.dropout_layer(X)
        X = X + shortcut

        # Feed-forward sub-layer.
        shortcut = X
        X = self.layer_norm_ffn(X)
        X = self.feed_forward_block(X)
        X = self.dropout_layer(X)
        X = X + shortcut

        return X

__________________________________________________

In [15]:
# Smoke test: push a random batch through one TransformerBlock.
# The last dimension is taken from the config instead of a hard-coded 768 so
# this cell keeps working if emb_dim changes; the leading 2 and 4 are
# presumably (batch, tokens) -- confirm against MultiHeadCausalAttention.
X = torch.randn(2, 4, GPT_CONFIG_124M["emb_dim"])
transformer_block = TransformerBlock(cfg=GPT_CONFIG_124M)
out = transformer_block(X)
# The residual additions inside the block require output and input shapes to match.
assert out.shape == X.shape, "TransformerBlock must preserve the input shape"
out.shape

torch.Size([2, 4, 768])

__________________

In [17]:
# Chain two separately constructed TransformerBlocks; nn.Sequential feeds the
# output of each block into the next. `X` is the tensor from the earlier cell.
transformer_blocks = nn.Sequential(
    *(TransformerBlock(cfg=GPT_CONFIG_124M) for _ in range(2))
)
out = transformer_blocks(X)
out.shape


torch.Size([2, 4, 768])