![GPT](gpt-arch-1.png)

In [5]:
import torch
import torch.nn as nn
from modules import TransformerBlock, LayerNorm

class GPTModel(nn.Module):
    """GPT-style model: embeddings -> transformer blocks -> final norm -> vocabulary logits.

    Args:
        config: Mapping providing the keys read in ``__init__``:
            ``vocab_size``, ``context_length``, ``emb_dim``, ``drop_rate`` and
            ``number_of_layers``. The whole mapping is also passed unchanged
            to every ``TransformerBlock``.
    """

    def __init__(self, config):
        """Build the embedding tables, the transformer stack and the output head."""
        super().__init__()
        # Token ids -> vectors, and absolute positions -> vectors (one row per position).
        self.tok_emb = nn.Embedding(config["vocab_size"], config["emb_dim"])
        self.pos_emb = nn.Embedding(config["context_length"], config["emb_dim"])
        self.drop_emb = nn.Dropout(config["drop_rate"])
        self.transformerlayers = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config["number_of_layers"])]
        )
        self.finalnorm = LayerNorm(config["emb_dim"])
        # Separate (untied) projection from the embedding dimension back to the vocabulary.
        self.out_head = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

    def forward(self, input_idx):
        """Compute vocabulary logits for a batch of token ids.

        Args:
            input_idx: 2-D integer tensor of token ids, ``(batch_size, seq_len)``.

        Returns:
            The output of ``out_head``; for a ``(2, 10)`` input and the notebook's
            config this is a ``(2, 10, vocab_size)`` tensor.

        Raises:
            ValueError: If ``seq_len`` is larger than the number of positions
                held by ``pos_emb`` (the configured context length).
        """
        # Unpacking two values also rejects inputs that are not 2-D.
        _, seq_len = input_idx.shape

        # Fail early with a readable message instead of an opaque index error
        # (or a device-side assert on CUDA) inside the embedding lookup.
        max_positions = self.pos_emb.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the model's context length {max_positions}"
            )

        token_embeddings = self.tok_emb(input_idx)
        # Positions are created on the input's device so the lookup works on CPU and GPU alike.
        positions = torch.arange(seq_len, device=input_idx.device)
        positional_embeddings = self.pos_emb(positions)

        # The positional embeddings broadcast across the batch dimension.
        x = token_embeddings + positional_embeddings
        x = self.drop_emb(x)
        x = self.transformerlayers(x)
        x = self.finalnorm(x)
        logits = self.out_head(x)
        return logits


## Weight Tying

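Weight tying is a technique for reducing the overall memory footprint (and the number of parameters that must be stored and updated). It was used in the original GPT-2 architecture, where the token embedding weights are reused in the output layer; without it, the model would have 163M parameters instead of 124M. The `GPTModel` above keeps the two weight matrices separate, so it reports the untied count.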

However, modern LLM architectures typically use separate weights for the token embedding and the output layer, as this tends to give better training behavior and model performance.

![](/images/gpt-arch-1.png)

[GPT architecture diagram](/images/gpt-arch-1.png)

In [7]:
GPT_CONFIG = {
    "vocab_size": 50257,      # Vocabulary size
    "context_length": 1024,   # Context length
    "emb_dim": 768,           # Embedding dimension
    "num_heads": 12,          # Number of attention heads
    "number_of_layers": 12,   # Number of transformer blocks
    "drop_rate": 0.1,         # Dropout rate
    "qkv_bias": False         # Query-Key-Value bias
}

# Seed the global RNG so the weight initialisation and the random example batch
# are identical on every "Restart Kernel -> Run All".
torch.manual_seed(123)

model = GPTModel(GPT_CONFIG)
# This cell only runs inference: put dropout layers in evaluation mode so the
# logits are deterministic. Call model.train() again before any training step.
model.eval()

# Example input (batch_size=2, seq_len=10)
input_ids = torch.randint(0, GPT_CONFIG["vocab_size"], (2, 10))

# No gradients are needed for a shape / parameter-count sanity check.
with torch.no_grad():
    logits = model(input_ids)

# One logit per vocabulary entry for every position of every sequence.
assert logits.shape == (2, 10, GPT_CONFIG["vocab_size"]), "unexpected logits shape"

print(f"Input shape: {input_ids.shape}")
print(f"Output shape: {logits.shape}")
print(f"\nModel has {sum(p.numel() for p in model.parameters()):,} parameters")


Input shape: torch.Size([2, 10])
Output shape: torch.Size([2, 10, 50257])

Model has 163,009,536 parameters
