# Chapter 4: Implementing a GPT model from Scratch To Generate Text 

## (Notes are in progress ...)

- In this chapter, we implement the architecture of a GPT-like LLM; in the next chapter, we will train this LLM

## 4.1 Coding the decoder

In [1]:
GPT_CONFIG = {
    "vocab_size": 50257,  # Vocabulary size
    "ctx_len": 1024,      # Context length
    "emb_dim": 768,       # Embedding dimension
    "n_heads": 12,        # Number of attention heads
    "n_layers": 12,       # Number of layers
    "drop_rate": 0.1,     # Dropout rate
    "qkv_bias": True      # Query-Key-Value bias
}

In [2]:
import torch.nn as nn


class DummyGPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        
        # Use a placeholder for TransformerBlock
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        
        # Use a placeholder for LayerNorm
        self.final_norm = DummyLayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # A simple placeholder

    def forward(self, x):
        # This block does nothing and just returns its input.
        return x


class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # The parameters here are just to mimic the LayerNorm interface.

    def forward(self, x):
        # This layer does nothing and just returns its input.
        return x

In [3]:
import tiktoken
import torch

tokenizer = tiktoken.get_encoding("gpt2")

batch = []

txt1 = "Every effort moves you forward."
txt2 = "Every day holds a lesson."

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
batch

tensor([[ 6109,  3626,  6100,   345,  2651,    13],
        [ 6109,  1110,  6622,   257, 11483,    13]])

In [4]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG)

out = model(batch)
print("Output shape:", out.shape)
print(out)

Output shape: torch.Size([2, 6, 50257])
tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6755, -0.3388,  ...,  1.1586, -0.0435, -1.0400],
         [ 0.0106, -1.6711,  0.7797,  ...,  0.3561, -0.0867, -0.5452],
         [ 0.1821,  1.1189,  0.1641,  ...,  1.9012,  1.2240,  0.8853]],

        [[-1.0341,  0.2765, -1.1252,  ..., -0.8381,  0.0773,  0.1147],
         [-0.2632,  0.5427, -0.2828,  ...,  0.1357,  0.3707,  1.3615],
         [ 0.9695,  1.2466, -0.3515,  ..., -0.0171, -0.3478,  0.2616],
         [-0.0237, -0.7329,  0.3184,  ...,  1.5946, -0.1334, -0.2981],
         [-0.1876, -0.7909,  0.8811,  ...,  1.1121, -0.3781, -1.4438],
         [ 0.0405,  1.2000,  0.0702,  ...,  1.4740,  1.1567,  1.2077]]],
       grad_fn=<UnsafeViewBackward0>)


## 4.2 Normalizing activations with LayerNorm

In [5]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

## 4.3 Adding GeLU activation functions

In [6]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2 / torch.pi)) *
                                          (x + 0.044715 * x ** 3)))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
            nn.Dropout(cfg["drop_rate"])
        )

    def forward(self, x):
        return self.net(x)

## 4.4 Understanding shortcut connections

In [7]:
class ExampleWithShortcut(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 10)
        self.fc2 = nn.Linear(10, 10)
        self.fc3 = nn.Linear(10, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        identity = x
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x)) + identity # Shortcut connection
        x = self.fc3(x)
        return x

torch.manual_seed(123)
ex_short = ExampleWithShortcut()
inputs = torch.randn(5, 10)
ex_short(inputs)

tensor([[-1.1785],
        [-0.0278],
        [-0.5737],
        [-1.5400],
        [ 0.1513]], grad_fn=<AddmmBackward0>)

## 4.5 Connecting attention and linear layers

In [8]:
from previous_chapters import MultiHeadAttention


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            block_size=cfg["ctx_len"],
            num_heads=cfg["n_heads"], 
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_resid = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        x = x + self.drop_resid(self.att(self.norm1(x)))
        x = x + self.drop_resid(self.ff(self.norm2(x)))
        return x

In [9]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])
        
        # Use a placeholder for TransformerBlock
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        
        # Use a placeholder for LayerNorm
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [10]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG)

out = model(batch)
print("Output shape:", out.shape)
print(out)

Output shape: torch.Size([2, 6, 50257])
tensor([[[-0.7971, -0.6232, -0.1815,  ...,  0.1020, -0.0916,  0.1885],
         [ 0.5491, -0.5220,  0.7559,  ..., -0.3137, -0.8780,  0.2182],
         [ 0.3107,  0.0346, -0.4637,  ..., -0.3700, -0.4346, -0.0747],
         [ 0.5681,  0.3940,  0.5397,  ..., -0.1027,  0.5461,  0.4834],
         [-0.2948, -0.1605, -0.5878,  ...,  0.0054, -0.0207, -0.1100],
         [-0.3096, -0.7744, -0.0254,  ...,  0.7480,  0.3515,  0.3208]],

        [[-0.6910, -0.3758, -0.1458,  ..., -0.1824, -0.5231,  0.0873],
         [-0.2562, -0.4204,  1.5507,  ..., -0.7057, -0.3989,  0.0084],
         [-0.4263, -0.2257, -0.2074,  ..., -0.2160, -1.1648,  0.4744],
         [-0.0245,  1.3792,  0.2234,  ..., -0.7153, -0.7858, -0.3762],
         [-0.4696, -0.4584, -0.4812,  ...,  0.5044, -0.8911,  0.1549],
         [-0.7727, -0.6125, -0.3203,  ...,  1.0753, -0.0878,  0.2805]]],
       grad_fn=<UnsafeViewBackward0>)


In [11]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

total_params_gpt2 =  total_params - sum(p.numel() for p in model.tok_emb.parameters())
print(f"Number of trainable parameters considering weight tying: {total_params_gpt2:,}")

Total number of parameters: 163,037,184
Number of trainable parameters considering weight tying: 124,439,808


In [12]:
# Calculate the total size in bytes (assuming float32, 4 bytes per parameter)
total_size_bytes = total_params * 4

# Convert to megabytes
total_size_mb = total_size_bytes / (1024 * 1024)

print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 621.94 MB


## 4.6 Implementing the forward pass