# GPT-2 Architecture & Weight Loading

In [3]:
import os
import json
import numpy as np
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Hyperparameters of the 124M-parameter GPT-2 variant. The keys are read by
# the model classes defined in the cells below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,       # number of rows in the token-embedding table
    context_length=1024,    # number of rows in the position-embedding table
    emb_dim=768,            # width of the embeddings / hidden states
    n_heads=12,             # attention heads per transformer block
    n_layers=12,            # number of transformer blocks
    drop_rate=0.1,          # dropout probability used throughout the model
    qkv_bias=True,          # create bias terms in the Q/K/V projections
)

## Architecture

In [5]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output width; must be divisible by ``num_heads``.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections carry a bias.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # Sub-modules are created in the order query, key, value, out_proj so
        # that seeded initialisation and the state_dict layout are unchanged.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to be masked.
        causal = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal)

    def forward(self, x):
        """Apply masked self-attention to ``x`` of shape (batch, tokens, d_in)."""
        b, num_tokens, _ = x.shape
        per_head = (b, num_tokens, self.num_heads, self.head_dim)

        # Project, then move the head axis in front of the token axis.
        k = self.W_key(x).view(per_head).transpose(1, 2)
        q = self.W_query(x).view(per_head).transpose(1, 2)
        v = self.W_value(x).view(per_head).transpose(1, 2)

        scores = q @ k.transpose(2, 3)
        blocked = self.mask.bool()[:num_tokens, :num_tokens]
        scores.masked_fill_(blocked, -torch.inf)

        weights = torch.softmax(scores / k.shape[-1] ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # Put tokens back in front of heads and flatten the heads into d_out.
        merged = (weights @ v).transpose(1, 2).contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)


class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Args:
        emb_dim: Size of the last dimension of the inputs.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Population variance (unbiased=False), i.e. division by N.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        denom = torch.sqrt(variance + self.eps)
        return self.scale * centered / denom + self.shift


class GELU(nn.Module):
    """GELU activation computed with the tanh-based formula."""

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))


class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Linear.

    Args:
        cfg: Mapping that provides ``"emb_dim"``; the hidden layer is four
            times that width.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        # Indices 0 and 2 of this Sequential are the two Linear layers.
        self.layers = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            GELU(),
            nn.Linear(hidden_dim, emb_dim),
        )

    def forward(self, x):
        """Run ``x`` through the expansion, activation and contraction."""
        return self.layers(x)


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each wrapped
    in LayerNorm -> sub-layer -> dropout -> residual add.

    Args:
        cfg: Mapping with ``"emb_dim"``, ``"context_length"``, ``"n_heads"``,
            ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention sub-block followed by the feed-forward one."""
        # Attention sub-block; the un-normalised input is the residual.
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-block, same pattern.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x


class GPTModel(nn.Module):
    """GPT-style decoder: token + position embeddings, a stack of
    transformer blocks, a final LayerNorm and a bias-free output projection.

    Args:
        cfg: Mapping with ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"drop_rate"``, ``"n_layers"`` plus the keys
            required by ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return logits of shape (batch, seq_len, vocab_size) for the
        2-D tensor of token ids ``in_idx``."""
        _, seq_len = in_idx.shape
        # Position ids are created on the same device as the token ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        return self.out_head(self.final_norm(hidden))

## PyTorch Save/Load

In [6]:
# Save model
# Only the state_dict is written to disk, not the module object itself.
model = GPTModel(GPT_CONFIG_124M)
torch.save(model.state_dict(), "model.pth")

# Load model
# A fresh instance built from the same config receives the saved tensors.
model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(torch.load("model.pth", weights_only=True))
model.eval()  # evaluation mode, so the Dropout layers are inactive
print("Model saved and loaded!")

Model saved and loaded!


## Load OpenAI Pretrained Weights

In [10]:
import tensorflow as tf

def download_and_load_gpt2(model_size="124M", models_dir="gpt2"):
    """Load GPT-2 settings and weights from a local TensorFlow checkpoint.

    Despite its name, this version performs no download: the directory
    ``<models_dir>/<model_size>`` must already contain ``hparams.json`` and
    the ``model.ckpt`` checkpoint files.

    Args:
        model_size: Sub-directory name of the model, e.g. ``"124M"``.
        models_dir: Directory that holds the per-model sub-directories.

    Returns:
        tuple: ``(settings, params)`` where ``settings`` is the parsed
        ``hparams.json`` and ``params`` is a nested dict of NumPy arrays with
        one entry per layer under ``params["blocks"]``.
    """
    model_dir = os.path.join(models_dir, model_size)

    hparams_file = os.path.join(model_dir, "hparams.json")
    with open(hparams_file, "r") as fh:
        settings = json.load(fh)

    ckpt_path = os.path.join(model_dir, "model.ckpt")
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    for var_name, _ in tf.train.list_variables(ckpt_path):
        weights = np.squeeze(tf.train.load_variable(ckpt_path, var_name))
        parts = var_name.split("/")[1:]  # drop the first path component

        # A leading component of the form "h<N>" selects the dict of layer N;
        # every other variable is stored at the top level of ``params``.
        head = parts[0]
        if head.startswith("h"):
            node = params["blocks"][int(head[1:])]
            parts = parts[1:]
        else:
            node = params

        # Walk (and create on demand) the nested dicts, then store the leaf.
        for key in parts[:-1]:
            node = node.setdefault(key, {})
        node[parts[-1]] = weights

    return settings, params


def assign(left, right):
    """Return a new trainable parameter holding the values of ``right``.

    Args:
        left: Existing parameter/tensor that defines the expected shape,
            dtype and device.
        right: NumPy array or tensor with the replacement values.

    Returns:
        torch.nn.Parameter: An independent copy of ``right`` converted to the
        dtype and device of ``left``.

    Raises:
        ValueError: If ``left`` and ``right`` have different shapes.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch: {left.shape} vs {right.shape}")
    if isinstance(right, torch.Tensor):
        # torch.tensor(<tensor>) emits a UserWarning; copy explicitly instead.
        values = right.detach().clone()
    else:
        values = torch.tensor(right)
    # Match the destination so that e.g. float64 checkpoint arrays do not
    # silently produce float64 parameters inside a float32 model.
    return torch.nn.Parameter(values.to(dtype=left.dtype, device=left.device))


def load_weights_into_gpt(gpt, params):
    """Copy the arrays in ``params`` into the parameters of ``gpt``.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Nested dict with the keys ``"wpe"``, ``"wte"``, ``"ln_f"``
            and a list under ``"blocks"`` (one entry per transformer block).
    """
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b, block_params in enumerate(params["blocks"]):
        block = gpt.trf_blocks[b]
        att = block.att

        # Q, K and V are stored as one array; split it into three equal parts
        # along the last axis. Weight matrices are transposed before they are
        # assigned to the nn.Linear layers.
        c_attn = block_params["attn"]["c_attn"]
        q_w, k_w, v_w = np.split(c_attn["w"], 3, axis=-1)
        att.W_query.weight = assign(att.W_query.weight, q_w.T)
        att.W_key.weight = assign(att.W_key.weight, k_w.T)
        att.W_value.weight = assign(att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(c_attn["b"], 3, axis=-1)
        att.W_query.bias = assign(att.W_query.bias, q_b)
        att.W_key.bias = assign(att.W_key.bias, k_b)
        att.W_value.bias = assign(att.W_value.bias, v_b)

        attn_proj = block_params["attn"]["c_proj"]
        att.out_proj.weight = assign(att.out_proj.weight, attn_proj["w"].T)
        att.out_proj.bias = assign(att.out_proj.bias, attn_proj["b"])

        # Feed-forward: layers[0] is the expansion, layers[2] the contraction.
        fc_in, fc_out = block.ff.layers[0], block.ff.layers[2]
        mlp = block_params["mlp"]
        fc_in.weight = assign(fc_in.weight, mlp["c_fc"]["w"].T)
        fc_in.bias = assign(fc_in.bias, mlp["c_fc"]["b"])
        fc_out.weight = assign(fc_out.weight, mlp["c_proj"]["w"].T)
        fc_out.bias = assign(fc_out.bias, mlp["c_proj"]["b"])

        # Layer norms: "g" feeds the scale, "b" feeds the shift.
        ln_1 = block_params["ln_1"]
        block.norm1.scale = assign(block.norm1.scale, ln_1["g"])
        block.norm1.shift = assign(block.norm1.shift, ln_1["b"])
        ln_2 = block_params["ln_2"]
        block.norm2.scale = assign(block.norm2.scale, ln_2["g"])
        block.norm2.shift = assign(block.norm2.shift, ln_2["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["ln_f"]["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["ln_f"]["b"])
    # The output head receives its own copy of the token-embedding matrix.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

In [11]:
# Load pretrained weights
# Reads hparams.json and the checkpoint from the local directory gpt2/124M.
settings, params = download_and_load_gpt2("124M", "gpt2")
# GPT_CONFIG_124M above sets qkv_bias=True, so the Q/K/V bias parameters that
# load_weights_into_gpt assigns actually exist on this model.
gpt = GPTModel(GPT_CONFIG_124M)
load_weights_into_gpt(gpt, params)
gpt.to(device)
gpt.eval()  # evaluation mode, so the Dropout layers are inactive
print("Pretrained GPT-2 weights loaded!")

Pretrained GPT-2 weights loaded!


In [12]:
# Cleanup
# Delete the "model.pth" file written by the save/load demo, if it exists.
import os
if os.path.exists("model.pth"):
    os.remove("model.pth")

# 🧠 GPT-2 Architecture & Weight Loading

This notebook demonstrates:
1. **GPT-2 Architecture Implementation** - Complete model from scratch
2. **PyTorch Model Saving/Loading** - Save and restore trained models
3. **Loading OpenAI Pretrained Weights** - Use official GPT-2 weights

---

## 🔧 Setup & Imports

In [13]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
import tiktoken

print(f"PyTorch version: {torch.__version__}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

PyTorch version: 2.9.0+cpu
Using device: cpu


---
## 📐 GPT-2 Model Configurations

GPT-2 comes in 4 sizes. Here are the configurations:

In [14]:
# Base configuration for GPT-2 124M
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "emb_dim": 768,          # Embedding dimension
    "n_heads": 12,           # Number of attention heads
    "n_layers": 12,          # Number of layers
    "drop_rate": 0.1,        # Dropout rate
    "qkv_bias": False        # Query-Key-Value bias (set to True in NEW_CONFIG before the OpenAI weights are loaded)
}

# Model configurations for different GPT-2 sizes
# Each entry holds only the keys that differ between sizes; it is merged into
# a copy of GPT_CONFIG_124M when a model size is selected further down.
MODEL_CONFIGS = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

print("üìä Available GPT-2 Configurations:")
print("-" * 60)
print(f"{'Model':<25} {'Embedding':<12} {'Layers':<10} {'Heads':<8}")
print("-" * 60)
for name, config in MODEL_CONFIGS.items():
    print(f"{name:<25} {config['emb_dim']:<12} {config['n_layers']:<10} {config['n_heads']:<8}")

üìä Available GPT-2 Configurations:
------------------------------------------------------------
Model                     Embedding    Layers     Heads   
------------------------------------------------------------
gpt2-small (124M)         768          12         12      
gpt2-medium (355M)        1024         24         16      
gpt2-large (774M)         1280         36         20      
gpt2-xl (1558M)           1600         48         25      


---
## 🏗️ GPT-2 Architecture Components

The GPT-2 model consists of several key components:
1. **MultiHeadAttention** - Self-attention mechanism
2. **LayerNorm** - Layer normalization
3. **GELU** - Activation function
4. **FeedForward** - MLP block
5. **TransformerBlock** - Combines attention and feedforward
6. **GPTModel** - Complete model

### 1️⃣ Multi-Head Attention

In [15]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a causal mask.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output width, split evenly across the heads.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads; must divide ``d_out``.
        qkv_bias: Whether the query/key/value projections carry a bias.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # width handled by each head

        # Projection layers (created in the order query, key, value, output).
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Upper-triangular ones (diagonal excluded) mark the future positions.
        upper = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", upper)

    def _split_heads(self, projected):
        """(b, tokens, d_out) -> (b, num_heads, tokens, head_dim)."""
        b, num_tokens, _ = projected.shape
        return projected.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Return the attended representation, shape (b, tokens, d_out)."""
        b, num_tokens, _ = x.shape

        keys = self._split_heads(self.W_key(x))
        queries = self._split_heads(self.W_query(x))
        values = self._split_heads(self.W_value(x))

        # Per-head dot products between every query and every key.
        attn_scores = torch.matmul(queries, keys.transpose(-2, -1))

        # Hide future positions; the mask is cropped to the current length.
        future = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores = attn_scores.masked_fill(future, -torch.inf)

        scale = self.head_dim ** 0.5
        attn_weights = self.dropout(torch.softmax(attn_scores / scale, dim=-1))

        # (b, heads, tokens, head_dim) -> (b, tokens, heads, head_dim) -> (b, tokens, d_out)
        context_vec = torch.matmul(attn_weights, values).transpose(1, 2)
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        return self.out_proj(context_vec)

print("‚úÖ MultiHeadAttention class defined")

‚úÖ MultiHeadAttention class defined


### 2️⃣ Layer Normalization, GELU Activation & FeedForward Network

In [16]:
class LayerNorm(nn.Module):
    """Normalise the last dimension to zero mean / unit variance, then apply
    a learnable element-wise scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Return the normalised, scaled and shifted version of ``x``."""
        mu = x.mean(dim=-1, keepdim=True)
        # unbiased=False: population variance (division by N).
        sigma = torch.sqrt(x.var(dim=-1, keepdim=True, unbiased=False) + self.eps)
        return self.scale * ((x - mu) / sigma) + self.shift


class GELU(nn.Module):
    """GELU activation computed with the tanh-based formula."""

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(sqrt_two_over_pi * cubic)
        return 0.5 * x * gate


class FeedForward(nn.Module):
    """Two-layer MLP with a GELU in between.

    Args:
        cfg: Mapping that provides ``"emb_dim"``; the hidden width is
            ``4 * emb_dim``.
    """

    def __init__(self, cfg):
        super().__init__()
        dim = cfg["emb_dim"]
        expand = nn.Linear(dim, 4 * dim)     # Expansion
        contract = nn.Linear(4 * dim, dim)   # Contraction
        # Positional Sequential: index 0 = expansion, 1 = GELU, 2 = contraction.
        self.layers = nn.Sequential(expand, GELU(), contract)

    def forward(self, x):
        """Apply expansion, activation and contraction to ``x``."""
        return self.layers(x)

print("‚úÖ LayerNorm, GELU, and FeedForward classes defined")

‚úÖ LayerNorm, GELU, and FeedForward classes defined


### 3️⃣ Transformer Block

In [17]:
class TransformerBlock(nn.Module):
    """One transformer layer: an attention sub-block and a feed-forward
    sub-block, each applied as LayerNorm -> sub-layer -> dropout -> residual.

    Args:
        cfg: Mapping with ``"emb_dim"``, ``"context_length"``, ``"n_heads"``,
            ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        attention_kwargs = dict(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.att = MultiHeadAttention(**attention_kwargs)
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def _residual(self, x, norm, sublayer):
        """Return ``dropout(sublayer(norm(x))) + x``."""
        return self.drop_shortcut(sublayer(norm(x))) + x

    def forward(self, x):
        """Apply both sub-blocks to ``x``; the output has the same shape."""
        x = self._residual(x, self.norm1, self.att)   # attention sub-block
        return self._residual(x, self.norm2, self.ff)  # feed-forward sub-block

print("‚úÖ TransformerBlock class defined")

‚úÖ TransformerBlock class defined


### 4️⃣ Complete GPT Model

In [18]:
class GPTModel(nn.Module):
    """Full GPT model: embeddings, transformer stack, final norm, output head.

    Args:
        cfg: Mapping with ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"drop_rate"``, ``"n_layers"`` plus the keys
            required by ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab, width = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        layers = []
        for _ in range(cfg["n_layers"]):
            layers.append(TransformerBlock(cfg))
        self.trf_blocks = nn.Sequential(*layers)

        self.final_norm = LayerNorm(width)
        # No bias on the projection from hidden states to vocabulary logits.
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, in_idx):
        """Map token ids (batch, seq_len) to logits (batch, seq_len, vocab)."""
        _, seq_len = in_idx.shape
        # Positions 0..seq_len-1, created on the device of the input ids.
        position_ids = torch.arange(seq_len, device=in_idx.device)
        x = self.tok_emb(in_idx) + self.pos_emb(position_ids)
        x = self.trf_blocks(self.drop_emb(x))
        return self.out_head(self.final_norm(x))

print("‚úÖ GPTModel class defined")

‚úÖ GPTModel class defined


### 🧪 Test the Architecture

In [19]:
# Initialize the model
# The seed is set first so the random initialisation is reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

# Count parameters
# model.parameters() covers nn.Parameter objects only; tensors registered as
# buffers (the attention masks) are not part of this sum.
total_params = sum(p.numel() for p in model.parameters())
print(f"üìä Total parameters: {total_params:,}")

# Accounting for weight tying (token embedding reused in output layer)
# This model keeps a separate out_head matrix, so its size is subtracted here
# to obtain the count a weight-tied model would have.
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"üìä Parameters (with weight tying): {total_params_gpt2:,}")

# Memory size
# Estimate for the parameters alone (buffers are not included).
total_size_bytes = total_params * 4  # 4 bytes per float32
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"üíæ Model size: {total_size_mb:.2f} MB")

üìä Total parameters: 163,009,536
üìä Parameters (with weight tying): 124,412,160
üíæ Model size: 621.83 MB


---
## 💾 Saving and Loading Model Weights in PyTorch

The recommended way is to save a model's `state_dict`, a dictionary mapping each layer to its parameters.

### Saving Model Weights

In [20]:
# Create a model instance
model = GPTModel(GPT_CONFIG_124M)

# Save just the model weights
# state_dict() is a plain mapping of names to tensors, so the file can later
# be read back with weights_only=True.
torch.save(model.state_dict(), "model.pth")
print("‚úÖ Model weights saved to 'model.pth'")

‚úÖ Model weights saved to 'model.pth'


### Loading Model Weights

In [21]:
# Create a new model instance
# It must be built from the same config as the model that was saved.
model = GPTModel(GPT_CONFIG_124M)

# Load the saved weights
# weights_only=True restricts unpickling to tensors and basic containers.
model.load_state_dict(torch.load("model.pth", weights_only=True))
model.eval()  # Set to evaluation mode

print("‚úÖ Model weights loaded successfully!")

‚úÖ Model weights loaded successfully!


### Saving Model + Optimizer (for resuming training)

In [22]:
# Create model and optimizer
model = GPTModel(GPT_CONFIG_124M)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# Save both model and optimizer state
# Both state dicts go into one file under the keys that the loading cell
# reads back ("model_state_dict" / "optimizer_state_dict").
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
}, "model_and_optimizer.pth")

print("‚úÖ Model and optimizer saved to 'model_and_optimizer.pth'")

‚úÖ Model and optimizer saved to 'model_and_optimizer.pth'


### Loading Model + Optimizer (to resume training)

In [23]:
# Load the checkpoint
checkpoint = torch.load("model_and_optimizer.pth", weights_only=True)

# Create new instances
model = GPTModel(GPT_CONFIG_124M)
# Use the same hyperparameters as the cell that wrote the checkpoint. The
# saved optimizer state carries its own param-group settings (including lr),
# which replace the constructor values on load, so a different literal here
# would only be misleading.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

# Restore states
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

model.train()  # Set to training mode
print("‚úÖ Model and optimizer restored for continued training!")

‚úÖ Model and optimizer restored for continued training!


---
## 🌐 Loading Pretrained Weights from OpenAI

OpenAI openly shared the weights of their GPT-2 models. Let's load them into our architecture!

### Install Required Dependencies

In [24]:
%pip install tensorflow>=2.15.0 tqdm>=4.66 requests -q

In [25]:
import tensorflow as tf
import requests
from tqdm import tqdm

print(f"TensorFlow version: {tf.__version__}")

TensorFlow version: 2.19.0


### Download and Load GPT-2 Function

In [26]:
def download_and_load_gpt2(model_size="124M", models_dir="gpt2"):
    """
    Download GPT-2 model files from OpenAI and load the weights.

    Files are fetched one by one and only when they are missing locally, so
    an interrupted run is completed by the next call instead of leaving the
    model directory half-populated. Each file is written to a temporary
    ``.part`` path and renamed once the transfer has finished.

    Args:
        model_size: One of "124M", "355M", "774M", or "1558M"
        models_dir: Directory to save the model files

    Returns:
        settings: Model configuration dictionary (contents of hparams.json)
        params: Model weights dictionary; per-layer weights are stored in the
            list under ``params["blocks"]``

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    model_dir = os.path.join(models_dir, model_size)
    os.makedirs(model_dir, exist_ok=True)

    base_url = f"https://openaipublic.blob.core.windows.net/gpt-2/models/{model_size}"
    filenames = [
        "checkpoint",
        "encoder.json",
        "hparams.json",
        "model.ckpt.data-00000-of-00001",
        "model.ckpt.index",
        "model.ckpt.meta",
        "vocab.bpe"
    ]

    # Decide per file rather than per directory: the directory alone does not
    # prove that an earlier download ran to completion.
    missing = [
        filename for filename in filenames
        if not os.path.exists(os.path.join(model_dir, filename))
    ]

    if missing:
        print(f"üì• Downloading GPT-2 {model_size} model...")

        for filename in missing:
            url = f"{base_url}/{filename}"
            filepath = os.path.join(model_dir, filename)
            partial_path = filepath + ".part"

            # The context manager releases the connection; the timeout keeps
            # a stalled server from blocking the notebook indefinitely.
            with requests.get(url, stream=True, timeout=60) as response:
                # Fail loudly instead of saving an HTML error page as a model file.
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                with open(partial_path, 'wb') as f, \
                        tqdm(total=total_size, unit='B', unit_scale=True, desc=filename) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                        pbar.update(len(chunk))

            # Only a fully written file receives its final name.
            os.replace(partial_path, filepath)

        print("‚úÖ Download complete!")
    else:
        print(f"‚úÖ GPT-2 {model_size} model already downloaded.")

    # Load hyperparameters
    hparams_path = os.path.join(model_dir, "hparams.json")
    with open(hparams_path, "r") as f:
        settings = json.load(f)

    # Load weights from TensorFlow checkpoint
    print("üîÑ Loading weights from TensorFlow checkpoint...")
    ckpt_path = os.path.join(model_dir, "model.ckpt")

    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    for name, _ in tf.train.list_variables(ckpt_path):
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))
        variable_name_parts = name.split("/")[1:]  # Skip 'model' prefix

        # A leading "h<N>" component routes the variable into the dict of
        # layer N; everything else is stored at the top level of ``params``.
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]
            variable_name_parts = variable_name_parts[1:]

        # Create the nested dicts on demand, then store the array at the leaf.
        for key in variable_name_parts[:-1]:
            target_dict = target_dict.setdefault(key, {})

        target_dict[variable_name_parts[-1]] = variable_array

    print("‚úÖ Weights loaded successfully!")
    return settings, params

print("‚úÖ download_and_load_gpt2 function defined")

‚úÖ download_and_load_gpt2 function defined


### Download GPT-2 124M Weights

In [27]:
# Download and load GPT-2 weights
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

print("\nüìã Model Settings:", settings)
print("\nüîë Parameter dictionary keys:", params.keys())

‚úÖ GPT-2 124M model already downloaded.
üîÑ Loading weights from TensorFlow checkpoint...
‚úÖ Weights loaded successfully!

üìã Model Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

üîë Parameter dictionary keys: dict_keys(['blocks', 'ln_f', 'wpe', 'wte'])


In [28]:
# Inspect token embedding weights
print("Token embedding shape:", params["wte"].shape)
print("Position embedding shape:", params["wpe"].shape)

Token embedding shape: (50257, 768)
Position embedding shape: (1024, 768)


### Configure Model for OpenAI Weights

In [29]:
# Select model configuration
model_name = "gpt2-small (124M)"
# Start from a copy so GPT_CONFIG_124M itself stays untouched, then overlay
# the size-specific entries for the chosen model.
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(MODEL_CONFIGS[model_name])

# OpenAI used 1024 context length and bias in QKV projections
# qkv_bias=True is needed because load_weights_into_gpt assigns Q/K/V biases.
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

print("üìã Model Configuration:")
for key, value in NEW_CONFIG.items():
    print(f"  {key}: {value}")

üìã Model Configuration:
  vocab_size: 50257
  context_length: 1024
  emb_dim: 768
  n_heads: 12
  n_layers: 12
  drop_rate: 0.1
  qkv_bias: True


### Weight Assignment Function

In [30]:
def assign(left, right):
    """
    Build a new trainable parameter from ``right`` after checking its shape.

    Args:
        left: Existing parameter/tensor that defines the expected shape,
            dtype and device.
        right: NumPy array or tensor with the replacement values.

    Returns:
        torch.nn.Parameter: An independent copy of ``right`` converted to the
        dtype and device of ``left``.

    Raises:
        ValueError: If ``left`` and ``right`` have different shapes.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    if isinstance(right, torch.Tensor):
        # torch.tensor(<tensor>) emits a UserWarning; copy explicitly instead.
        values = right.detach().clone()
    else:
        values = torch.tensor(right)
    # Match the destination so that e.g. float64 checkpoint arrays do not
    # silently produce float64 parameters inside a float32 model.
    return torch.nn.Parameter(values.to(dtype=left.dtype, device=left.device))

print("‚úÖ assign function defined")

‚úÖ assign function defined


### Load Weights into GPT Model

In [33]:
def load_weights_into_gpt(gpt, params):
    """
    Copy the OpenAI GPT-2 arrays in ``params`` into ``gpt``.

    Args:
        gpt: GPTModel instance whose parameters are replaced
        params: Nested dictionary of weights with the keys "wpe", "wte",
            "ln_f" and a per-layer list under "blocks"
    """

    def load_linear(layer, source):
        # The stored matrix is transposed before it is assigned to nn.Linear.
        layer.weight = assign(layer.weight, source["w"].T)
        layer.bias = assign(layer.bias, source["b"])

    def load_norm(norm, source):
        # "g" feeds the scale, "b" feeds the shift.
        norm.scale = assign(norm.scale, source["g"])
        norm.shift = assign(norm.shift, source["b"])

    # Embeddings
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    # Transformer blocks
    for b, layer_params in enumerate(params["blocks"]):
        block = gpt.trf_blocks[b]
        qkv_layers = (block.att.W_query, block.att.W_key, block.att.W_value)

        # Q, K and V arrive concatenated along the last axis; split them into
        # three equal parts. All three weights are assigned first, then all
        # three biases.
        qkv_weights = np.split(
            (layer_params["attn"]["c_attn"])["w"], 3, axis=-1)
        for layer, weight in zip(qkv_layers, qkv_weights):
            layer.weight = assign(layer.weight, weight.T)

        qkv_biases = np.split(
            (layer_params["attn"]["c_attn"])["b"], 3, axis=-1)
        for layer, bias in zip(qkv_layers, qkv_biases):
            layer.bias = assign(layer.bias, bias)

        # Attention output projection
        load_linear(block.att.out_proj, layer_params["attn"]["c_proj"])

        # MLP: layers[0] is the expansion, layers[2] the contraction
        load_linear(block.ff.layers[0], layer_params["mlp"]["c_fc"])
        load_linear(block.ff.layers[2], layer_params["mlp"]["c_proj"])

        # Layer norms
        load_norm(block.norm1, layer_params["ln_1"])
        load_norm(block.norm2, layer_params["ln_2"])

    # Final layer norm
    load_norm(gpt.final_norm, params["ln_f"])

    # The output head receives its own copy of the token-embedding matrix.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

print("‚úÖ load_weights_into_gpt function defined")

‚úÖ load_weights_into_gpt function defined


### Create Model and Load Pretrained Weights

In [34]:
# Create model with the correct configuration
gpt = GPTModel(NEW_CONFIG)
gpt.eval()  # evaluation mode, so the Dropout layers are inactive during generation

# Load OpenAI weights
# The parameters are replaced first and the model is moved afterwards, so the
# newly assigned tensors end up on `device` as well.
load_weights_into_gpt(gpt, params)
gpt.to(device)

print("\n‚úÖ Pretrained GPT-2 weights loaded successfully!")


‚úÖ Pretrained GPT-2 weights loaded successfully!


---
## 🎯 Text Generation with Pretrained Model

### Helper Functions

In [35]:
# Initialize tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor of shape (1, num_tokens).

    ``<|endoftext|>`` is passed as an allowed special token to the tokenizer.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Indexing with None inserts the leading batch dimension.
    return torch.tensor(ids)[None]

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into a string.

    A leading dimension of size one (the batch dimension) is dropped before
    the ids are handed to ``tokenizer.decode``.
    """
    id_list = token_ids.squeeze(dim=0).tolist()
    return tokenizer.decode(id_list)

print("‚úÖ Tokenizer and helper functions ready")

‚úÖ Tokenizer and helper functions ready


### Generate Function with Temperature and Top-k Sampling

In [36]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """
    Generate text tokens from the model.

    Works for any batch size. With a batch of one, the behaviour is the same
    as before: generation stops as soon as ``eos_id`` is produced and the EOS
    token itself is not appended. With larger batches, a row that has produced
    ``eos_id`` is padded with ``eos_id`` while the remaining rows continue,
    and generation stops once every row has finished.

    Args:
        model: GPTModel instance
        idx: Input token indices (batch_size, seq_len)
        max_new_tokens: Maximum number of new tokens to generate
        context_size: Maximum context length
        temperature: Sampling temperature (0 = greedy, higher = more random)
        top_k: Only sample from top k tokens (values larger than the
            vocabulary size are clamped)
        eos_id: End of sequence token ID

    Returns:
        Generated token indices
    """
    # Tracks, per row, whether the end-of-sequence token has been produced.
    finished = torch.zeros(idx.shape[0], dtype=torch.bool, device=idx.device)

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # logits for the last position only

        # Filter logits with top_k sampling
        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing dimension so the per-row threshold has shape
            # (batch, 1) and broadcasts against (batch, vocab).
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        # Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        if eos_id is not None:
            finished = finished | (idx_next.squeeze(-1) == eos_id).to(finished.device)
            if finished.all():
                break
            # Rows that already ended keep receiving eos_id as padding.
            idx_next = idx_next.masked_fill(finished.unsqueeze(-1).to(idx_next.device), eos_id)

        idx = torch.cat((idx, idx_next), dim=1)

    return idx

print("‚úÖ generate function defined")

‚úÖ generate function defined


### Test Text Generation

In [37]:
# Generate text with the pretrained model
# Sampling is random (temperature > 0), so the seed is fixed to make the
# output below reproducible.
torch.manual_seed(123)

token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),  # prompt ids on the model's device
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5
)

print("üéØ Generated text:")
print("-" * 50)
print(token_ids_to_text(token_ids, tokenizer))

üéØ Generated text:
--------------------------------------------------
Every effort moves you toward finding an ideal new way to practice something!

What makes us want to be on top of that?




In [38]:
# Try another prompt
# A different seed and milder sampling settings (top_k=40, temperature=1.0).
torch.manual_seed(42)

prompt = "Artificial intelligence is"
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids(prompt, tokenizer).to(device),  # prompt ids on the model's device
    max_new_tokens=50,
    context_size=NEW_CONFIG["context_length"],
    top_k=40,
    temperature=1.0
)

print(f"üéØ Prompt: '{prompt}'")
print("-" * 50)
print(token_ids_to_text(token_ids, tokenizer))

üéØ Prompt: 'Artificial intelligence is'
--------------------------------------------------
Artificial intelligence is more than just a tool. It has more potential than science to uncover problems and discover new possibilities for humanity. Today's technologies give us insights into life's most profound and immediate dangers. They can provide insight into our world as we know it, and


---
## ✅ Summary

In this notebook, we covered:

1. **GPT-2 Architecture** - Built the complete model from scratch:
   - `MultiHeadAttention` - Self-attention with causal masking
   - `LayerNorm`, `GELU`, `FeedForward` - Supporting components
   - `TransformerBlock` - Combines attention and feedforward with residual connections
   - `GPTModel` - Complete architecture with embeddings and output head

2. **PyTorch Model Saving/Loading**:
   - `torch.save(model.state_dict(), "model.pth")` - Save weights
   - `model.load_state_dict(torch.load("model.pth", weights_only=True))` - Load weights
   - Save/load optimizer state for resuming training

3. **OpenAI Pretrained Weights**:
   - Download from OpenAI's public blob storage
   - Convert TensorFlow checkpoint to PyTorch
   - Generate coherent text with the pretrained model

The model is now ready for fine-tuning on specific tasks!

In [39]:
# Check saved model files
import os

saved_files = ["model.pth", "model_and_optimizer.pth"]
print("üìÅ Saved model files:")
print("-" * 50)
for f in saved_files:
    if os.path.exists(f):
        size_mb = os.path.getsize(f) / (1024 * 1024)
        full_path = os.path.abspath(f)
        print(f"‚úÖ {f} ({size_mb:.2f} MB)")
        print(f"   Path: {full_path}")
    else:
        print(f"‚ùå {f} - not found")
print("-" * 50)
print("\nüí° You can load these files with:")
print("   torch.load('model.pth', weights_only=True)")
print("   torch.load('model_and_optimizer.pth', weights_only=True)")

üìÅ Saved model files:
--------------------------------------------------
‚úÖ model.pth (669.88 MB)
   Path: /content/model.pth
‚úÖ model_and_optimizer.pth (669.89 MB)
   Path: /content/model_and_optimizer.pth
--------------------------------------------------

üí° You can load these files with:
   torch.load('model.pth', weights_only=True)
   torch.load('model_and_optimizer.pth', weights_only=True)
