In [1]:
from modules.gpt2_download import download_and_load_gpt2

In [2]:
# Fetch the GPT-2 124M checkpoint into ../models/gpt2 and read it back as
# a settings dict plus a nested dict of parameter arrays.
settings, params = download_and_load_gpt2(
    model_size="124M",
    models_dir="../models/gpt2",
)

print("Settings:", settings)
print("\nParams keys:", params.keys())

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe
Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

Params keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [3]:
from modules import GPT_CONFIG_124M, GPTModel

# Architecture hyperparameters that differ between the GPT-2 model sizes.
model_configs = {
    "gpt2-small (124M)": {
        "emb_dim": 768, "n_layers": 12, "n_heads": 12,
    },
    "gpt2-medium (355M)": {
        "emb_dim": 1024, "n_layers": 24, "n_heads": 16,
    },
    "gpt2-large (774M)": {
        "emb_dim": 1280, "n_layers": 36, "n_heads": 20,
    },
    "gpt2-xl (1558M)": {
        "emb_dim": 1600, "n_layers": 48, "n_heads": 25,
    },
}

# Build the final config: start from the base 124M settings, overlay the
# size-specific values, then set the context length and Q/K/V bias flag so
# the module shapes line up with the checkpoint loaded above (n_ctx = 1024).
model_name = "gpt2-small (124M)"  # Example model name
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update({
    **model_configs[model_name],
    "context_length": 1024,
    "qkv_bias": True,
})

gpt = GPTModel(NEW_CONFIG)
print(gpt)
gpt.eval();

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (transformerlayers): Sequential(
    (0): TransformerBlock(
      (attention): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (feedforward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attention): MultiHeadAttention(
        (W_

In [4]:
def assign(left, right):
    """Return a new trainable parameter holding a copy of ``right``.

    ``left`` is only used as a shape reference: it is the parameter that is
    about to be replaced, so a mismatch means the checkpoint array is being
    loaded into the wrong slot (or is missing a transpose).

    Args:
        left: Existing tensor/parameter whose ``.shape`` defines the expected shape.
        right: Replacement values, either a NumPy array or a ``torch.Tensor``.

    Returns:
        A ``torch.nn.Parameter`` containing a copy of ``right``.

    Raises:
        ValueError: If ``left.shape`` and ``right.shape`` differ.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    if isinstance(right, torch.Tensor):
        # torch.tensor(<tensor>) emits a UserWarning; clone().detach() is the
        # recommended way to copy an existing tensor out of any autograd graph.
        return torch.nn.Parameter(right.clone().detach())
    return torch.nn.Parameter(torch.tensor(right))

In [6]:
import torch
# Start from CPU so `device` is always bound; previously, a machine with MPS
# available but PyTorch older than 2.9 fell through every branch and the
# print below raised NameError.
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    # Use PyTorch 2.9 or newer for stable mps results
    major, minor = map(int, torch.__version__.split(".")[:2])
    if (major, minor) >= (2, 9):
        device = torch.device("mps")

print("Device:", device)

Device: mps


In [7]:
import numpy as np
import torch

def _load_linear(linear, src):
    """Copy a checkpoint ``{"w", "b"}`` pair into ``linear`` (weight transposed)."""
    linear.weight = assign(linear.weight, src["w"].T)
    linear.bias = assign(linear.bias, src["b"])


def _load_norm(norm, src):
    """Copy a checkpoint ``{"g", "b"}`` pair into ``norm.scale`` / ``norm.shift``."""
    norm.scale = assign(norm.scale, src["g"])
    norm.shift = assign(norm.shift, src["b"])


def load_weights_into_gpt(gpt, params):
    """Overwrite the parameters of ``gpt`` with the arrays in ``params``.

    Every target parameter is replaced via ``assign``, which checks shapes
    before creating the new parameter.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``transformerlayers``,
            ``finalnorm`` and ``out_head``.
        params: Checkpoint dict with keys ``'wpe'``, ``'wte'``, ``'g'``,
            ``'b'`` and ``'blocks'`` (one entry per transformer layer).
    """
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for layer_idx in range(len(params["blocks"])):
        src = params["blocks"][layer_idx]

        # The checkpoint stores Q, K and V fused in one `c_attn` array;
        # cut it into three equal parts along the last axis.
        qkv_w = np.split(src["attn"]["c_attn"]["w"], 3, axis=-1)
        layer = gpt.transformerlayers[layer_idx]
        attn = layer.attention
        qkv_layers = (attn.W_query, attn.W_key, attn.W_value)
        for proj, w in zip(qkv_layers, qkv_w):
            proj.weight = assign(proj.weight, w.T)

        qkv_b = np.split(src["attn"]["c_attn"]["b"], 3, axis=-1)
        for proj, bias in zip(qkv_layers, qkv_b):
            proj.bias = assign(proj.bias, bias)

        _load_linear(attn.out_proj, src["attn"]["c_proj"])

        # Feed-forward: index 0 is the expansion layer, index 2 the projection.
        ff_layers = layer.feedforward.layers
        _load_linear(ff_layers[0], src["mlp"]["c_fc"])
        _load_linear(ff_layers[2], src["mlp"]["c_proj"])

        _load_norm(layer.norm1, src["ln_1"])
        _load_norm(layer.norm2, src["ln_2"])

    gpt.finalnorm.scale = assign(gpt.finalnorm.scale, params["g"])
    gpt.finalnorm.shift = assign(gpt.finalnorm.shift, params["b"])
    # The output head is loaded from the token-embedding matrix.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


# Copy the downloaded checkpoint arrays into the model built earlier, then
# move the model to the selected device. The trailing `;` suppresses the
# module repr that `gpt.to(...)` would otherwise display as the cell output.
load_weights_into_gpt(gpt, params)
gpt.to(device);

<h4>What's happening above</h4>

* **Assign Embeddings:** Copy `wpe` and `wte` directly.
    * *Reason:* PyTorch and TensorFlow store simple lookup tables identically.


* **Split Attention:** Cut `c_attn` into 3 equal chunks (Q, K, V).
    * *Reason:* OpenAI bundles Q, K, and V into one matrix for speed; this PyTorch model keeps them as separate `W_query`, `W_key`, and `W_value` layers.


* **Transpose Weights (`.T`):** Flip every Linear layer matrix (Attention & MLP).
    * *Reason:* TensorFlow stores weights as `[Input, Output]`, but PyTorch expects `[Output, Input]`.


* **Keep Biases 1D:** Load biases (`b`) without transposing.
    * *Reason:* 1D vectors (lines of numbers) look the same whether flipped or not.


* **Rename Layer Norms:** Map `g` → `scale` and `b` → `shift`.
    * *Reason:* Just a difference in variable naming conventions between frameworks.


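* **Tie Weights:** Initialize the final `out_head` weight from the input `wte`.
    * *Reason:* The model uses the exact same dictionary to read words (input) as it does to write them (output). Note that `assign` creates a separate copy, so after loading the two matrices hold equal values but are independent parameters.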


In [14]:
from modules import generate, text_to_token_ids, token_ids_to_text
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
# Seed the RNG before generating so the sampled continuation is repeatable.
torch.manual_seed(123)

# Encode the prompt and place it on the same device as the model.
prompt_ids = text_to_token_ids("Every effort moves you", tokenizer).to(device)

token_ids = generate(
    model=gpt,
    idx=prompt_ids,
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5,
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you toward more efficient and efficient processes, like in the car's oil and gas operation," the study said. To see if that
