In [1]:
import os 
from pathlib import Path

# Resolve the repository root once per kernel session.  This assumes the
# notebook's start directory sits directly under the repository root.
# os.chdir() below changes what Path.cwd() returns, so without the guard every
# re-run of this cell would climb one directory higher.
if "project_root" not in globals():
    project_root = Path.cwd().parents[0]
os.chdir(project_root)
print("Set project_root:", project_root)


Set project_root: /home/sromo/Repos/lm-testbed


In [2]:
%%script echo SKIPPED

import urllib.request
from pathlib import Path

# Helper module published with the "LLMs-from-scratch" repository.
url = (
"https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch05/"
"01_main-chapter-code/gpt_download.py"
)
filename = url.split('/')[-1]

# The next cell imports this module as `scripts.gpt_download`, so store it in
# ./scripts (relative to the working directory, which the first cell sets to
# the project root) rather than in the working directory itself.
destination = Path("scripts") / filename
destination.parent.mkdir(parents=True, exist_ok=True)

# Keep an existing copy; only fetch when the file is missing.
if not destination.exists():
    urllib.request.urlretrieve(url, destination)

SKIPPED


In [3]:
import os
from scripts.gpt_download import download_and_load_gpt2
# Directory under the project root that holds the GPT-2 124M checkpoint files;
# it is created (including parents) when missing.
model_dir = project_root.joinpath("data", "model_parameters", "openai_GPT2_124M")
model_dir.mkdir(parents=True, exist_ok=True)

# Returns the settings and the parameter dictionary inspected in the cells below.
settings, params = download_and_load_gpt2(models_dir=model_dir, model_size="124M")

2026-02-16 09:27:15.546741: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


File already exists and is up-to-date: /home/sromo/Repos/lm-testbed/data/model_parameters/openai_GPT2_124M/124M/checkpoint
File already exists and is up-to-date: /home/sromo/Repos/lm-testbed/data/model_parameters/openai_GPT2_124M/124M/encoder.json
File already exists and is up-to-date: /home/sromo/Repos/lm-testbed/data/model_parameters/openai_GPT2_124M/124M/hparams.json
File already exists and is up-to-date: /home/sromo/Repos/lm-testbed/data/model_parameters/openai_GPT2_124M/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: /home/sromo/Repos/lm-testbed/data/model_parameters/openai_GPT2_124M/124M/model.ckpt.index
File already exists and is up-to-date: /home/sromo/Repos/lm-testbed/data/model_parameters/openai_GPT2_124M/124M/model.ckpt.meta
File already exists and is up-to-date: /home/sromo/Repos/lm-testbed/data/model_parameters/openai_GPT2_124M/124M/vocab.bpe


In [4]:
# Inspect the two objects returned by download_and_load_gpt2: the settings
# and the top-level keys of the parameter dictionary.
print("Settings:\n", settings)
print("\nParameter dictionary keys:\n", params.keys())

Settings:
 {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

Parameter dictionary keys:
 dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [5]:
# Show the token-embedding array stored under "wte" together with its shape.
print(params["wte"])
print("\nToken embedding weight tensor dimensions:", params["wte"].shape)

[[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]

Token embedding weight tensor dimensions: (50257, 768)


**Loading parameters into our model**

In [6]:
# Size-specific hyperparameters for each GPT-2 variant, keyed by display name.
# Columns of the table below: name, emb_dim, n_layers, n_heads.
model_configs = {
    name: {"emb_dim": emb_dim, "n_layers": n_layers, "n_heads": n_heads}
    for name, emb_dim, n_layers, n_heads in (
        ("gpt2-small (124M)", 768, 12, 12),
        ("gpt2-medium (355M)", 1024, 24, 16),
        ("gpt2-large (774M)", 1280, 36, 20),
        ("gpt2-xl (1558M)", 1600, 48, 25),
    )
}

In [7]:
from src.configs.GPT2 import GPT_CONFIG_124M

model_name = "gpt2-small (124M)"

# Layer the configuration: base 124M config first, then the size-specific
# values for `model_name`, then the two overrides below.
NEW_CONFIG = {
    **GPT_CONFIG_124M,
    **model_configs[model_name],
    "context_length": 1024,     # Previously we set this to 256
    "qkv_bias": True,           # Part of GPT2 architecture, but not commonly used anymore
}

import json
print(json.dumps(NEW_CONFIG, indent=4))

{
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate_attn": 0.1,
    "drop_rate_shortcut": 0.1,
    "drop_rate_emb": 0.1,
    "qkv_bias": true
}


In [8]:
from src.models.GPT2 import GPTModel

# Build the model from NEW_CONFIG (assembled in the previous cell); its
# weights are still the random initial values at this point.
gpt = GPTModel(NEW_CONFIG)
# Switch to evaluation mode. As the cell's last expression, the returned
# model is what the notebook displays.
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [9]:
# Utility Functions
import torch

def assign(left:torch.Tensor, right:"np.ndarray | torch.Tensor"):
    """Return ``right`` as a new trainable parameter that can replace ``left``.

    Args:
        left: Parameter/tensor that is about to be replaced. Only its shape,
            dtype and device are used.
        right: Array-like values to load (a NumPy array or a tensor). The
            data is copied, so later changes to ``right`` do not affect the
            returned parameter.

    Returns:
        A ``torch.nn.Parameter`` holding a copy of ``right`` with the dtype
        and device of ``left``.

    Raises:
        ValueError: If ``left`` and ``right`` do not have the same shape.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, "
                         f"Right: {right.shape}"
                         )
    if isinstance(right, torch.Tensor):
        # torch.tensor(<tensor>) emits a copy-construct warning; clone instead.
        values = right.detach().clone()
    else:
        values = torch.tensor(right)
    # Keep the replacement on the same device and in the same dtype as the
    # parameter it stands in for, so the module stays internally consistent.
    return torch.nn.Parameter(values.to(device=left.device, dtype=left.dtype))

import numpy as np
import torch.nn as nn

def load_weights_into_gpt(gpt:nn.Module, params:dict):
    """Copy the GPT-2 checkpoint arrays in ``params`` into ``gpt``.

    Every targeted parameter of ``gpt`` is replaced by the value returned from
    ``assign``, which raises ``ValueError`` when the shapes differ. Weight
    matrices of linear layers are transposed (``.T``) before assignment, and
    the fused ``c_attn`` weight and bias are split into three equal parts
    along the last axis to obtain the query, key and value pieces.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Dictionary with the keys ``"wpe"``, ``"wte"``, ``"g"``,
            ``"b"`` and ``"blocks"``; ``params["blocks"][i]`` holds the
            ``"attn"``, ``"mlp"``, ``"ln_1"`` and ``"ln_2"`` entries of
            transformer block ``i``.

    Raises:
        ValueError: If ``params`` and ``gpt`` disagree on the number of
            transformer blocks, or if any array has an unexpected shape.
    """
    # Fail before touching the model: with fewer checkpoint blocks than model
    # blocks the remaining layers would silently keep their random weights.
    n_param_blocks = len(params["blocks"])
    n_model_blocks = len(gpt.trf_blocks)
    if n_param_blocks != n_model_blocks:
        raise ValueError(
            f"Block count mismatch. Model: {n_model_blocks}, "
            f"Params: {n_param_blocks}"
        )

    # Positional and token embeddings
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(n_param_blocks):
        block = gpt.trf_blocks[b]
        block_params = params["blocks"][b]
        att = block.att
        attn_params = block_params["attn"]

        # Attention: Weight Matrices (fused QKV -> query / key / value)
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        att.W_query.weight = assign(att.W_query.weight, q_w.T)
        att.W_key.weight = assign(att.W_key.weight, k_w.T)
        att.W_value.weight = assign(att.W_value.weight, v_w.T)

        # Attention: Weight Matrices Bias Vectors
        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        att.W_query.bias = assign(att.W_query.bias, q_b)
        att.W_key.bias = assign(att.W_key.bias, k_b)
        att.W_value.bias = assign(att.W_value.bias, v_b)

        # Attention: Output Projection Matrix and Bias
        att.out_proj.weight = assign(att.out_proj.weight, attn_params["c_proj"]["w"].T)
        att.out_proj.bias = assign(att.out_proj.bias, attn_params["c_proj"]["b"])

        # Feed Forward Blocks: layers[0] and layers[2] are the two linear
        # layers (layers[1] is the activation and has no parameters to load).
        mlp_params = block_params["mlp"]
        ff_in, ff_out = block.ff.layers[0], block.ff.layers[2]
        ff_in.weight = assign(ff_in.weight, mlp_params["c_fc"]["w"].T)
        ff_in.bias = assign(ff_in.bias, mlp_params["c_fc"]["b"])
        ff_out.weight = assign(ff_out.weight, mlp_params["c_proj"]["w"].T)
        ff_out.bias = assign(ff_out.bias, mlp_params["c_proj"]["b"])

        # Pre-Norm 1 (attention) and Pre-Norm 2 (feed forward)
        for norm, key in ((block.norm1, "ln_1"), (block.norm2, "ln_2")):
            norm.scale = assign(norm.scale, block_params[key]["g"])
            norm.shift = assign(norm.shift, block_params[key]["b"])

    # Final Norm
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])

    # Linear Output Layer - OpenAI's GPT2 reused the token embedding weights in the output layer.
    # assign() copies the data into a new Parameter on every call, so tok_emb
    # and out_head start with equal values but remain separate parameters.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


In [10]:
# Replace the model's randomly initialized parameters with the checkpoint values.
load_weights_into_gpt(gpt, params)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell: its return value (the model) is what is displayed.
gpt.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [11]:
import tiktoken

from src.utils.generate import generate
from src.utils.token_converter import text_to_token_ids, token_ids_to_text

tokenizer = tiktoken.get_encoding("gpt2")

# Seed torch's RNG before generating so the run is reproducible.
torch.manual_seed(123)

# Encode the prompt and place it on the same device as the model.
prompt_ids = text_to_token_ids("Every effort moves you", tokenizer).to(device)
token_ids = generate(
    model=gpt,
    idx=prompt_ids,
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=50,
    temperature=1.5,
)

decoded_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:\n", decoded_text)

Output text:
 Every effort moves you as far as the hand can go until the end of your turn unless something interrupts your control flow. As you may observe I


In [12]:
from src.dataloaders.GPT import create_dataloader_v1

# Read the whole text file into memory
file_path = project_root / "data" / "raw" / "the-verdict.txt"
text_data = file_path.read_text(encoding="utf-8")

total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters: ", total_characters)
print("Tokens:", total_tokens)
print()

# Character-level split: the first 90% is training text, the rest validation text
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

# Arguments common to both loaders.
# max_length: change back to the 256 context window we previously trained on
shared_loader_args = dict(batch_size=2, max_length=256, stride=256, num_workers=0)

torch.manual_seed(123)
train_loader = create_dataloader_v1(
    train_data, drop_last=True, shuffle=True, **shared_loader_args
)
val_loader = create_dataloader_v1(
    val_data, drop_last=False, shuffle=False, **shared_loader_args
)

# Print the shape of every (input, target) batch, training loader first
for heading, loader in (
    ("Train loader (x-input data, y-target data):", train_loader),
    ("\nValidation loader: (x-input data, y-target data)", val_loader),
):
    print(heading)
    for x, y in loader:
        print(x.shape, y.shape)

Characters:  20479
Tokens: 5145

Train loader (x-input data, y-target data):
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader: (x-input data, y-target data)
torch.Size([2, 256]) torch.Size([2, 256])


In [13]:
from src.loss.cross_entropy import calc_loss_loader 

# Loss of the loaded model on the training and validation loaders.
# The device is selected again and the model moved to it, so this cell does
# not rely on the values set in an earlier cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt.to(device)
# Evaluation only: no gradients need to be tracked.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, gpt, device)
    val_loss = calc_loss_loader(val_loader, gpt, device)
print("Training loss:\n", train_loss)
print("\nValidation loss:\n", val_loss)

Training loss:
 3.7547629674275718

Validation loss:
 3.559634208679199
