## **Building a LLM from Scratch on DGX Spark**

In [25]:
import tiktoken
import torch
import torch.nn as nn
import numpy

## Multi-head Attention
The MultiHeadAttention class below invokes parrallel attention heads (for GPT2-Small) with each attention head learning different Q,K,V transformations to capture diverse patterns - like grammatical structure, semantic meaning, and position relationship between words. Each head computes scaled attention scores (scaled Q. K^T) to create weighted sums of values. The weighted sum of values represent a NEW, CONTEXT-AWARE representation of each token. Essentially, it's the token's original information enriched with relevant information from other tokens it should pay attention to. Casual masking is applied to these attention scores to prevent future token access, and all heads are concatenated to provide a rich, multi-perspective representation of the input.

In [5]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out,
                 context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(
            b, num_tokens, self.num_heads, self.head_dim
        )

        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        attn_scores = queries @ keys.transpose(2, 3)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1, 2)

        context_vec = context_vec.contiguous().view(
            b, num_tokens, self.d_out
        )
        context_vec = self.out_proj(context_vec)
        return context_vec

### LayerNorm Class
The LayerNorm class normalizes each token's embedding to have mean=0 and variance=1. Then it applies learned scale and shift parameters. This stablizes training by preventing internal covariate shift - ensuring that values don't become too large or too small as they pass through many layers; similar to how standardizing exam scores makes theme easier to compare

In [6]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

### GELU Activation
Activation functions are critical in deep learning networks as they add non-linearity.GELU stands for Gaussian Error Linear Unit. It is a smooth activation function that applies f(x) = x * Φ(x) where Φ is the cumulative distribution function of the standard normal distribution. It's similar to ReLU but smoother(no sharp corner at zero), allowing small negative values to pass through with reduced magnitude which helps gradients flow better during training - it's a softer gate that does not kill negative values completely 

In [7]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))

### FeedForward Class
The FeedForward class is a simple two-layer neural network that processes each token independently; it expands the embedding dimension by 4. So that means if the input tokens have a dimension of 768, it is multipled by 4 to give 3072. Subsequently GELU is applied for non-linearity activation, and then it is projected back to 768 dimention. This essentially gives each token a chance to "think" about it's contextualized representation from attention, transforming the features through a bottlenceck that learns complex patterns. It is almost like compressing and decompressing information to extract richer features from the information.

In [8]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)

### GPT Model Configurations

In [9]:
# GPT-2 Small (124M parameters)
GPT_CONFIG_124M = {
    "vocab_size": 50257,        # Number of tokens in vocabulary
    "context_length": 1024,     # Maximum sequence length
    "emb_dim": 768,             # Embedding dimension
    "n_heads": 12,              # Number of attention heads
    "n_layers": 12,             # Number of transformer blocks
    "drop_rate": 0.1,           # Dropout probability
    "qkv_bias": False           # Use bias in Q, K, V projections?
}

# GPT-2 Medium (355M parameters)
GPT_CONFIG_355M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 1024,            # Larger embedding
    "n_heads": 16,              # More heads
    "n_layers": 24,             # More layers
    "drop_rate": 0.1,
    "qkv_bias": False
}

# For testing (tiny model)
GPT_CONFIG_TINY = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 256,
    "n_heads": 4,
    "n_layers": 4,
    "drop_rate": 0.1,
    "qkv_bias": False
}

### TransformerBlock Class - The result of years of deep learning research

Each token passes through two stages: (1) Multi-Head Attention gathers context from previous tokens (e.g. "sat" learns about "cat"), and (2) Feed-Forward processes this information through a neural network. The critical innovation is Residual connections symbolized by the shortcut variable below. This is where we save the input before each stage and add it back (x = x + shortcut), which prevents vanishing gradients and preserves each token's identity while enriching it with extra context.

Layer Normalization before each stage keep values stable by normalizing to mean = 0 and variance = 1, preventing explosion or vanishing. Dropout has been set in the above configurations to 0.1 which means 10% random zeroing during training helps prevent overfitting by forcing the neural network to not rely on specific neurons. The overall pattern - normalize, transform, add residual-repeated for attention and feed-forward creates contextualized representations: for example, "cat" remains "cat" but is now enriched with "subject who is sitting". Stack 12 of these transformer blocks and you get GPT2-Small.


In [10]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):

        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x

### GPTModel Class - The Complete Architecture

Takes token IDs as input and outputs probability distributions over the vocabulary for predicting the next tokens. The flow: (1) Convert token IDs to embeddings and add positional information, (2) pass through a stack of 12 transformer blocks (declared above) where each token gathers context (learns more about itself) and processes this information, (3) apply final normalization and project back to vocabulary size to get logits (raw scores) for each possible next token.

The key insight here is that by stacking multiple transformer blocks, each token's representation becomes progressively richer - early layers learn simple patterns like nearby words, middle layers learn grammar and syntax, and deep layers learn complex semantics and reasoning. The output logits can be converted to probabilities with softmax, then sampled to generate the next token, repeating this process to generate entire lengthy sequences of text

In [11]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)

        pos_embeds = self.pos_emb(
            torch.arange(seq_len, device=in_idx.device)
        )
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

### Generating Outputs using the GPTModel

The generate_text_simple function generates text autoregressively (looking backwards) by repeatedly feeding the current sequence to the model, taking the highest-probability next token from the output logits, and appending it to the sequence. This process repeats for max_new_tokens iterations, building up the generated text one token at a time by always selecting the most likely continutation.


In [12]:
def generate_text_simple(model, idx,max_new_tokens, context_size):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)

        logits = logits[:, -1, :]
        probas = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)

    return idx

### Encoding, decoding and gibberish outputs

The text_to_token_ids function converts a text string into token IDs using the tokenizer's vocabulary, and then wraps the result in a PyTorch tensor with a batch dimension so it can be fed into the model, which expects input shape of (batch_size, sequence length)

On the other hand, the token_ids_to_text converts token IDs back to readable text by removing the batch dimension (done through the call to squeeze) and decoding the token ID list back to a string using the tokenizer's vocabulary

Models need batch dimension due to parallelization so it can process multiple sentences at once with a GPU. The unsqueeze function call changes the text "Hello" from sequence length (2,) to (batch, sequence length) of (1,2). This also helps in matrix operations working correctly. The squeeze function removes the batch dimension because the tokenizer expects list of IDs, and those are simpler to work with. unsqueeze(0) adds a batch dimension so PyTorch is happy, squeeze(0) removes it so the tokenizer is happy!

The output of the prompt below is gibberish because the model has random, untrained weights - it was jut initialized with GPTModel(GPT_CONFIG_124M) but never trained or loaded with pre-trained weights, so it has no knowledge of language. To get the coherent text either you (1) load pre-trained weights that most frontier model companies have publicly available, or (2) train the model on text data, which is known as PRE-TRAINING.

In [13]:
torch.manual_seed(1456)
model = GPTModel(GPT_CONFIG_124M)

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

start_context = "What is the meaning of life?"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 What is the meaning of life? Times steroids OPSrem103 LIC superior wherein Tycooningly


### Pre-training the model on the Gutenberg dataset

The cell below imports 50 book titles from the Gutenberg dataset including Alice in Wonderland, A Tale of Two Cities, Sherlock Holmes, Pride and Prejudice amongst others. Subsequently, we read each book's content into memory, and concatenate them all with newlines between books to create one large corpus string called text_data for training. Then we split the text data into training (90%) and validation (10%) sets by calculating a split index at the 90% point - the model will learn from train_data and we will evaluate it's performance using val_data to detect overfitting. 

In [None]:
import requests
import time

def download_gutenberg_corpus(num_books=50):
    os.makedirs("gutenberg_books", exist_ok=True)
    
    book_ids = [
        1342, 84, 1661, 11, 98, 2701, 1260, 174, 46, 345,
        74, 76, 244, 1400, 100, 2600, 1497, 5200, 64317, 16328,
        1952, 215, 1727, 1232, 2814, 145, 408, 1184, 205, 161,
        236, 1080, 514, 158, 43, 2148, 1635, 1259, 996, 6130,
        4300, 120, 219, 768, 1250, 271, 30254, 2097, 135, 2500,
        31284, 209, 863, 1298, 829, 308, 1404, 3825, 730, 1155,
        1250, 521, 1998, 203, 1322, 1984, 844, 2591, 699, 2265,
        140, 25344, 375, 45, 19337, 113, 1399, 2554, 1946, 1257,
        19942, 6593, 160, 41, 1028, 2542, 16389, 526, 829, 1304,
        3090, 580, 2500, 15399, 2160, 1998, 1184, 3207, 996
    ][:num_books]
    
    print(f"Downloading {len(book_ids)} books from Project Gutenberg...")
    print("This may take a few minutes...\n")
    
    successful = 0
    for i, book_id in enumerate(book_ids):
        filepath = os.path.join("gutenberg_books", f"pg{book_id}.txt")
        
        if os.path.exists(filepath):
            print(f"[{i+1}/{len(book_ids)}] ✓ Already exists: pg{book_id}.txt")
            successful += 1
            continue
        
        url = f"https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt"
        
        try:
            print(f"[{i+1}/{len(book_ids)}] Downloading book {book_id}...", end=" ")
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(response.text)
            
            print("✓")
            successful += 1
            time.sleep(0.5)
            
        except requests.exceptions.HTTPError:
            alt_url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
            try:
                print(f"trying alternate URL...", end=" ")
                response = requests.get(alt_url, timeout=30)
                response.raise_for_status()
                
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                
                print("✓")
                successful += 1
                time.sleep(0.5)
                
            except Exception as e:
                print(f"✗ Failed")
        
        except Exception as e:
            print(f"✗ Failed: {e}")
    
    print(f"\n{'='*60}")
    print(f"✓ Download complete!")
    print(f"  Successfully downloaded: {successful}/{len(book_ids)} books")
    print(f"  Location: ./gutenberg_books/")
    print(f"{'='*60}")
    
    return successful

download_gutenberg_corpus(num_books=50)

In [16]:
import os
import glob 
import random

gutenberg_path = "gutenberg_books"

txt_files = glob.glob(os.path.join(gutenberg_path, "*.txt"))
print(f"Found {len(txt_files)} books")

random.seed(42)
selected_files = random.sample(txt_files, min(50, len(txt_files)))

all_texts = []
for filepath in selected_files:
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        all_texts.append(f.read())

text_data = "\n\n".join(all_texts)

print(f"Loaded {len(all_texts)} books")
print(f"Total characters: {len(text_data):,}")

Found 50 books
Loaded 50 books
Total characters: 42,475,753


In [17]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 42475753
Tokens: 11798600


In [18]:
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

### GPU Availability Check

In [19]:
import torch

print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print("✓ GPU is ready!")
else:
    print("✗ Still CPU only")

PyTorch: 2.5.1
CUDA available: True
GPU: NVIDIA GB10
✓ GPU is ready!


### GPTDataset Class

The GPTDataset class converts raw text into training examples by tokenizing it and creating sliding windows of input-target pairs—each input is a sequence of tokens, and the target is the same sequence shifted by one position (predicting the next token). For example, if input is [15496, 11, 995], the target is [11, 995, 0], teaching the model to predict each next token in the sequence.


In [20]:
import torch
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

### DataLoader Function

The DataLoader function creates a data loader by converting text into a GPTDatasetV1 (which generates input-target pairs), then wraps it in PyTorch's DataLOader to handle batching (grouping multiple examples together), shuffling (randomizing order for better training). and iteration - this allows you to loop through "for input_batch, target_batch in dataloader" during training with each batch containing batch_size (in this case 4) examples ready for the GPU.

After this, we create two separate DataLoaders - one for training data (shuffled to prevent the model from memorizing the order) and one for validation data (not shuffled for consistent evaluation) with batch_size = 2 meaning each iteration provides 2 sequences at once, and stride = context_length (1024) ensuring no overlap between training windows to use data efficiently without reptition.

In [22]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)
val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

print (val_loader)
print(f"Train loader: {len(train_loader)} batches")
print(f"Val loader: {len(val_loader)} batches")

<torch.utils.data.dataloader.DataLoader object at 0xf77a60c5a8a0>
Train loader: 5211 batches
Val loader: 550 batches


### Loss Calculation Functions

calc_loss_batch moves input and target tensors to GPU, runs them through the model to get predictions (logits), and computes cross-entropy loss (measuring how wrong the predictions are). calc_loss_loader evaluates the model on multiple batches from a dataloader and returns the average loss - useful for checking training/validaton performance without training, limiting to num_batches for faster evaluation

In [23]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(
        logits.flatten(0, 1), target_batch.flatten()
    )
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(
                input_batch, target_batch, model, device
            )
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

### Initialize Model on GPU and check initial loss

Sets device to "CUDA" for GPU training, creates a fresh GPTModel with random weights, moves it to GPU and calculates initial losses on 5 batches to establish a baseline. Untrained models typically have loss around 10-11 which should decrease dramatically during training as the model learns patters in the text

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("="*60)
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print("="*60)

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)

print(f"\nModel parameters: {sum(p.numel() for p in model.parameters()):,}")

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print(f"\nInitial losses (untrained model):")
print(f"  Training loss: {train_loss:.3f}")
print(f"  Validation loss: {val_loss:.3f}")

Using device: cuda
GPU: NVIDIA GB10

Model parameters: 163,009,536


RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
