# Build your own GPT

### 1. Choose Training and Model Hyperparameters

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Training hyperparameters
# Number of independent sequences will we process in parallel
batch_size = 16 
# Maximum context length for predictions
block_size = 32
# Number of training iterations
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
#To use a GPU or not!
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200


#Model hyperparameters
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0
# ------------

# Setting a random seed for reproducibility
torch.manual_seed(2024)

<torch._C.Generator at 0x117a80410>

### 2. Loading the Dataset

In [2]:
from urllib import request

url = "https://www.gutenberg.org/cache/epub/2600/pg2600.txt"

response = request.urlopen(url)
raw = response.read()
text = raw.decode("utf-8")

### 3. Preprocessing the Data

In [3]:
# Make our Vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
# Encode the data
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Create train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # 9:1 train test split
train_data = data[:n]
val_data = data[n:]

# Batching!
def get_batch(split):
    # create a batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

### 4. Transformer Ingredients

1. Setting up the loss function

In [4]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

2. Setting up the single headed attention class

In [5]:
class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

3. Setting up a class for multiple attention heads

In [6]:
class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

4. Setting up the feedforward layers

In [7]:
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

### 5. Assembling the Generative Transformer Model

1. Setting up the transformer block

In [8]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

2. Create a simple bigram model

In [9]:
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

### 6. Train the Transformer on Text Data

Calculate the total number of parameters in the model

In [10]:
model = BigramLanguageModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

0.215921 M parameters


Setup the PyTorch optimizer

In [11]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)



It's finally time to train the transformer on our dataset! We're going to use Python's `timeit` library to evaluate the time it takes to pretrain our model:

In [12]:
import timeit

start = timeit.default_timer()
print('Starting time:', start)

for iter in range(max_iters):
    
    # evaluate loss on training and test at various points
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # batch of data
    xb, yb = get_batch('train')

    # evaluating loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

stop = timeit.default_timer()
print('Stopping time:', stop)
print('Time taken: ', round((stop - start)/60.0, 2),'minutes')

Starting time: 13.435623041
step 0: train loss 4.9321, val loss 4.9300
step 100: train loss 2.6375, val loss 2.6250
step 200: train loss 2.4884, val loss 2.4790
step 300: train loss 2.4422, val loss 2.4434
step 400: train loss 2.3417, val loss 2.3535
step 500: train loss 2.2662, val loss 2.2832
step 600: train loss 2.2147, val loss 2.2301
step 700: train loss 2.1631, val loss 2.1869
step 800: train loss 2.1232, val loss 2.1409
step 900: train loss 2.0765, val loss 2.0916
step 1000: train loss 2.0199, val loss 2.0589
step 1100: train loss 2.0109, val loss 2.0402
step 1200: train loss 1.9746, val loss 2.0203
step 1300: train loss 1.9369, val loss 1.9634
step 1400: train loss 1.9100, val loss 1.9552
step 1500: train loss 1.9054, val loss 1.9380
step 1600: train loss 1.8480, val loss 1.9136
step 1700: train loss 1.8453, val loss 1.9062
step 1800: train loss 1.8397, val loss 1.8911
step 1900: train loss 1.8253, val loss 1.8661
step 2000: train loss 1.8108, val loss 1.8559
step 2100: train l

### 7. Generative Pretrained Tolstoy

How good is our model?

In [13]:
# Ask the GPT model to generate text!
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


“Austory have that explainry knew and
the lafting to ‘y the kinds of your?”

“Helvead in its porth art. Somet sert or this in the fact. “Sher of of all the soft, and been list liviked heorse.

Emperor’s some of of the roun bridly that that them, as of mereive,
“insel at that,” he began to but no evalle Moscow on the old goin it?”

“I was what’s nothing who did not ar at as
could it sometheoriers, was away soldiet Prince, ‘What days at whom think it shake fring that head ling what roe whástatin, why there
sot—cornies.sER I!
   C3


Oh, to all went his not was long for the Emperora shoutinal get to her not Cave then the knich
in wenther hossinution, everything majou.”

“This’ with, he asking her hard been all yehearding the Sung art,
to can, exther pervans.”

Bertin that shouting-hoad (he sometthing his changry her you, the hume,
but to ask wort, “and Irmóus for the dest.

“What’s go his Might, only aftensionside to the gave Kutúzov.

“Chas long extreminot when and the held Pzince Vasík