# gpt

> A minimal GPT model built from scratch.

import

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader


In [None]:
import pytorch_lightning as L 
import torch.nn as nn
import torch.nn.functional as F


# data

Dataset: the Tiny Shakespeare corpus, available at https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
DATA_FILE = 'data/input.txt'

# Read the whole corpus into memory as a single string.
# The encoding is passed explicitly so the decoded text (and therefore the
# vocabulary built from it) does not depend on the platform's default locale.
with open(DATA_FILE, 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# report how many characters the corpus contains
print(f'Length of text: {len(text)} characters')

Length of text: 1115393 characters


In [None]:
# look at the first 250 characters in text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [None]:
# Every distinct character of the corpus, in sorted order.
vocab = sorted(set(text))
print(f'Vocabulary: {vocab}')

# The vocabulary size is the number of distinct characters.
vocab_size = len(vocab)
print(f'Number of unique characters: {vocab_size}')

Vocabulary: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Number of unique characters: 65


In [None]:
# index -> character lookup table, following the vocabulary order
idx2char = dict(enumerate(vocab))

# character -> index lookup table (the inverse of idx2char)
char2idx = {ch: idx for idx, ch in idx2char.items()}

In [None]:
# an encoder function that converts text to a torch tensor
def encoder(text):
    """Map every character of ``text`` to its index in ``char2idx``.

    Returns a 1-D ``torch.long`` tensor with one entry per character.
    Raises ``KeyError`` for a character that is not in ``char2idx``.
    """
    token_ids = []
    for ch in text:
        token_ids.append(char2idx[ch])
    return torch.tensor(token_ids, dtype=torch.long)

In [None]:
encoder('Hello Olivier!')

tensor([20, 43, 50, 50, 53,  1, 27, 50, 47, 60, 47, 43, 56,  2])

In [None]:
# a decoder function that converts a torch tensor to text
def decoder(tensor):
    """Turn a sequence of token-id tensors back into a string via ``idx2char``.

    Each element obtained by iterating ``tensor`` is converted with
    ``.item()`` and looked up in ``idx2char``; the characters are joined in
    order.
    """
    chars = []
    for token in tensor:
        chars.append(idx2char[token.item()])
    return ''.join(chars)

In [None]:
decoder(encoder('Hello Olivier!'))

'Hello Olivier!'

A larger vocabulary normally yields a shorter encoded sequence for the same text.

Examples of sub-word tokenizers that make this trade-off: SentencePiece, BPE (byte-pair encoding), tiktoken.

In [None]:
# encode the whole text into a 1-D tensor of token ids (one id per character)
encoded_text = encoder(text)

# print the first 500 token ids of the encoded text
print(encoded_text[:500])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [None]:
print(decoder(encoded_text[:500]))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [None]:
# train/validation split (90/10)
# The split must be contiguous: the first 90% of the tokens are used for
# training and the last 10% for validation. A random split over individual
# positions would shuffle the characters, so that "the next token" in the
# training data would no longer be the character that follows in the text.
train_size = int(0.9 * len(encoded_text))
val_size = len(encoded_text) - train_size
train_text, val_text = encoded_text[:train_size], encoded_text[train_size:]

In [None]:
# set the block size to 8 
block_size = 8

and visualise how the self-supervised model makes its predictions

In [None]:
# x holds the first block_size tokens of train_text; y is the same window
# shifted one position to the right, so y[t] is the token that comes right
# after x[t] in train_text
x = train_text[0:block_size]
y = train_text[1:block_size+1]
x, y

(tensor([24, 42, 47, 11, 46, 52, 50, 42]),
 tensor([42, 47, 11, 46, 52, 50, 42, 53]))

next token prediction

the model looks at all the previous tokens, up to the length of the block size

In [None]:
# for every position in the block, print the growing context together with
# the token that has to be predicted from it
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f'context: {context} -> target: {target}')

context: tensor([24]) -> target: 42
context: tensor([24, 42]) -> target: 47
context: tensor([24, 42, 47]) -> target: 11
context: tensor([24, 42, 47, 11]) -> target: 46
context: tensor([24, 42, 47, 11, 46]) -> target: 52
context: tensor([24, 42, 47, 11, 46, 52]) -> target: 50
context: tensor([24, 42, 47, 11, 46, 52, 50]) -> target: 42
context: tensor([24, 42, 47, 11, 46, 52, 50, 42]) -> target: 53


In [None]:
batch_size = 4


the dataset class

In [None]:
class TextDataset(Dataset):
    """Non-overlapping, fixed-length windows over a token sequence.

    Item ``idx`` is the pair ``(data, target)`` where ``data`` is
    ``text[idx * block_size:(idx + 1) * block_size]`` and ``target`` is the
    same window shifted one position to the right, so ``target[t]`` is the
    token that follows ``data[t]``.

    Args:
        text: sequence of tokens supporting ``len()`` and slicing.
        block_size: number of tokens in every window (must be >= 1).

    Raises:
        ValueError: if ``block_size`` is smaller than 1.
    """

    def __init__(self, text, block_size=8):
        if block_size < 1:
            raise ValueError(f'block_size must be >= 1, got {block_size}')
        self.text = text
        self.block_size = block_size

    def __len__(self):
        # A window is only usable if one more token follows it (the last
        # target), hence the "- 1". Without it, a text whose length is an
        # exact multiple of block_size yields a final target that is one
        # token short and cannot be stacked into a batch.
        return max(0, (len(self.text) - 1) // self.block_size)

    def __getitem__(self, idx):
        # An explicit bounds check stops plain iteration over the dataset
        # and prevents silently returning empty or truncated windows.
        if not 0 <= idx < len(self):
            raise IndexError(f'index {idx} out of range for {len(self)} blocks')
        # get a block of size block_size starting from index idx * block_size
        start_idx = idx * self.block_size
        end_idx = start_idx + self.block_size
        data = self.text[start_idx:end_idx]
        target = self.text[start_idx + 1:end_idx + 1]
        return data, target


dataloader

In [None]:
# training split: block dataset plus a loader that reshuffles the blocks
train_dataset = TextDataset(train_text, block_size=block_size)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# validation split: block dataset plus a loader that keeps the block order
val_dataset = TextDataset(val_text, block_size=block_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# draw a single batch (inputs xb, targets yb) from the shuffled training loader
xb, yb = next(iter(train_dataloader))
xb, yb

(tensor([[49,  7, 21, 43,  1, 42,  1, 63],
         [ 1, 58,  1, 41, 19, 59, 52,  8],
         [56, 53,  1, 44, 16, 53,  1, 52],
         [ 1, 46, 53, 53, 57, 10, 43, 58]]),
 tensor([[ 7, 21, 43,  1, 42,  1, 63, 30],
         [58,  1, 41, 19, 59, 52,  8, 58],
         [53,  1, 44, 16, 53,  1, 52, 28],
         [46, 53, 53, 57, 10, 43, 58, 56]]))

In [None]:
# walk over every sequence of the batch and, for each position, print the
# context seen so far together with the token to predict
for b in range(batch_size):
    print(f'Batch {b + 1}')
    for t in range(block_size):
        context, target = xb[b, :t + 1], yb[b, t]
        print(f'context: {context} -> target: {target}')

Batch 1
context: tensor([49]) -> target: 7
context: tensor([49,  7]) -> target: 21
context: tensor([49,  7, 21]) -> target: 43
context: tensor([49,  7, 21, 43]) -> target: 1
context: tensor([49,  7, 21, 43,  1]) -> target: 42
context: tensor([49,  7, 21, 43,  1, 42]) -> target: 1
context: tensor([49,  7, 21, 43,  1, 42,  1]) -> target: 63
context: tensor([49,  7, 21, 43,  1, 42,  1, 63]) -> target: 30
Batch 2
context: tensor([1]) -> target: 58
context: tensor([ 1, 58]) -> target: 1
context: tensor([ 1, 58,  1]) -> target: 41
context: tensor([ 1, 58,  1, 41]) -> target: 19
context: tensor([ 1, 58,  1, 41, 19]) -> target: 59
context: tensor([ 1, 58,  1, 41, 19, 59]) -> target: 52
context: tensor([ 1, 58,  1, 41, 19, 59, 52]) -> target: 8
context: tensor([ 1, 58,  1, 41, 19, 59, 52,  8]) -> target: 58
Batch 3
context: tensor([56]) -> target: 53
context: tensor([56, 53]) -> target: 1
context: tensor([56, 53,  1]) -> target: 44
context: tensor([56, 53,  1, 44]) -> target: 16
context: tensor

# Bigram model

In [None]:
# define a Bigram language model in PyTorch-Lightning
class BigramModel(L.LightningModule):
    """Character-level model that predicts the next token from the current one.

    Each token id is embedded, passed through one hidden ReLU layer and
    projected to one logit per vocabulary entry. Every position is processed
    independently of the others, so the prediction at a position depends only
    on the token at that position.

    Args:
        vocab_size: number of distinct tokens (also the size of the output layer).
        embedding_dim: width of the token embedding.
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # makes the constructor arguments available as self.hparams.<name>
        self.save_hyperparameters()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return unnormalised logits over the vocabulary for every token id in ``x``."""
        x = self.embedding(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def _loss(self, batch):
        """Cross-entropy between the predicted logits and the targets of ``batch``."""
        x, y = batch
        logits = self(x)
        # flatten all leading dimensions so every position counts as one sample
        return F.cross_entropy(logits.view(-1, self.hparams.vocab_size), y.view(-1))

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the loss of one training batch."""
        loss = self._loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as ``val_loss``) and return the loss of one validation batch."""
        loss = self._loss(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def generate(self, starting_text, length):
        """Sample ``length`` new characters after ``starting_text``.

        Args:
            starting_text: prompt string; it is encoded with the module-level
                ``encoder`` and must be non-empty when ``length`` > 0, because
                the next token is sampled from the logits of the last position.
            length: number of characters to append.

        Returns:
            The prompt followed by the sampled characters, as one string.
        """
        with torch.no_grad():
            # encode the prompt, add a batch axis and move it to the model's device
            input_seq = encoder(starting_text).unsqueeze(0).to(self.device)
            for _ in range(length):
                # logits for the token following the last position
                logits = self(input_seq)[:, -1, :]
                # sample the next token from the softmax distribution
                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                # append the sampled token to the running sequence
                input_seq = torch.cat([input_seq, next_token], dim=1)
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a one-token sequence to a scalar and break the join below
        token_ids = input_seq.squeeze(0).tolist()
        return ''.join(idx2char[idx] for idx in token_ids)

initialise the model

In [None]:
bigram_model = BigramModel(vocab_size, embedding_dim=32, hidden_dim=64)
bigram_model

BigramModel(
  (embedding): Embedding(65, 32)
  (fc1): Linear(in_features=32, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=65, bias=True)
)

forward pass

In [None]:
bigram_model(xb)

tensor([[[-0.0898,  0.1966, -0.2269,  ...,  0.0039, -0.0763, -0.1268],
         [ 0.1250, -0.0110,  0.0631,  ...,  0.0591,  0.2566,  0.0475],
         [-0.0113,  0.0902,  0.1115,  ..., -0.0293,  0.0861,  0.1530],
         ...,
         [ 0.0069,  0.1929, -0.0406,  ...,  0.2338, -0.1185, -0.1099],
         [-0.2324, -0.0831, -0.3524,  ..., -0.2395,  0.1335,  0.0672],
         [ 0.0411,  0.1028, -0.1139,  ..., -0.1857, -0.0310,  0.0017]],

        [[-0.2324, -0.0831, -0.3524,  ..., -0.2395,  0.1335,  0.0672],
         [-0.3195,  0.5125, -0.0533,  ...,  0.2048,  0.1758,  0.0952],
         [-0.2324, -0.0831, -0.3524,  ..., -0.2395,  0.1335,  0.0672],
         ...,
         [-0.1921,  0.1451, -0.0171,  ..., -0.1576, -0.1355, -0.0993],
         [-0.2719,  0.0547, -0.0394,  ...,  0.0629,  0.0370,  0.3403],
         [-0.2136,  0.0358,  0.0807,  ..., -0.3666,  0.1520, -0.0436]],

        [[-0.4426, -0.0566,  0.0663,  ...,  0.0090,  0.2780,  0.1964],
         [ 0.0708,  0.2102,  0.3248,  ..., -0

training step

In [None]:
bigram_model.training_step((xb, yb), 0)

/home/ma/miniconda3/envs/myl/lib/python3.10/site-packages/pytorch_lightning/core/module.py:420: You are trying to `self.log()` but the `self.trainer` reference is not registered on the model yet. This is most likely because the model hasn't been passed to the `Trainer`


tensor(4.1391, grad_fn=<NllLossBackward0>)

generate text

In [None]:
bigram_model.generate('Hello', 100)

"HelloFMPdOlgqbqMfgkh,v':okQqg&$XWIK?TEU.kM Zsoh- tTwkWBg\ndk;XSoKIkSmiF'DjmE'h,OazprsW!C:mLtgWPnDtnVixaZm\n"

use a larger batch size for actual training

In [None]:
# larger batch for the real training run
batch_size = 32
# rebuild both dataloaders on the existing datasets so they pick up the new
# batch size (training batches are shuffled, validation batches are not)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

trainer

In [None]:
# define a trainer object that stops after one epoch
trainer = L.Trainer(max_epochs=1)

Trainer will use only 1 of 6 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=6)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
/home/ma/miniconda3/envs/myl/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/ma/miniconda3/envs/myl/lib/python3.10/site-pac ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# train the model on train_dataloader, validating on val_dataloader
trainer.fit(bigram_model, train_dataloader, val_dataloader)

You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 2.1 K 
1 | fc1       | Linear    | 2.1 K 
2 | fc2       | Linear    | 4.2 K 
----------------------------------------
8.4 K     Trainable params
0         Non-trainable params
8.4 K     Total params
0.034     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/ma/miniconda3/envs/myl/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=71` in the `DataLoader` to improve performance.
/home/ma/miniconda3/envs/myl/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=71` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


metrics

In [None]:
# inspect the metrics logged during fitting (train_loss and val_loss)
trainer.logged_metrics


{'train_loss': tensor(3.2474), 'val_loss': tensor(3.3175)}

generate

In [None]:
# prompt that seeds the generation
input_text = "hello"

# sample 100 additional characters after the prompt and print the result
print(bigram_model.generate(input_text, 100))

helloin eenslorE lnln ezln,isya  iaminN faw t
n sou  nuaDb   crei Iwido i 
e
yk  ro w  'ItIirhaiiihvet  e


In [None]:
xb, yb

(tensor([[49,  7, 21, 43,  1, 42,  1, 63],
         [ 1, 58,  1, 41, 19, 59, 52,  8],
         [56, 53,  1, 44, 16, 53,  1, 52],
         [ 1, 46, 53, 53, 57, 10, 43, 58]]),
 tensor([[ 7, 21, 43,  1, 42,  1, 63, 30],
         [58,  1, 41, 19, 59, 52,  8, 58],
         [53,  1, 44, 16, 53,  1, 52, 28],
         [46, 53, 53, 57, 10, 43, 58, 56]]))

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()