# gpt

> minimum GPT model from scratch.

import

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader


In [2]:
import pytorch_lightning as L 
import torch.nn as nn
import torch.nn.functional as F


# data

https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [3]:
DATA_FILE = 'data/input.txt'

# read the data file
with open(DATA_FILE, 'r') as f:
    text = f.read()

In [4]:
# print the length of the text
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115393 characters


In [5]:
# look at the first 250 characters in text
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [6]:
# The unique characters in the file, as a sorted list.
vocab = sorted(set(text))
# print out the vocabulary 
print('Vocabulary: {}'.format(vocab))

# the number of unique characters
vocab_size = len(vocab)
# print the number of unique characters
print('Number of unique characters: {}'.format(vocab_size))

Vocabulary: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Number of unique characters: 65


In [7]:
# a mapping from characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}

# a mapping from indices to characters
idx2char = {i:u for i, u in enumerate(vocab)}

In [8]:
# an encoder function that converts text to a torch tensor
def encoder(text):
    return torch.tensor([char2idx[c] for c in text], dtype=torch.long)

In [9]:
encoder('Hello Olivier!')

tensor([20, 43, 50, 50, 53,  1, 27, 50, 47, 60, 47, 43, 56,  2])

In [10]:
# a decoder function that converts a torch tensor to text
def decoder(tensor):
    return ''.join([idx2char[i.item()] for i in tensor])

In [11]:
decoder(encoder('Hello Olivier!'))

'Hello Olivier!'

a bigger vocabulary would normally imply a shorter encoding length

SentencePiece BPE tiktoken

In [12]:
# encode the whole text
encoded_text = encoder(text)

# print the encoded text (first 500 characters)
print(encoded_text[:500])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [13]:
print(decoder(encoded_text[:500]))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [14]:
# train/validation split (90/10)
train_size = int(0.9 * len(encoded_text))
val_size = len(encoded_text) - train_size
train_text, val_text = torch.utils.data.random_split(encoded_text, [train_size, val_size])

In [15]:
# set the block size to 8 
block_size = 8

and visualise how the self-supervised model do its predictions

In [16]:
x = train_text[0:block_size]
y = train_text[1:block_size+1]
x, y

(tensor([57, 56, 47, 40, 57, 43, 58, 52]),
 tensor([56, 47, 40, 57, 43, 58, 52, 53]))

next token prediction

the model looks at all the previous tokens, up to the length of the block size

In [17]:
# show the context and target for all the training examples
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('context: {} -> target: {}'.format(context, target))

context: tensor([57]) -> target: 56
context: tensor([57, 56]) -> target: 47
context: tensor([57, 56, 47]) -> target: 40
context: tensor([57, 56, 47, 40]) -> target: 57
context: tensor([57, 56, 47, 40, 57]) -> target: 43
context: tensor([57, 56, 47, 40, 57, 43]) -> target: 58
context: tensor([57, 56, 47, 40, 57, 43, 58]) -> target: 52
context: tensor([57, 56, 47, 40, 57, 43, 58, 52]) -> target: 53


In [18]:
batch_size = 4


the dataset class

In [19]:
class TextDataset(Dataset):
    def __init__(self, text, block_size=8):
        self.text = text
        self.block_size = block_size
        
    def __len__(self):
        return len(self.text) // self.block_size
    
    def __getitem__(self, idx):
        # get a block of size block_size starting from index idx * block_size
        start_idx = idx * self.block_size
        end_idx = start_idx + self.block_size
        data = self.text[start_idx:end_idx]
        target = self.text[start_idx+1:end_idx+1]
        return data, target


dataloader

In [20]:
# create separate train and validation datasets
train_dataset = TextDataset(train_text, block_size=block_size)
val_dataset = TextDataset(val_text, block_size=block_size)

# create separate train and validation dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [21]:
xb, yb = next(iter(train_dataloader))
xb, yb

(tensor([[52, 47, 56, 13,  1,  1, 57, 51],
         [27, 53, 52, 51,  1, 47,  5,  6],
         [ 1,  6, 46, 58, 46, 52, 51, 52],
         [15, 56, 41, 56, 63, 46, 39, 57]]),
 tensor([[47, 56, 13,  1,  1, 57, 51, 57],
         [53, 52, 51,  1, 47,  5,  6, 59],
         [ 6, 46, 58, 46, 52, 51, 52, 53],
         [56, 41, 56, 63, 46, 39, 57, 58]]))

In [22]:
# loop through the batches and print the context and target for each batch
for b in range(batch_size):
    print('Batch {}'.format(b+1))
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print('context: {} -> target: {}'.format(context, target))

Batch 1
context: tensor([52]) -> target: 47
context: tensor([52, 47]) -> target: 56
context: tensor([52, 47, 56]) -> target: 13
context: tensor([52, 47, 56, 13]) -> target: 1
context: tensor([52, 47, 56, 13,  1]) -> target: 1
context: tensor([52, 47, 56, 13,  1,  1]) -> target: 57
context: tensor([52, 47, 56, 13,  1,  1, 57]) -> target: 51
context: tensor([52, 47, 56, 13,  1,  1, 57, 51]) -> target: 57
Batch 2
context: tensor([27]) -> target: 53
context: tensor([27, 53]) -> target: 52
context: tensor([27, 53, 52]) -> target: 51
context: tensor([27, 53, 52, 51]) -> target: 1
context: tensor([27, 53, 52, 51,  1]) -> target: 47
context: tensor([27, 53, 52, 51,  1, 47]) -> target: 5
context: tensor([27, 53, 52, 51,  1, 47,  5]) -> target: 6
context: tensor([27, 53, 52, 51,  1, 47,  5,  6]) -> target: 59
Batch 3
context: tensor([1]) -> target: 6
context: tensor([1, 6]) -> target: 46
context: tensor([ 1,  6, 46]) -> target: 58
context: tensor([ 1,  6, 46, 58]) -> target: 46
context: tensor([

# Bigram model

In [23]:
# define a Bigram language model in PyTorch-Lightning
class BigramModel(L.LightningModule):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.save_hyperparameters()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x):
        x = self.embedding(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat.view(-1, self.hparams.vocab_size), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat.view(-1, self.hparams.vocab_size), y.view(-1))
        self.log('val_loss', loss)
        return loss
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)
    
    def generate(self, starting_text, length):
        with torch.no_grad():
            # convert starting_text to tensor
            input_seq = encoder(starting_text).unsqueeze(0)
            # move input_seq to device
            input_seq = input_seq.to(self.device)
            # generate sequence of length 'length'
            for i in range(length):
                # get output probabilities from model
                output_probs = self(input_seq)[:,-1,:]
                # sample the next token from the output probabilities
                next_token = torch.multinomial(F.softmax(output_probs, dim=-1), num_samples=1)
                # append the next token to the input sequence
                input_seq = torch.cat([input_seq, next_token], dim=1)
        # convert the output sequence to text
        output_text = ''.join([idx2char[idx] for idx in input_seq.squeeze().tolist()])
        return output_text

initialise the model

In [24]:
bigram_model = BigramModel(vocab_size, embedding_dim=32, hidden_dim=64)
bigram_model

BigramModel(
  (embedding): Embedding(65, 32)
  (fc1): Linear(in_features=32, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=65, bias=True)
)

forward pass

In [25]:
bigram_model(xb)

tensor([[[ 0.2678, -0.1369, -0.4027,  ...,  0.1395, -0.1126,  0.1175],
         [ 0.4385, -0.5030,  0.1241,  ...,  0.4192, -0.2588,  0.2549],
         [ 0.0025, -0.2363, -0.2232,  ...,  0.1668,  0.0379,  0.1697],
         ...,
         [ 0.0095, -0.2617, -0.1210,  ..., -0.1893,  0.0417,  0.1678],
         [-0.0532, -0.2724, -0.2155,  ...,  0.1709, -0.3195,  0.1353],
         [ 0.2438, -0.5204, -0.4409,  ...,  0.0225,  0.1857, -0.2476]],

        [[-0.0637, -0.1605, -0.2607,  ..., -0.0072, -0.1926,  0.1827],
         [-0.0352, -0.1897, -0.2359,  ...,  0.5171, -0.0788,  0.3116],
         [ 0.2678, -0.1369, -0.4027,  ...,  0.1395, -0.1126,  0.1175],
         ...,
         [ 0.4385, -0.5030,  0.1241,  ...,  0.4192, -0.2588,  0.2549],
         [ 0.0431, -0.1951, -0.1140,  ...,  0.1812, -0.2083,  0.1417],
         [ 0.1701, -0.4269,  0.0718,  ...,  0.2989,  0.0870,  0.3772]],

        [[ 0.0095, -0.2617, -0.1210,  ..., -0.1893,  0.0417,  0.1678],
         [ 0.1701, -0.4269,  0.0718,  ...,  0

training step

In [26]:
bigram_model.training_step((xb, yb), 0)

/home/ma/miniconda3/envs/myl/lib/python3.10/site-packages/pytorch_lightning/core/module.py:420: You are trying to `self.log()` but the `self.trainer` reference is not registered on the model yet. This is most likely because the model hasn't been passed to the `Trainer`


tensor(4.2066, grad_fn=<NllLossBackward0>)

generate text

In [27]:
bigram_model.generate('Hello', 100)

"Hello$Y.t3YVnmOxkYsBQqGl,ulvlAZywHMkHfTYQypbWpADSYM'dvU.,:aNlpOpezpWqwm&!!Usw&hf?cGsBBI\nyaG:VUXdmCFuvbdmO"

use a larger batch size for actual training

In [28]:
batch_size = 32
# create separate train and validation dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

trainer

In [29]:
# define a trainer object
trainer = L.Trainer(max_epochs=1)

Trainer will use only 1 of 6 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=6)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
/home/ma/miniconda3/envs/myl/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/ma/miniconda3/envs/myl/lib/python3.10/site-pac ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [30]:
# train the model
trainer.fit(bigram_model, train_dataloader, val_dataloader)

You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 2.1 K 
1 | fc1       | Linear    | 2.1 K 
2 | fc2       | Linear    | 4.2 K 
----------------------------------------
8.4 K     Trainable params
0         Non-trainable params
8.4 K     Total params
0.034     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/ma/miniconda3/envs/myl/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=71` in the `DataLoader` to improve performance.
/home/ma/miniconda3/envs/myl/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=71` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


metrics

In [31]:
# check the training loss
trainer.logged_metrics


{'train_loss': tensor(3.2826), 'val_loss': tensor(3.3157)}

generate

In [32]:
input_text = "hello"

# generate a sequence of length 100
print(bigram_model.generate(input_text, 100))

helloeoe eS

i uhm: S awh?mTt  OuOfene  n l
,dt dheoggn neter 
  etoounneweiuy K r  oH th ei amcnnhrL o t


# Attention

In [33]:
# use masked matrix multiplication to compute cumulative averages
T, E = 4, 5  # sequence length, embedding dimension
x = torch.randn(T, E)  # input tensor
x

tensor([[-1.1302,  0.6888, -1.1523, -1.2340,  1.1439],
        [ 0.2148,  0.0503, -2.5361,  0.5335, -1.0008],
        [-0.0530,  0.7277,  1.4551,  0.4140,  0.5039],
        [ 0.4920, -0.9831, -1.5567,  0.4762, -0.5163]])

In [34]:
mask = torch.tril(torch.ones(T, T)) # mask matrix
mask

tensor([[1., 0., 0., 0.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 1.]])

In [35]:
import torch.nn.functional as F

# initialize the weights with zeros
weights = torch.zeros(T, T)

# replace the upper triangular part of the weights with -inf
weights = weights.masked_fill(mask==0, float('-inf'))

# apply softmax to the weights
weights = F.softmax(weights, dim=-1)

weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500]])

In [36]:
weights @ x

tensor([[-1.1302,  0.6888, -1.1523, -1.2340,  1.1439],
        [-0.4577,  0.3696, -1.8442, -0.3503,  0.0715],
        [-0.3228,  0.4890, -0.7444, -0.0955,  0.2157],
        [-0.1191,  0.1209, -0.9475,  0.0474,  0.0327]])

move on to the attention model by using an affinity based weighing scheme

the affinity will be calculated by the dot product of the query and key

In [37]:
# define queries, keys, and values using linear layers
H = 8
key = nn.Linear(E, H, bias=False)
query = nn.Linear(E, H, bias=False)
value = nn.Linear(E, H, bias=False)

In [38]:
query(x)

tensor([[-0.0243, -0.7339,  0.1138,  1.1829, -0.2428, -0.3732,  0.4777,  1.2436],
        [ 0.5204,  0.0916,  0.1016, -0.2374,  0.6966,  0.3451,  1.1526,  0.1647],
        [-0.5639, -0.0049,  0.0393, -0.2037, -0.5386, -0.3055, -0.6389, -0.3777],
        [ 0.5188,  0.2483, -0.4000, -0.3071,  0.6248,  0.7772,  0.4243,  0.0901]],
       grad_fn=<MmBackward0>)

In [39]:
key

Linear(in_features=5, out_features=8, bias=False)

In [40]:
key(x)

tensor([[-0.8429, -0.2807, -0.5572,  0.4662, -0.2232, -1.7832, -0.7288, -0.2520],
        [-0.5455, -1.0265,  0.9197, -1.1817, -1.0555, -0.2829, -0.4550,  1.0635],
        [ 0.8573,  0.9310, -0.7485,  0.2056,  0.8744,  0.2372,  0.5925, -0.4311],
        [-0.4999, -0.8215,  0.9373, -0.6269, -0.9207,  0.2695, -0.5260,  0.6619]],
       grad_fn=<MmBackward0>)

In [41]:
weights = query(x) @ key(x).transpose(-1, -2)
weights = weights.masked_fill(mask==0, float('-inf'))
weights = F.softmax(weights, dim=-1)
weights

tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.2501, 0.7499, 0.0000, 0.0000],
        [0.5914, 0.3762, 0.0324, 0.0000],
        [0.0145, 0.0312, 0.9104, 0.0440]], grad_fn=<SoftmaxBackward0>)

In [42]:
weights @ x

tensor([[-1.1302,  0.6888, -1.1523, -1.2340,  1.1439],
        [-0.1216,  0.2101, -2.1900,  0.0914, -0.4644],
        [-0.5893,  0.4499, -1.5884, -0.5156,  0.3162],
        [-0.0364,  0.6308,  1.1606,  0.3965,  0.4215]], grad_fn=<MmBackward0>)