In [1]:
#| default_exp speedup

In [2]:
#| export
import random, math, torch, numpy as np, matplotlib.pyplot as plt
from tinyai.model import *
from tinyai.learner import *
from tinyai.hooks import *
from tinyai.init import *
import fastcore.all as fc
from functools import partial
import time

In [3]:
#| export
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
import tiktoken
import os

# tiktoken's "gpt2" encoding; get_tokens() reads this module-level name.
enc = tiktoken.get_encoding("gpt2")

def get_tokens(input_file):
    """Read a text file and return its contents encoded with the module-level `enc`.

    Args:
        input_file: path of the text file to tokenize.

    Returns:
        Whatever `enc.encode` returns for the full file contents.
    """
    # Pin the encoding so the decoded text (and therefore the tokens) does not
    # depend on the platform's locale default.
    with open(input_file, encoding="utf-8") as f:
        text = f.read()
    return enc.encode(text)

# The corpus path is resolved relative to the kernel's working directory.
cwd = os.getcwd()
input_file = f"{cwd}/fast-nanogpt/input.txt"
# Tokenize the file, then keep only the first 20000 tokens.
tokens = get_tokens(input_file)
tokens = tokens[:20000]
# First 80% of the tokens for training, the remaining 20% for validation.
train = tokens[:int(len(tokens)*0.8)]
valid = tokens[int(len(tokens)*0.8):]

In [5]:
# Training dataset built from the training tokens with T=512.
tds = DataSet(torch.tensor(train), T=512)
# The validation dataset is currently disabled; the commented lines show how to enable it.
# vds = DataSet(torch.tensor(valid))
# Only a training loader is built: the second entry (validation) is None.
dls = DataLoaders.from_dd([tds, None], batch_size=4)
# dls = DataLoaders.from_dd([tds, vds], batch_size=4)
# Pull one batch and display its shapes plus the dataset / loader lengths as a sanity check.
x, y = next(iter(dls.train))
x.shape, y.shape, len(tds), len(dls.train)

(torch.Size([4, 512]), torch.Size([4, 512]), 31, 8)

In [6]:
# Presumably collects activation statistics for modules that are instances of Block — TODO confirm.
# NOTE(review): `stats` is not included in `cbs`, so fit() only uses it if it is passed via xtra_cbs.
stats = ActivationStats(fc.risinstance(Block))
# Callbacks shared by every fit() call in this notebook.
cbs = [TrainCB(), InitWeightsCB(), DeviceCB(), MetricsCB(), ProgressCB()]
def fit(model, epochs=1, xtra_cbs=None, lr=3e-4):
    """Train `model` on the notebook-level `dls` with AdamW and return the Learner.

    Args:
        model: the module to train.
        epochs: number of epochs passed to `Learner.fit`.
        xtra_cbs: optional extra callback(s) appended to the shared `cbs` list.
        lr: learning rate handed to the Learner (defaults to the previous
            hard-coded value, 3e-4).

    Returns:
        The Learner after `fit` has run (training only, `valid=False`).
    """
    # fc.L(None) is an empty collection, so `xtra_cbs=None` adds nothing.
    learner = Learner(model, dls=dls, opt_func=optim.AdamW, cbs=cbs + fc.L(xtra_cbs), lr=lr)
    learner.fit(epochs, valid=False)
    return learner

In [7]:
??get_model

[0;31mSignature:[0m [0mget_model[0m[0;34m([0m[0mproj[0m[0;34m=[0m[0;34m<[0m[0;32mclass[0m [0;34m'tinyai.init.ResidualLinear'[0m[0;34m>[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mget_model[0m[0;34m([0m[0mproj[0m[0;34m=[0m[0mResidualLinear[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0mGPT[0m[0;34m([0m[0mGPTConfig[0m[0;34m([0m[0;34m)[0m[0;34m,[0m [0mproj[0m[0;34m=[0m[0mproj[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/playground/notebooks/fast-nanogpt/tinyai/init.py
[0;31mType:[0m      function

In [8]:
#| export
import time

class TimeCallback(Callback):
    """Print loss, elapsed time and token throughput for every batch.

    The timer starts in `before_batch` and is read in `after_batch`, so the
    reported interval covers whatever the learner does between those two hooks.
    """

    def before_batch(self, learn):
        # perf_counter is monotonic and high-resolution, which makes it the
        # right clock for measuring intervals (time.time() can jump).
        self.t0 = time.perf_counter()

    def _log(self, d):
        # No-op kept from the original; its purpose is not visible in this file.
        pass

    def after_batch(self, learn):
        # CUDA kernels run asynchronously: wait for queued GPU work so the
        # measured interval includes it rather than just the launch overhead.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - self.t0
        dt = elapsed * 1000  # milliseconds
        x, _ = learn.batch
        # x is indexed as (batch, sequence), so the product is the token count.
        n_tokens = x.shape[0] * x.shape[1]
        # Guard against a zero-length interval on very coarse clocks.
        tokens_per_sec = n_tokens / elapsed if elapsed > 0 else float("inf")

        print(
            f"step {learn.iter}, loss: {learn.loss.item():.2f}, time: {dt:.2f}ms, tok/sec: {tokens_per_sec:.0f}"
        )

## Baseline

In [9]:
# Baseline: fixed seed, model from get_model() with default arguments,
# one fit() call with TimeCallback printing per-step timing.
set_seed(1337)
model = get_model()
fit(model, xtra_cbs=[TimeCallback()])

loss,epoch,train
9.016,0,train


step 0, loss: 10.95, time: 1536.75msi, tok/sec: 1333
step 1, loss: 9.47, time: 494.82msi, tok/sec: 4139
step 2, loss: 9.20, time: 513.43msi, tok/sec: 3989
step 3, loss: 8.89, time: 501.96msi, tok/sec: 4080
step 4, loss: 8.75, time: 508.30msi, tok/sec: 4029
step 5, loss: 8.46, time: 501.91msi, tok/sec: 4080
step 6, loss: 8.13, time: 506.05msi, tok/sec: 4047
step 7, loss: 8.04, time: 386.95msi, tok/sec: 3970


<tinyai.learner.Learner at 0x7f902c332650>

## TODO: what are dtypes
1. a100 architecture ref
2. auto mixed precision ref

In [10]:
# Allow float32 matmuls to use TensorFloat32 (TF32) internally.
# TF32 needs hardware support, which starts with NVIDIA Ampere GPUs;
# on older cards this setting is not expected to change speed.
torch.set_float32_matmul_precision('high')

In [11]:
# clean_mem is not defined in this notebook (it comes from the star imports);
# presumably it releases memory held by the previous run — TODO confirm.
clean_mem()

## TODO: compile
1. gelu example
2. why compile? explain hbm to sm round trip
3. troubleshooting: no speedup on older cards

In [12]:
# Fresh model, wrapped with torch.compile before training.
# NOTE(review): unlike the baseline cell, no set_seed() call precedes this.
model = get_model()
model = torch.compile(model)

In [13]:
# In the recorded output the first step (~27 s) and the last step (~41 s) are far
# slower than the others. The first coincides with the first call of the compiled
# model; the cause of the last-step spike is not established here — TODO investigate
# (presumably related to the final batch being smaller than the others).
fit(model, xtra_cbs=[TimeCallback()])

loss,epoch,train
9.02,0,train


step 0, loss: 11.01, time: 26799.57msi, tok/sec: 76
step 1, loss: 9.41, time: 467.46msi, tok/sec: 4381
step 2, loss: 9.24, time: 482.88msi, tok/sec: 4241
step 3, loss: 8.77, time: 472.84msi, tok/sec: 4331
step 4, loss: 8.70, time: 479.20msi, tok/sec: 4274
step 5, loss: 8.52, time: 478.02msi, tok/sec: 4284
step 6, loss: 8.21, time: 474.55msi, tok/sec: 4316
step 7, loss: 8.05, time: 40773.21msi, tok/sec: 38


<tinyai.learner.Learner at 0x7f9000bd4a30>

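`torch.compile` primarily helps memory-bandwidth-bound workloads: by fusing operations it reduces the number of data round trips between HBM (GPU main memory) and the SMs (streaming multiprocessors). In such workloads the SMs are so fast that they spend most of their time waiting for data to arrive from HBM. Older cards may not see much speedup because they are slower at computation, so memory traffic is less of a bottleneck for them.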

See https://huggingface.co/docs/transformers/perf_torch_compile for compile speed up benchmarks.

In [14]:
#| export
class CompileCB(Callback):
    """Callback that swaps the learner's model for its `torch.compile`d version."""

    def before_fit(self, learn):
        # Compile once, before training starts, and train the compiled module.
        compiled = torch.compile(learn.model)
        learn.model = compiled

## Flash attention

Flash attention is more memory efficient because it never materializes the full (T, T) attention matrix.


In [15]:
#| export
class FastCausalSelfAttention(CausalSelfAttention):
    """Causal self-attention that delegates to `F.scaled_dot_product_attention`.

    The parent's projections (`c_attn`, `c_proj`) and sizes (`n_head`,
    `n_embd`) are reused; only the attention computation itself is replaced by
    the fused call with `is_causal=True`, instead of explicitly building,
    masking and soft-maxing the (T, T) score matrix.
    """

    def forward(self, x):
        # B = batch size, T = sequence length, C = embedding width (nh * hs),
        # e.g. GPT-2 (124M): nh=12 heads of size hs=64 give C=768.
        B, T, C = x.size()
        head_size = C // self.n_head
        # One projection yields q, k and v stacked along the channel axis;
        # split them apart and move the head axis forward:
        # (B, T, C) -> (B, T, nh, hs) -> (B, nh, T, hs)
        q, k, v = [
            part.view(B, T, self.n_head, head_size).transpose(1, 2)
            for part in self.c_attn(x).split(self.n_embd, dim=2)
        ]
        # Fused causal attention; output has shape (B, nh, T, hs).
        heads = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        # Put the heads back side by side: (B, nh, T, hs) -> (B, T, C).
        merged = heads.transpose(1, 2).contiguous().view(B, T, C)
        # Output projection.
        return self.c_proj(merged)

In [16]:
# Default GPTConfig, with the attention module swapped for FastCausalSelfAttention.
model = GPT(GPTConfig(), proj=ResidualLinear, attn=FastCausalSelfAttention)

In [17]:
# CompileCB compiles the model in before_fit; TimeCallback prints per-step timing.
fit(model, xtra_cbs=[CompileCB(), TimeCallback()])

loss,epoch,train
8.989,0,train


step 0, loss: 10.98, time: 29787.93msi, tok/sec: 69
step 1, loss: 9.51, time: 464.56msi, tok/sec: 4408
step 2, loss: 9.15, time: 477.11msi, tok/sec: 4292
step 3, loss: 8.74, time: 470.22msi, tok/sec: 4355
step 4, loss: 8.63, time: 472.31msi, tok/sec: 4336
step 5, loss: 8.43, time: 467.48msi, tok/sec: 4381
step 6, loss: 8.17, time: 469.59msi, tok/sec: 4361
step 7, loss: 8.08, time: 354.83msi, tok/sec: 4329


<tinyai.learner.Learner at 0x7f8f42c61f90>

## Use kernel friendly numbers

Many CUDA kernels are written to process data in chunks whose size is a power of 2. If an input dimension does not divide evenly into those chunks, the kernel first handles the part that does, and then does some extra work to handle the remainder.

So look through the network code: if a dimension is a power of 2, or divisible by a large power of 2, it is a nice number. Otherwise it is an ugly number; see if you can increase it to the nearest nice number (e.g. `vocab_size` 50257 → 50304 = 128 × 393).


In [18]:
??GPTConfig

[0;31mInit signature:[0m
[0mGPTConfig[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mblock_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1024[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvocab_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m50257[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_layer[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m12[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_head[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m12[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_embd[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m768[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      GPTConfig(block_size: int = 1024, vocab_size: int = 50257, n_layer: int = 12, n_head: int = 12, n_embd: int = 768)
[0;31mSource:[0m        
[0;34m@[0m[0mdataclass[0m[0;34m[0m
[0;34m[0m[0;32mclass[0m [0mGPTConfig[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0mblock_size[0m[0;3

In [19]:
#| export
def get_model(proj=None, attn=None, vocab_size=50304):
    """Build a GPT with a kernel-friendly vocabulary size and fused attention.

    This definition replaces the `get_model(proj=...)` brought in by the star
    imports, so it keeps accepting `proj` to stay call-compatible with it.

    Args:
        proj: projection layer class; `None` selects `ResidualLinear`.
        attn: attention module class; `None` selects `FastCausalSelfAttention`.
        vocab_size: vocabulary size for `GPTConfig`. The default 50304 is the
            GPT-2 vocabulary (50257) rounded up to a multiple of 128.

    Returns:
        A freshly constructed `GPT` model.
    """
    # Defaults are resolved at call time so the definition itself does not
    # depend on these names already existing when the cell is executed.
    if proj is None:
        proj = ResidualLinear
    if attn is None:
        attn = FastCausalSelfAttention
    return GPT(GPTConfig(vocab_size=vocab_size), proj=proj, attn=attn)

In [20]:
# Build the model with the enlarged vocabulary (50304) and display its module tree.
model = get_model()
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): FastCausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): ResidualLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): ResidualLinear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

In [21]:
# Same compile + timing setup as the flash-attention run, now with the padded vocabulary.
fit(model, xtra_cbs=[CompileCB(), TimeCallback()])

loss,epoch,train
9.046,0,train


step 0, loss: 11.01, time: 30026.68msi, tok/sec: 68
step 1, loss: 9.60, time: 463.61msi, tok/sec: 4417
step 2, loss: 9.22, time: 470.80msi, tok/sec: 4350
step 3, loss: 8.82, time: 458.75msi, tok/sec: 4464
step 4, loss: 8.70, time: 465.70msi, tok/sec: 4398
step 5, loss: 8.52, time: 459.88msi, tok/sec: 4453
step 6, loss: 8.19, time: 458.02msi, tok/sec: 4471
step 7, loss: 8.08, time: 361.25msi, tok/sec: 4252


<tinyai.learner.Learner at 0x7f8f3b670100>