In [1]:
#| default_exp speedup

In [2]:
#| export
import random, math, torch, numpy as np, matplotlib.pyplot as plt
from tinyai.model import *
from tinyai.learner import *
from tinyai.hooks import *
from tinyai.init import *
import fastcore.all as fc
from functools import partial
import time

In [3]:
#| export
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
import tiktoken
import os

# GPT-2 tokenizer from tiktoken; `get_tokens` below reads this module-level name.
enc = tiktoken.get_encoding("gpt2")

def get_tokens(input_file):
    """Read a text file and return its contents encoded with the module-level `enc` tokenizer.

    Args:
        input_file: path of the text file to read.

    Returns:
        Whatever `enc.encode` returns for the file's full text (the token ids).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform/locale default encoding used by a bare `open()`.
    with open(input_file, encoding="utf-8") as f:
        text = f.read()
    return enc.encode(text)

# Locate the corpus relative to the current working directory and keep only
# the first 20000 tokens so each experiment stays short.
cwd = os.getcwd()
input_file = f"{cwd}/fast-nanogpt/input.txt"
tokens = get_tokens(input_file)[:20000]

# 80/20 split: leading tokens for training, the remainder for validation.
split_idx = int(len(tokens)*0.8)
train = tokens[:split_idx]
valid = tokens[split_idx:]

In [5]:
# Training dataset built from the training token ids, with T=512.
tds = DataSet(torch.tensor(train), T=512)
# vds = DataSet(torch.tensor(valid))
# Only a training set is supplied (None in the validation slot); batch size is 4.
dls = DataLoaders.from_dd([tds, None], batch_size=4)
# dls = DataLoaders.from_dd([tds, vds], batch_size=4)
# Pull one batch and display batch shapes, dataset length and number of batches.
x, y = next(iter(dls.train))
x.shape, y.shape, len(tds), len(dls.train)

(torch.Size([4, 512]), torch.Size([4, 512]), 31, 8)

In [6]:
# Activation-stats hook filtered to `Block` modules (not passed to any fit() call in the cells visible here).
stats = ActivationStats(fc.risinstance(Block))
# Default callback list; `fit` reads this module-level name each time it is called.
cbs = [TrainCB(), InitWeightsCB(), DeviceCB(), MetricsCB(), ProgressCB()]
def fit(model, epochs=1, xtra_cbs=None):
    """Train `model` on the notebook's `dls` with AdamW (lr=3e-4) and return the Learner.

    Args:
        model: the model handed to `Learner`.
        epochs: number of epochs passed to `Learner.fit`.
        xtra_cbs: optional extra callback(s) appended to the module-level `cbs`.

    Returns:
        The `Learner` after `fit` has run with `valid=False`.
    """
    # `cbs` and `dls` are looked up at call time, so rebinding them in a later
    # cell changes what subsequent calls use.
    all_cbs = cbs + fc.L(xtra_cbs)
    learner = Learner(model, dls=dls, opt_func=optim.AdamW, cbs=all_cbs, lr=3e-4)
    learner.fit(epochs, valid=False)
    return learner

In [7]:
??get_model

[0;31mSignature:[0m [0mget_model[0m[0;34m([0m[0mproj[0m[0;34m=[0m[0;34m<[0m[0;32mclass[0m [0;34m'tinyai.init.ResidualLinear'[0m[0;34m>[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mget_model[0m[0;34m([0m[0mproj[0m[0;34m=[0m[0mResidualLinear[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0mGPT[0m[0;34m([0m[0mGPTConfig[0m[0;34m([0m[0;34m)[0m[0;34m,[0m [0mproj[0m[0;34m=[0m[0mproj[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/playground/notebooks/fast-nanogpt/tinyai/init.py
[0;31mType:[0m      function

In [8]:
#| export
import time

class TimeCallback(Callback):
    """Print per-batch wall time and token throughput.

    The timer starts in `before_batch` and stops in `after_batch`. CUDA kernels
    are launched asynchronously, so the device is synchronized at both points
    (when CUDA is available); otherwise the interval can miss work that is
    still queued on the GPU.
    """

    @staticmethod
    def _sync():
        # Wait for outstanding GPU work so the timestamps bracket real compute.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def before_batch(self, learn):
        """Record the start time of the batch."""
        self._sync()
        # perf_counter is monotonic and higher resolution than time.time().
        self.t0 = time.perf_counter()

    def _log(self, d):
        # Intentionally a no-op; kept so the class's existing interface is unchanged.
        pass

    def after_batch(self, learn):
        """Print step index, loss, elapsed milliseconds and tokens per second."""
        self._sync()
        elapsed = time.perf_counter() - self.t0
        dt = elapsed * 1000
        x, _ = learn.batch
        # Tokens in the batch = rows * columns of the input tensor.
        n_tokens = x.shape[0] * x.shape[1]
        # Guard against a zero-length interval on coarse clocks.
        tokens_per_sec = n_tokens / elapsed if elapsed > 0 else float("inf")

        print(
            f"step {learn.iter}, loss: {learn.loss.item():.2f}, time: {dt:.2f}ms, tok/sec: {tokens_per_sec:.0f}"
        )

## Baseline

In [9]:
# Baseline: seeded run with the default callbacks and per-step timing, before any speed-up is applied.
set_seed(1337)
model = get_model()
fit(model, xtra_cbs=[TimeCallback()])

loss,epoch,train
9.016,0,train


step 0, loss: 10.95, time: 1293.64msi, tok/sec: 1583
step 1, loss: 9.47, time: 501.65msi, tok/sec: 4083
step 2, loss: 9.20, time: 509.18msi, tok/sec: 4022
step 3, loss: 8.89, time: 503.73msi, tok/sec: 4066
step 4, loss: 8.75, time: 512.82msi, tok/sec: 3994
step 5, loss: 8.46, time: 506.60msi, tok/sec: 4043
step 6, loss: 8.13, time: 513.54msi, tok/sec: 3988
step 7, loss: 8.04, time: 393.00msi, tok/sec: 3908


<tinyai.learner.Learner at 0x7f66b33d29e0>

## TODO: what are dtypes
![](https://devblogs.nvidia.com/wp-content/uploads/2020/05/TensorFloat32-TF32.jpg)

In [10]:
# Allow float32 matrix multiplications to use TensorFloat32 (TF32).
# TF32 tensor cores are available on Ampere GPUs (see the support matrix further down);
# on other cards this setting is not expected to change the timings.
torch.set_float32_matmul_precision('high')

In [11]:
# Free memory left over from the previous run before training again.
# NOTE(review): `clean_mem` comes from tinyai; its exact behavior is not shown here.
clean_mem()

In [12]:
# Fresh model trained after enabling 'high' matmul precision, to compare timings against the baseline.
model = get_model()
fit(model, xtra_cbs=[TimeCallback()])

loss,epoch,train
9.02,0,train


step 0, loss: 11.01, time: 582.29msi, tok/sec: 3517
step 1, loss: 9.41, time: 500.18msi, tok/sec: 4095
step 2, loss: 9.24, time: 519.80msi, tok/sec: 3940
step 3, loss: 8.77, time: 509.55msi, tok/sec: 4019
step 4, loss: 8.70, time: 523.91msi, tok/sec: 3909
step 5, loss: 8.52, time: 505.33msi, tok/sec: 4053
step 6, loss: 8.21, time: 513.93msi, tok/sec: 3985
step 7, loss: 8.05, time: 392.91msi, tok/sec: 3909


<tinyai.learner.Learner at 0x7f66b2bd0e50>

Enable [auto mixed precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html#adding-torch-autocast)

Lower precision can speed up both training and inference.

**Precision support matrix**

|             | Ampere                                       | Turing                 | Volta                  |
|-------------|----------------------------------------------|------------------------|------------------------|
| Tensor Core | FP64, TF32, bfloat16, FP16, INT8, INT4, INT1 | FP16, INT8, INT4, INT1 | FP16                   |
| CUDA® Core  | FP64, FP32, FP16, bfloat16, INT8             | FP64, FP32, FP16, INT8 | FP64, FP32, FP16, INT8 |

In [13]:
#| export
# 16-bit dtype used for autocast: bfloat16 when the current CUDA device
# supports it, float16 otherwise. `is_available()` is checked first so that
# importing this module on a CPU-only host falls back to float16 instead of
# depending on how `is_bf16_supported()` behaves without a CUDA device.
torch_dtype_float16 = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

In [14]:
#| export
class MixedPrecisionTrainCB(TrainCB):
    """`TrainCB` variant whose forward pass runs inside a `torch.autocast` region."""

    def predict(self, learn):
        """Run the model on the current batch under autocast and store preds and loss on `learn`.

        Autocast is enabled only while `learn.training` is truthy and uses the
        module-level `torch_dtype_float16` as its dtype.
        """
        # NOTE(review): when the dtype resolves to float16, PyTorch's AMP recipe
        # pairs autocast with gradient scaling; no scaler is visible in this
        # class — confirm whether TrainCB's backward/step need one.
        amp_ctx = torch.autocast(
            device_type=default_device,
            enabled=learn.training,
            dtype=torch_dtype_float16,
        )
        with amp_ctx:
            # The model returns a (predictions, loss) pair.
            outputs = learn.model(*learn.batch)
            learn.preds, learn.loss = outputs


In [15]:
# Show which branch the bf16/fp16 dtype selection takes on this machine.
torch.cuda.is_bf16_supported()

False

In [16]:
# Rebind the module-level callback list so that later fit() calls use the autocast training callback.
cbs = [MixedPrecisionTrainCB(), InitWeightsCB(), DeviceCB(), MetricsCB(), ProgressCB()]

In [17]:
# Mixed-precision run. NOTE(review): this reuses the `model` object from the previous run
# rather than creating a new one; InitWeightsCB presumably re-initializes it — confirm.
fit(model, xtra_cbs=[TimeCallback()])

loss,epoch,train
9.247,0,train


step 0, loss: 10.93, time: 215.91msi, tok/sec: 9486
step 1, loss: 9.53, time: 210.75msi, tok/sec: 9718
step 2, loss: 9.31, time: 210.66msi, tok/sec: 9722
step 3, loss: 8.97, time: 210.56msi, tok/sec: 9726
step 4, loss: 8.97, time: 212.47msi, tok/sec: 9639
step 5, loss: 8.85, time: 211.79msi, tok/sec: 9670
step 6, loss: 8.63, time: 211.27msi, tok/sec: 9694
step 7, loss: 8.62, time: 168.40msi, tok/sec: 9121


<tinyai.learner.Learner at 0x7f66b2d9f790>

## TODO: compile
1. gelu example
2. why compile? explain hbm to sm round trip
3. troubleshooting: no speed-up on older cards

In [18]:
# Fresh model wrapped with torch.compile.
model = get_model()
model = torch.compile(model)

In [19]:
# Train the compiled model. Step 0 is much slower than the rest (presumably compilation).
# The last step is slow again — likely a recompile for the smaller final batch
# (31 samples with batch size 4 leaves a batch of 3); confirm.
fit(model, xtra_cbs=[TimeCallback()])

loss,epoch,train
9.3,0,train


step 0, loss: 10.98, time: 20864.58msi, tok/sec: 98
step 1, loss: 9.48, time: 200.13msi, tok/sec: 10234
step 2, loss: 9.37, time: 177.37msi, tok/sec: 11547
step 3, loss: 9.02, time: 175.94msi, tok/sec: 11640
step 4, loss: 9.06, time: 175.40msi, tok/sec: 11676
step 5, loss: 8.94, time: 175.58msi, tok/sec: 11664
step 6, loss: 8.73, time: 176.13msi, tok/sec: 11627
step 7, loss: 8.66, time: 18331.78msi, tok/sec: 84


<tinyai.learner.Learner at 0x7f66b23ceb00>

`torch.compile` primarily helps memory-bandwidth-bound workloads: by fusing operations it reduces the number of round trips between HBM (GPU main memory) and the SMs (streaming multiprocessors), which are otherwise so fast that they sit idle waiting for data to arrive from HBM. Older cards may not see much speed-up because they are limited by compute rather than by memory bandwidth.

See https://huggingface.co/docs/transformers/perf_torch_compile for compile speed up benchmarks.

In [20]:
#| export
class CompileCB(Callback):
    """Replace the learner's model with its `torch.compile`d version when fitting starts."""

    def before_fit(self, learn):
        """Compile `learn.model` once; do nothing if it is already a compiled wrapper."""
        # The module returned by torch.compile exposes the wrapped model as
        # `_orig_mod`. Skipping in that case keeps repeated fit() calls on the
        # same learner from nesting one compiled wrapper inside another.
        if hasattr(learn.model, "_orig_mod"):
            return
        learn.model = torch.compile(learn.model)

## Flash attention

Flash attention is more memory-efficient: it never materializes the full attention matrix.


In [21]:
#| export
class FastCausalSelfAttention(CausalSelfAttention):
    """Causal self-attention that delegates the attention computation to
    `F.scaled_dot_product_attention` instead of building the (T, T) score matrix by hand."""

    def forward(self, x):
        """Apply multi-head causal self-attention to `x` of shape (B, T, C) and return (B, T, C)."""
        # B = batch size, T = sequence length, C = embedding width (nh * hs)
        B, T, C = x.size()
        head_size = C // self.n_head

        def split_heads(t):
            # (B, T, C) -> (B, T, nh, hs) -> (B, nh, T, hs): heads become a batch-like dim
            return t.view(B, T, self.n_head, head_size).transpose(1, 2)

        # One projection produces q, k and v side by side along the channel dim.
        q, k, v = (split_heads(t) for t in self.c_attn(x).split(self.n_embd, dim=2))

        # Stands in for the explicit scale -> causal mask -> softmax -> (att @ v) sequence.
        heads_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # (B, nh, T, hs) -> (B, T, C): put the heads back side by side
        merged = heads_out.transpose(1, 2).contiguous().view(B, T, C)

        # output projection
        return self.c_proj(merged)

In [22]:
# Default GPT config, but with the scaled_dot_product_attention-based attention class.
model = GPT(GPTConfig(), proj=ResidualLinear, attn=FastCausalSelfAttention)

In [23]:
# Train with the new attention class; CompileCB compiles the model at the start of fit.
fit(model, xtra_cbs=[CompileCB(), TimeCallback()])

loss,epoch,train
9.255,0,train


step 0, loss: 10.79, time: 14346.44msi, tok/sec: 143
step 1, loss: 9.44, time: 166.82msi, tok/sec: 12277
step 2, loss: 9.81, time: 160.57msi, tok/sec: 12755
step 3, loss: 8.95, time: 158.10msi, tok/sec: 12954
step 4, loss: 8.85, time: 157.99msi, tok/sec: 12963
step 5, loss: 8.79, time: 158.66msi, tok/sec: 12908
step 6, loss: 8.63, time: 157.99msi, tok/sec: 12963
step 7, loss: 8.60, time: 14282.34msi, tok/sec: 108


<tinyai.learner.Learner at 0x7f66e8f6d210>

## Use kernel friendly numbers

Many CUDA kernels are written to work on chunks whose sizes are powers of 2. If an input dimension does not divide evenly into such chunks, the kernel handles the aligned part first and then does extra work to handle the remainder.

So look through the network definition for its sizes: a number that is divisible by a large power of 2 is a "nice" number; otherwise it is an "ugly" number, and you should see whether you can increase it to the nearest nice one. Here `vocab_size` is raised from 50257 to 50304 (= 393 × 128).


In [24]:
??GPTConfig

[0;31mInit signature:[0m
[0mGPTConfig[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mblock_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1024[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvocab_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m50257[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_layer[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m12[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_head[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m12[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_embd[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m768[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      GPTConfig(block_size: int = 1024, vocab_size: int = 50257, n_layer: int = 12, n_head: int = 12, n_embd: int = 768)
[0;31mSource:[0m        
[0;34m@[0m[0mdataclass[0m[0;34m[0m
[0;34m[0m[0;32mclass[0m [0mGPTConfig[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0mblock_size[0m[0;3

In [25]:
#| export
def get_model(proj=ResidualLinear, attn=FastCausalSelfAttention, vocab_size=50304):
    """Build the GPT model used for the speed-up experiments.

    This definition shadows the `get_model(proj=...)` imported from
    `tinyai.init`, so it keeps that function's `proj` parameter to stay
    call-compatible with it.

    Args:
        proj: class passed to `GPT` as `proj`.
        attn: attention class passed to `GPT` as `attn`.
        vocab_size: vocabulary size for `GPTConfig`. The default 50304
            (= 393 * 128) pads GPT-2's 50257 up to a multiple of 128.

    Returns:
        A `GPT` instance; all other `GPTConfig` fields keep their defaults.
    """
    return GPT(GPTConfig(vocab_size=vocab_size), proj=proj, attn=attn)

In [26]:
# Build the model with the padded vocabulary and display its module tree.
model = get_model()
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): FastCausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): ResidualLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): ResidualLinear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

In [27]:
# Train the padded-vocabulary model, compiled at the start of fit, with per-step timing.
fit(model, xtra_cbs=[CompileCB(), TimeCallback()])

loss,epoch,train
9.126,0,train


step 0, loss: 10.95, time: 14853.84msi, tok/sec: 138
step 1, loss: 9.40, time: 187.86msi, tok/sec: 10902
step 2, loss: 9.05, time: 140.23msi, tok/sec: 14604
step 3, loss: 8.84, time: 140.18msi, tok/sec: 14610
step 4, loss: 8.85, time: 140.44msi, tok/sec: 14583
step 5, loss: 8.70, time: 140.34msi, tok/sec: 14593
step 6, loss: 8.52, time: 140.44msi, tok/sec: 14583
step 7, loss: 8.57, time: 14763.75msi, tok/sec: 104


<tinyai.learner.Learner at 0x7f66e1868040>