In [None]:
#| default_exp speedup

In [7]:
#| export
import random, math, torch, numpy as np, matplotlib.pyplot as plt
from tinyai.learner import *
from tinyai.model import *
from tinyai.hooks import *
from tinyai.init import *
import fastcore.all as fc
from functools import partial
import time

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
import tiktoken
import os

# Tokenizer obtained from tiktoken's "gpt2" encoding; get_tokens() reads this module-level object.
enc = tiktoken.get_encoding("gpt2")

def get_tokens(input_file, encoding="utf-8"):
    """Read a text file and tokenize its full contents with the module-level `enc`.

    Args:
        input_file: Path of the text file to tokenize.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        Whatever `enc.encode` returns for the file's text (the token ids).
    """
    # An explicit encoding keeps the decoded text identical across platforms.
    with open(input_file, encoding=encoding) as f:
        text = f.read()
    return enc.encode(text)

# Locate the corpus relative to the directory the notebook is run from.
cwd = os.getcwd()
input_file = f"{cwd}/fast-nanogpt/input.txt"

# Keep only the first 20,000 tokens so each experiment stays quick.
tokens = get_tokens(input_file)[:20000]

# 80/20 split: the first 80% of the tokens train, the remainder validates.
split_idx = int(len(tokens) * 0.8)
train = tokens[:split_idx]
valid = tokens[split_idx:]

In [4]:
# Wrap the training tokens in a DataSet. T appears to be the sequence length:
# the recorded output below shows batches of shape (4, 512).
tds = DataSet(torch.tensor(train), T=512)
# vds = DataSet(torch.tensor(valid))
# Only a training set is supplied (the second slot is None); the variant with a
# validation set is kept commented out.
dls = DataLoaders.from_dd([tds, None], batch_size=4)
# dls = DataLoaders.from_dd([tds, vds], batch_size=4)
# Sanity check: pull one batch and display its shapes plus dataset / loader lengths.
x, y = next(iter(dls.train))
x.shape, y.shape, len(tds), len(dls.train)

(torch.Size([4, 512]), torch.Size([4, 512]), 31, 8)

In [21]:
# Activation-statistics callback built with an isinstance(Block) filter.
# NOTE(review): `stats` is not included in `cbs` or passed to fit() anywhere in
# this section — confirm whether it is still needed.
stats = ActivationStats(fc.risinstance(Block))
# Base callbacks used by every fit() call; the same instances are reused across runs.
cbs = [TrainCB(), InitWeightsCB(), DeviceCB(), MetricsCB(), ProgressCB()]
def fit(model, epochs=1, xtra_cbs=None):
    """Train `model` on the notebook-level `dls` and return the Learner.

    Args:
        model: The model to train.
        epochs: Number of epochs passed to `Learner.fit`.
        xtra_cbs: Optional extra callbacks appended to the shared `cbs` list.

    Returns:
        The `Learner` after training (AdamW, lr=3e-4, validation disabled).
    """
    # fc.L(None) is empty, so omitting xtra_cbs just uses the base callbacks.
    all_cbs = cbs + fc.L(xtra_cbs)
    learner = Learner(model, dls=dls, lr=3e-4, cbs=all_cbs, opt_func=optim.AdamW)
    learner.fit(epochs, valid=False)
    return learner

In [22]:
import time


class TimeCallback(Callback):
    """Print per-batch wall-clock time and token throughput.

    The timer starts in `before_batch` and stops in `after_batch`. CUDA work is
    queued asynchronously, so the device is synchronized before each timer read;
    otherwise the measured interval can exclude kernels that are still running.
    """

    @staticmethod
    def _sync():
        """Block until queued CUDA kernels finish; no-op when CUDA is not in use."""
        if torch.cuda.is_available() and torch.cuda.is_initialized():
            torch.cuda.synchronize()

    def before_batch(self, learn):
        """Record the start time of the batch."""
        self._sync()
        # perf_counter is monotonic and high-resolution, unlike time.time().
        self.t0 = time.perf_counter()

    def after_batch(self, learn):
        """Print step index, loss, elapsed milliseconds and tokens per second."""
        self._sync()
        elapsed = time.perf_counter() - self.t0
        x, _ = learn.batch
        # Tokens in the batch = product of the first two dims of the input tensor.
        tokens_per_sec = x.shape[0] * x.shape[1] / elapsed
        print(f"step {learn.iter}, loss: {learn.loss.item():.2f}, time: {elapsed * 1000:.2f}ms, tok/sec: {tokens_per_sec:.0f}")

## Baseline

In [10]:
# Baseline: fix the seed (set_seed presumably seeds the RNGs — confirm), build an
# uncompiled model, and train for one epoch (fit's default) with per-step timing.
set_seed(1337)
model = get_model()
# fit() is the last expression, so the returned Learner's repr is shown as cell output.
fit(model, xtra_cbs=[TimeCallback()])

loss,epoch,train
9.016,0,train


step 0, loss: 10.95, time: 1522.80msi, tok/sec: 1345
step 1, loss: 9.47, time: 479.22msi, tok/sec: 4274
step 2, loss: 9.20, time: 496.17msi, tok/sec: 4128
step 3, loss: 8.89, time: 484.28msi, tok/sec: 4229
step 4, loss: 8.75, time: 488.27msi, tok/sec: 4194
step 5, loss: 8.46, time: 484.12msi, tok/sec: 4230
step 6, loss: 8.13, time: 487.60msi, tok/sec: 4200
step 7, loss: 8.04, time: 374.17msi, tok/sec: 4105


<tinyai.learner.Learner at 0x7efde1c87070>

## TODO: explain dtypes
1. A100 architecture reference
2. Automatic mixed precision (AMP) reference

In [None]:
# Allow float32 matrix multiplications to use TensorFloat32 (TF32), trading a
# little precision for speed.
# TF32 needs hardware support (NVIDIA Ampere or newer GPUs); on older cards
# expect no speedup from this setting.
torch.set_float32_matmul_precision('high')

In [18]:
# Release memory left over from the previous run before building the next model.
# NOTE(review): clean_mem is not defined in this notebook; presumably it comes
# from the tinyai star imports — confirm what it actually frees.
clean_mem()

## TODO: compile
1. GELU example
2. Why compile? Explain the HBM-to-SM round trip
3. Troubleshooting: no speedup on older cards

In [19]:
# Build a fresh model and wrap it with torch.compile.
# fullgraph=True requires the whole model to be captured as a single graph and
# raises an error on a graph break instead of silently falling back.
# NOTE(review): unlike the baseline cell, set_seed(1337) is not called before
# get_model() here, so the initial weights may differ from the baseline run.
model1 = get_model()
model1 = torch.compile(model1, fullgraph=True)

In [23]:
# Same training run as the baseline, now with the compiled model. In the recorded
# output step 0 takes ~40 s, presumably because compilation is triggered on the
# first call; compare the later steps against the baseline instead.
fit(model1, xtra_cbs=[TimeCallback()])

loss,epoch,train
8.896,0,train


step 0, loss: 10.91, time: 40303.59msi, tok/sec: 51
step 1, loss: 9.42, time: 462.27msi, tok/sec: 4430
step 2, loss: 8.97, time: 477.25msi, tok/sec: 4291
step 3, loss: 8.66, time: 465.62msi, tok/sec: 4398
step 4, loss: 8.54, time: 471.61msi, tok/sec: 4343
step 5, loss: 8.38, time: 472.55msi, tok/sec: 4334
step 6, loss: 8.06, time: 468.77msi, tok/sec: 4369
step 7, loss: 8.01, time: 367.74msi, tok/sec: 4177


<tinyai.learner.Learner at 0x7fa3cd55b0d0>

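`torch.compile` primarily helps memory-bandwidth-bound workloads: by fusing operations into fewer kernels, it reduces the number of data round trips between high-bandwidth memory (HBM) and the streaming multiprocessors (SMs). In such workloads the SMs are so fast that they spend much of their time waiting for data to arrive from HBM. Older cards may not see much speedup because they are limited by computation rather than by memory bandwidth.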

See https://huggingface.co/docs/transformers/perf_torch_compile for compile speed up benchmarks.