In [18]:
import torch
import torch
from triton.testing import do_bench
from torchvision.models import resnet

# Always a torch.device (never a bare "cpu" string), so every consumer sees
# one consistent type regardless of whether CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
def run_batch_train(model, optimizer, batch=16, device=None):
    """Run one training step (forward, backward, optimizer update) on random data.

    The summed model output is used as a stand-in loss; the point is to
    exercise the forward/backward/update path, not to learn anything.

    Args:
        model: Module called with a ``(batch, 3, 224, 224)`` float tensor.
        optimizer: Optimizer whose gradients are zeroed before the step and
            which is stepped after the backward pass.
        batch: Number of random images in the batch.
        device: Device on which the input is created. When ``None``, the
            device of the model's first parameter is used (CPU if the model
            has no parameters).

    Returns:
        None.
    """
    if device is None:
        # Follow the model instead of a notebook-level global, so the input
        # always lands where the weights live.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Allocate directly on the target device: no CPU staging tensor and no
    # host-to-device copy inside the timed region.
    x = torch.randn(batch, 3, 224, 224, device=device)

    optimizer.zero_grad()
    out = model(x)
    out.sum().backward()
    optimizer.step()
    
def run_batch_inference(model, batch=16, device=None):
    """Run one forward pass on random data with autograd disabled.

    The model's train/eval mode is left untouched; call ``model.eval()``
    beforehand if eval-mode behaviour is wanted.

    Args:
        model: Module called with a ``(batch, 3, 224, 224)`` float tensor.
        batch: Number of random images in the batch.
        device: Device on which the input is created. When ``None``, the
            device of the model's first parameter is used (CPU if the model
            has no parameters).

    Returns:
        None. The model output is discarded.
    """
    if device is None:
        # Follow the model instead of a notebook-level global, so the input
        # always lands where the weights live.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Allocate directly on the target device: no CPU staging tensor and no
    # host-to-device copy inside the timed region.
    x = torch.randn(batch, 3, 224, 224, device=device)

    # Inference needs no autograd graph; disabling it avoids recording one.
    with torch.no_grad():
        model(x)

In [20]:
# Number of random images fed to the model per timed step.
batch = 16

# Eager baseline: ResNet-152 built with the IMAGENET1K_V1 weights, then moved
# onto the selected device.
model = resnet.resnet152(weights=resnet.ResNet152_Weights.IMAGENET1K_V1)
model = model.to(device)

# Plain SGD over the model's parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# torch.compile wrapper around the same `model` object.
compiled_model = torch.compile(model)

In [21]:
import torch.utils.benchmark as benchmark

# Build one Timer per variant (eager first, compiled second). Both run the
# same statement and differ only in which module is bound to `model`.
t_model, t_compiled = (
    benchmark.Timer(
        stmt='run_batch_train(model, optimizer, batch)',
        setup='from __main__ import run_batch_train',
        globals={'model': candidate, 'optimizer': optimizer, 'batch': batch},
    )
    for candidate in (model, compiled_model)
)

In [23]:
# Mean time per call over 100 runs, eager timer first and compiled second.
t_model_avg, t_compiled_avg = [
    timer.timeit(100).mean for timer in (t_model, t_compiled)
]

In [24]:
# Both timers above run `run_batch_train`, so this figure is the relative
# reduction in *training-step* time, not inference time.
print(f"Training speedup: {100*(t_model_avg-t_compiled_avg) / t_model_avg: .2f}%")

Inference speedup:  3.38%


In [None]:
# Inference benchmark: eager (`t0`) vs. compiled (`t1`).
# `run_batch_inference` takes (model, batch) only, so the statement must not
# pass an optimizer, and setup must import the inference helper itself.
t0 = benchmark.Timer(
    stmt='run_batch_inference(model, batch)',
    setup='from __main__ import run_batch_inference',
    globals={'model': model, 'batch':batch})

t1 = benchmark.Timer(
    stmt='run_batch_inference(model, batch)',
    setup='from __main__ import run_batch_inference',
    globals={'model': compiled_model, 'batch':batch})

# Keep the mean time per call (as the training benchmark does), so the
# `_avg` names hold numbers rather than whole Measurement objects.
t0_avg = t0.timeit(100).mean
t1_avg = t1.timeit(100).mean

print(t0_avg)
print(t1_avg)