In [1]:
import torch
import time
import os

from torch import nn
import torchvision.models as models
from triton.testing import do_bench
import torch._dynamo

In [2]:
torch.set_float32_matmul_precision('high')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
def run_benchmark(fn):
    exec_time, prctl20, prctl80 = do_bench(fn,warmup=100,rep=1000)
    print(f"Exec time (median): {exec_time}")
    print(f"Exec time (20th percentile): {prctl20}")
    print(f"Exec time (80th percentile): {prctl80}\n")
    return exec_time

## 1. ResNet50 Speedup on NVIDIA A10G

In [4]:
def run_batch(model, optimizer):
    x = torch.randn(16, 3, 224, 224).to(device)
    optimizer.zero_grad()
    out = model(x)
    out.sum().backward()
    optimizer.step()

In [5]:
model = models.resnet50().to(device)

In [6]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Benchmark Eager
print("Resnet50 Eager mode")
exec_time = run_benchmark(lambda: run_batch(model, optimizer))

# Benchmark torch.compile defaults
print("Resnet50 Compiled defaults")
opt_model = torch.compile(model)
opt_exec_time = run_benchmark(lambda: run_batch(opt_model, optimizer))

# Print speedups
print(f"speedup: {100*(exec_time-opt_exec_time) / opt_exec_time: .2f}%")

Resnet50 Eager mode
Exec time (median): 50.48729705810547
Exec time (20th percentile): 50.44060134887695
Exec time (80th percentile): 50.53480911254883

Resnet50 Compiled defaults
Exec time (median): 46.86796569824219
Exec time (20th percentile): 46.85209655761719
Exec time (80th percentile): 46.89039611816406

speedup:  7.72%


## 2. Custom model Speedup on NVIDIA A10G

In [7]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1024, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        
    def forward(self, x):
        x = self.fc1(x).relu() ** 2
        return self.fc2(x).relu() ** 2

In [8]:
model = MLP().to(device)
x = torch.randn(1024, 1024).to(device)

In [10]:
# Benchmark Eager
exec_time = run_benchmark(lambda: model(x).sum().backward())

torch._dynamo.reset()
# Benchmark torch.compile defaults
cmodel = torch.compile(model, backend='inductor')
opt_exec_time = run_benchmark(lambda: cmodel(x).sum().backward())

# Print speedups
print(f"speedup: {100*(exec_time-opt_exec_time) / opt_exec_time: .2f}%")

Exec time (median): 0.7167999744415283
Exec time (20th percentile): 0.7147520184516907
Exec time (80th percentile): 0.7188479900360107

Exec time (median): 0.6021119952201843
Exec time (20th percentile): 0.6000639796257019
Exec time (80th percentile): 0.6031360030174255

speedup:  19.05%


## 3. HuggingFace model Speedup on NVIDIA A10G

In [11]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset

def run_inference(model, input_values):
    
    # retrieve logits
    logits = model(input_values).logits
    
    # take argmax and decode
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

In [12]:
# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").cuda()

# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# tokenize
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values.cuda()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Found cached dataset librispeech_asr_dummy (/root/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [13]:
exec_time = run_benchmark(lambda: run_inference(model, input_values))

torch._dynamo.reset()
model = torch.compile(model, mode="max-autotune")
opt_exec_time = run_benchmark(lambda: run_inference(model, input_values))

# Print speedups
print(f"speedup: {100*(exec_time-opt_exec_time) / opt_exec_time: .2f}%")

Exec time (median): 30.599727630615234
Exec time (20th percentile): 30.556320190429688
Exec time (80th percentile): 30.62679672241211

Exec time (median): 28.694766998291016
Exec time (20th percentile): 28.68025016784668
Exec time (80th percentile): 28.719148635864258

speedup:  6.64%


In [14]:
torch._dynamo.list_backends()

['aot_ts_nvfuser',
 'cudagraphs',
 'inductor',
 'ipex',
 'nvprims_nvfuser',
 'onnxrt',
 'tvm']