In [1]:
import torch.utils.benchmark as benchmark
import torch
import torch.nn as nn
from torchvision.models import resnet
import torch._dynamo

device = torch.device("cuda") if torch.cuda.is_available() else "cpu"

# How to use `torch.compile()`

In [2]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(32, 64)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.nn.functional.relu(x)
        return x

model = MLP()
input = torch.randn(8, 32)

torch._dynamo.reset() # Only needed if you call this cell repeatedly
compiled_model = torch.compile(model)

# Alternatively you can also pass the backend
compiled_model = torch.compile(model, backend='inductor')

output = model(input)
# triggers compilation of forward graph on the first run
output_compiled = compiled_model(input)

torch.all(output == output_compiled)

tensor(True)

# Benchmark Resnet18

In [3]:
def run_batch_inference(model, batch=1):
    x = torch.randn(batch, 3, 224, 224).to(device)
    model(x)

def run_batch_train(model, optimizer, batch=16):
    x = torch.randn(batch, 3, 224, 224).to(device)
    optimizer.zero_grad()
    out = model(x)
    out.sum().backward()
    optimizer.step()
    
model = resnet.resnet18(weights=resnet.ResNet18_Weights.IMAGENET1K_V1).to(device)

In [4]:
batch = 16
torch._dynamo.reset()
compiled_model = torch.compile(model)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

t_model = benchmark.Timer(
    stmt='run_batch_train(model, optimizer, batch)',
    setup='from __main__ import run_batch_train',
    globals={'model': model,'optimizer':optimizer, 'batch':batch})

t_compiled_model = benchmark.Timer(
    stmt='run_batch_train(model, optimizer, batch)',
    setup='from __main__ import run_batch_train',
    globals={'model': compiled_model, 'optimizer':optimizer, 'batch':batch})

t_model_runs = t_model.timeit(100)
t_compiled_model_runs = t_compiled_model.timeit(100)

print(t_model_runs)
print(t_compiled_model_runs)

print(f"\nResnet18 Training speedup: {100*(t_model_runs.mean - t_compiled_model_runs.mean) / t_model_runs.mean: .2f}%")

<torch.utils.benchmark.utils.common.Measurement object at 0x7f1800eabb80>
run_batch_train(model, optimizer, batch)
setup: from __main__ import run_batch_train
  20.54 ms
  1 measurement, 100 runs , 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x7f17b8972e00>
run_batch_train(model, optimizer, batch)
setup: from __main__ import run_batch_train
  19.56 ms
  1 measurement, 100 runs , 1 thread

Resnet18 Training speedup:  4.78%


In [5]:
batch = 1
torch._dynamo.reset()
compiled_model = torch.compile(model, mode='reduce-overhead')

t_model = benchmark.Timer(
    stmt='run_batch_inference(model, batch)',
    setup='from __main__ import run_batch_inference',
    globals={'model': model, 'batch':batch})

t_compiled_model = benchmark.Timer(
    stmt='run_batch_inference(model, batch)',
    setup='from __main__ import run_batch_inference',
    globals={'model': compiled_model, 'batch':batch})

t_model_runs = t_model.timeit(100)
t_compiled_model_runs = t_compiled_model.timeit(100)

print(f"\nResnet18 Inference speedup: {100*(t_model_runs.mean - t_compiled_model_runs.mean) / t_model_runs.mean: .2f}%")


Resnet18 Inference speedup:  31.43%


# Huggingface

In [6]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset

def run_hf_inference(model, input_values):
    
    # retrieve logits
    logits = model(input_values).logits
    
    # take argmax and decode
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

In [7]:
# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").cuda()

# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# tokenize
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").input_values.cuda()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Found cached dataset librispeech_asr_dummy (/root/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr_dummy/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [9]:
batch = 1
torch._dynamo.reset()
compiled_model = torch.compile(model, mode='max-autotune')

t_model = benchmark.Timer(
    stmt='run_hf_inference(model, input_values)',
    setup='from __main__ import run_hf_inference',
    globals={'model': model, 'input_values':input_values})

t_compiled_model = benchmark.Timer(
    stmt='run_hf_inference(model, input_values)',
    setup='from __main__ import run_hf_inference',
    globals={'model': compiled_model, 'input_values':input_values})

t_model_runs = t_model.timeit(100)
t_compiled_model_runs = t_compiled_model.timeit(100)

print(f"\nHuggingface Inference speedup: {100*(t_model_runs.mean - t_compiled_model_runs.mean) / t_model_runs.mean: .2f}%")


Huggingface Inference speedup:  4.27%
