# Setup and Installations

In [None]:
import io
import numpy as np
from time import perf_counter

from torch import nn
import torchvision.models as models
import torch.onnx


# Model and Data Load

In [None]:
# Instantiate ResNet-18 with its ImageNet-1k (V1) checkpoint. The `weights=` enum is the
# replacement for the deprecated `pretrained=True` flag and selects the same checkpoint file.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [None]:
# Put the network into evaluation mode before it is exported and run. Because this call is
# the last expression in the cell, its return value is displayed as the layer listing below.
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
# Random stand-in for a single 3-channel 224x224 image. It is the example input handed to
# the ONNX exporter and the input used to compare PyTorch against ONNX Runtime.
dummy_input = torch.randn((1, 3, 224, 224))

# ONNX Compilation, Export, and ONNX Runtime Inference

In [None]:
# Names given to the exported graph's input and output tensors.
input_names = ["actual_input"]
output_names = ["output"]

# Export the model to "resnet18.onnx" using `dummy_input` as the example input.
# export_params=True stores the trained weights inside the file.
torch.onnx.export(
    model,
    dummy_input,
    "resnet18.onnx",
    export_params=True,
    input_names=input_names,
    output_names=output_names,
    verbose=False,
)

In [None]:
!pip install onnx
!pip install onnxruntime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting onnx
  Downloading onnx-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<4,>=3.20.2
  Downloading protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: protobuf, onnx
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.6
    Uninstalling protobuf-3.19.6:
      Successfully uninstalled protobuf-3.19.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.11.0 requires protobuf<3.20

In [None]:
import onnxruntime

# Name the execution provider explicitly: GPU-enabled onnxruntime builds (1.9 and later)
# refuse to create a session without one, and it makes clear this check runs on the CPU.
ort_session = onnxruntime.InferenceSession("resnet18.onnx", providers=["CPUExecutionProvider"])


def to_numpy(tensor):
    """Return `tensor` as a NumPy array on the CPU, detaching it first if it tracks gradients."""
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()


# Run the exported graph with ONNX Runtime on the same input that was used for the export.
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(dummy_input)}
ort_outs = ort_session.run(None, ort_inputs)

# Reference result from PyTorch. no_grad() avoids recording an autograd graph for a
# forward pass whose gradients are never needed.
with torch.no_grad():
    torch_out = model(dummy_input)

# Raises AssertionError if the two results differ beyond the given tolerances.
np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)

# ONNX Static Graph Optimization

In [None]:
!pip install onnxoptimizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting onnxoptimizer
  Downloading onnxoptimizer-0.3.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (645 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m645.8/645.8 KB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnxoptimizer
Successfully installed onnxoptimizer-0.3.9


In [None]:
# Run onnxoptimizer's command-line entry point on resnet18.onnx and write the result to
# resnet18_opt.onnx, the file that the timing and quantization cells below read.
!python -m onnxoptimizer resnet18.onnx resnet18_opt.onnx

In [None]:
import time


def to_numpy(tensor):
    """Return `tensor` as a NumPy array on the CPU, detaching it first if it tracks gradients."""
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()


def time_ort_model_evaluation(model_path, num_runs=10, warmup_runs=1, providers=None):
    """Return the mean latency, in milliseconds, of single-sample inferences on an ONNX model.

    Args:
        model_path: Path of the ONNX file to load into an ONNX Runtime session.
        num_runs: Number of timed inferences that are averaged (default 10).
        warmup_runs: Untimed inferences executed first, so that one-time start-up cost is
            not folded into the average (default 1).
        providers: Optional execution-provider list forwarded to ``InferenceSession``;
            ``None`` keeps the runtime's own default selection.

    Returns:
        Mean wall-clock time of one ``session.run`` call, in milliseconds.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1 (the mean would be undefined).
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    session = onnxruntime.InferenceSession(model_path, sess_options, providers=providers)
    input_name = session.get_inputs()[0].name  # loop-invariant, looked up once

    for _ in range(warmup_runs):
        session.run(None, {input_name: to_numpy(torch.randn(1, 3, 224, 224))})

    time_per_inference = []
    for _ in range(num_runs):
        # A fresh random input per run; it is built outside the timed region.
        ort_inputs = {input_name: to_numpy(torch.randn(1, 3, 224, 224))}
        start = perf_counter()
        session.run(None, ort_inputs)
        time_per_inference.append(1000 * (perf_counter() - start))

    return np.mean(time_per_inference)


# The labels give the unit and do not name a device: nothing in this cell selects a GPU
# execution provider, so the device depends on the installed onnxruntime build.
print('Average runtime of ONNX Model (ms): ' + str(time_ort_model_evaluation('resnet18.onnx')))
print('Average runtime of ONNX Optimized Model (ms): ' + str(time_ort_model_evaluation('resnet18_opt.onnx')))


Average runtime of ONNX Model in GPU: 52.80123090000188
Average runtime of ONNX Optimized Model in GPU: 43.55868289999307


# ONNX Runtime Optimization - Quantization

In [None]:
import os


def quantize_onnx_model(onnx_model_path, quantized_model_path):
    """Write a dynamically quantized copy of an ONNX model and report where it was saved.

    Args:
        onnx_model_path: Path of the ONNX model to quantize.
        quantized_model_path: Path the quantized model is written to.
    """
    from onnxruntime.quantization import quantize_dynamic, QuantType

    # quantize_dynamic is given the model's path, so the model is not loaded separately here.
    # Weights are stored as unsigned 8-bit integers (QuantType.QInt8 is the signed alternative).
    quantize_dynamic(onnx_model_path,
                     quantized_model_path,
                     weight_type=QuantType.QUInt8)

    print(f"quantized model saved to:{quantized_model_path}")


quantize_onnx_model('resnet18_opt.onnx', 'resnet18_opt_quant.onnx')

# File sizes on disk, converted from bytes by dividing by 1024 * 1024.
print('ONNX full precision model size (MB):', os.path.getsize("resnet18_opt.onnx")/(1024*1024))
print('ONNX quantized model size (MB):', os.path.getsize("resnet18_opt_quant.onnx")/(1024*1024))

quantized model saved to:resnet18_opt_quant.onnx
ONNX full precision model size (MB): 44.58288764953613
ONNX quantized model size (MB): 11.200801849365234


**Quantization on GPU**


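Hardware support is required to achieve better performance with quantization on GPUs. You need a device that supports Tensor Core int8 computation, such as a T4 or A100; older hardware will not benefit from quantization. Note that a smaller model file does not by itself guarantee faster inference: in the timing comparison recorded below, the quantized model is slower than the full-precision one.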

# Time Comparison of ONNX and ONNX Quantized Models

In [None]:
import time

# `to_numpy` and `time_ort_model_evaluation` are defined once in the static-graph timing
# cell above and reused here rather than being re-defined with identical bodies.
# The labels give the unit (milliseconds) and do not name a device: nothing in this
# notebook selects a GPU or TPU execution provider for these sessions.
print('Average runtime of ONNX Model (ms): ' + str(time_ort_model_evaluation('resnet18.onnx')))
print('Average runtime of ONNX Quantized Model (ms): ' + str(time_ort_model_evaluation('resnet18_opt_quant.onnx')))


Average runtime of ONNX Model in GPU: 30.944632099988212
Average runtime of ONNX Quantized Model in GPU: 50.33703259999811


# Visualizing ONNX Models

In [None]:
!pip install netron

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting netron
  Downloading netron-6.7.6-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: netron
Successfully installed netron-6.7.6


In [None]:
import netron
import portpicker
from google.colab import output

# Started without a port, netron chooses one itself (the recorded run reported
# localhost:23825), so an iframe bound to a hard-coded 8081 does not point at the server.
# Pick a free port up front and give the same number to both the server and the iframe.
port = portpicker.pick_unused_port()

# browse=False: the viewer is embedded below instead of being opened in a browser window.
with output.temporary():
    netron.start('resnet18.onnx', port, browse=False)

output.serve_kernel_port_as_iframe(port, height='800')

Serving 'resnet18.onnx' at http://localhost:23825


<IPython.core.display.Javascript object>

In [None]:
import portpicker
import netron
from google.colab import output

# Ask portpicker for a port that is currently free; the server and the iframe share it.
port = portpicker.pick_unused_port()

# Serve the quantized model with netron on that port, without opening a browser window.
# NOTE(review): output.temporary() presumably keeps the start-up message out of the saved
# cell output; confirm against the Colab documentation.
with output.temporary():
    netron.start('resnet18_opt_quant.onnx', port, browse=False)

# Embed the running viewer in the cell output as an 800-pixel-high iframe.
output.serve_kernel_port_as_iframe(port, height='800')


<IPython.core.display.Javascript object>

In [None]:
from google.colab import output
import netron
import portpicker

# A fresh unused port for this viewer instance, shared by the server and the iframe.
port = portpicker.pick_unused_port()

# Serve the original (non-quantized) export with netron; browse=False leaves display to
# the iframe call below.
# NOTE(review): output.temporary() presumably discards what netron prints on start-up;
# confirm against the Colab documentation.
with output.temporary():
    netron.start('resnet18.onnx', port, browse=False)

# Render the viewer inline at a height of 800 pixels.
output.serve_kernel_port_as_iframe(port, height='800')

<IPython.core.display.Javascript object>