# Benchmark of inference tools on GPU

Comparisson of different tools for model inference on GPU.

|                     |                                         |
|---------------------|-------------------------------------------|
|**Hard ware**         | Intel Ice Lake with NVIDIA® Tesla® T4 with 4 CPUs and 16 Gb RAM.  |
|**Software Platform**| host |
|**Tools to compare**| Torch Script, Torch Trace, ONNX, Open Vino|

### Common

In [2]:
%load_ext autoreload
%autoreload 2

In [22]:
import timm
import torch as th
import typing as tp
import numpy as np
import cv2
import yaml
import json
import pandas as pd

from src.utils import get_batch, MAX_UINT8, benchmark


DATA_DIR = "./images"
BATCH_SIZES = [1,2,4,8,16]
PLATFORM = "1GPU"
WARMUP = 5
N_RUNS = 10
VERBOSE = False
N_PRINT = 5
DEVICE = "cuda"


In [4]:
batches = dict()
for batch_sz in BATCH_SIZES:
    batch = get_batch(DATA_DIR, batch_sz)
    batches[batch_sz] = batch
    del batch

In [5]:
inference_records = []

### Torch Model

In [6]:
from src.model.torch import ModelTorch

In [12]:
records = []
with open("./config/torch.yaml") as fp:
    cfg_torch = yaml.safe_load(fp)

cfg_torch["device"] = DEVICE
model_torch = ModelTorch(cfg_torch)
for batch_sz in BATCH_SIZES:
    avg, stdev = benchmark(
        model=model_torch,
        input_shape=(batch_sz, 3, MAX_UINT8, MAX_UINT8),
        nwarmup=WARMUP,
        nruns=N_RUNS,
        print_step=N_PRINT,
        verbose=VERBOSE
    )
    records.append(
        {
            "time": avg,
            "platform": PLATFORM, 
            "batch_sz": batch_sz,
            "tool": "Torch"
        }
    )
inference_records.extend(records)
pd.DataFrame(records)


Warm up ...
Start timing ...
Iteration 10/20, avg batch time 40.72 ± 0.34 ms.
Iteration 20/20, avg batch time 40.96 ± 0.39 ms.
Input shape: (1, 3, 255, 255)
Output features size: (17,)
Average throughput: 24.413 images/second
Warm up ...
Start timing ...
Iteration 10/20, avg batch time 67.94 ± 1.47 ms.
Iteration 20/20, avg batch time 67.86 ± 1.13 ms.
Input shape: (2, 3, 255, 255)
Output features size: (2, 17)
Average throughput: 29.472 images/second
Warm up ...
Start timing ...
Iteration 10/20, avg batch time 128.25 ± 0.70 ms.
Iteration 20/20, avg batch time 128.64 ± 1.57 ms.
Input shape: (4, 3, 255, 255)
Output features size: (4, 17)
Average throughput: 31.094 images/second
Warm up ...
Start timing ...
Iteration 10/20, avg batch time 267.29 ± 2.03 ms.
Iteration 20/20, avg batch time 267.72 ± 1.60 ms.
Input shape: (8, 3, 255, 255)
Output features size: (8, 17)
Average throughput: 29.882 images/second
Warm up ...
Start timing ...
Iteration 10/20, avg batch time 559.57 ± 10.73 ms.
Iterat

Unnamed: 0,time,platform,batch_sz,tool
0,0.040963,1GPU,1,Torch
1,0.06786,1GPU,2,Torch
2,0.128642,1GPU,4,Torch
3,0.267724,1GPU,8,Torch
4,0.557194,1GPU,16,Torch


### TorchScript Model

In [14]:
from src.model.torch_jit import ModelTorchJIT

tmp = th.jit.script(model_torch.model)
th.jit.save(tmp, "weights/model_scripted.th")

with open("./config/torch_scripted.yaml") as fp:
    cfg_scripted = yaml.safe_load(fp)
cfg_scripted["device"] = "cuda"
model_scripted = ModelTorchJIT(cfg_scripted)
records = []
for batch_sz in BATCH_SIZES:
    avg, stdev = benchmark(
        model=model_scripted,
        input_shape=(batch_sz, 3, MAX_UINT8, MAX_UINT8),
        nwarmup=WARMUP,
        nruns=N_RUNS,
        print_step=N_PRINT,
        verbose=VERBOSE
    )
    records.append(
        {
            "time": avg,
            "platform": PLATFORM, 
            "batch_sz": batch_sz,
            "tool": "TorchScript"
        }
    )
inference_records.extend(records)
pd.DataFrame(records)


 does not have profile information (Triggered internally at /opt/pytorch/pytorch/third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  return forward_call(*args, **kwargs)


Unnamed: 0,time,platform,batch_sz,tool
0,0.004041,1GPU,1,TorchScript
1,0.005542,1GPU,2,TorchScript
2,0.007792,1GPU,4,TorchScript
3,0.009527,1GPU,8,TorchScript
4,0.015114,1GPU,16,TorchScript


### TorchTrace Model

In [None]:
from src.model.torch_jit import ModelTorchJIT

In [14]:
tmp = th.jit.trace(
    model_torch.model,
    th.rand(1, 3, MAX_UINT8, MAX_UINT8).to(th.float32)
)
th.jit.save(tmp, "weights/model_traced.th")

In [15]:
with open("./config/torch_traced.yaml") as fp:
    cfg_traced = yaml.safe_load(fp)
cfg_traced["device"] = DEVICE
model_traced = ModelTorchJIT(cfg_traced)
records = []
for batch_sz in BATCH_SIZES:
    avg, _=benchmark(
        model=model_traced,
        input_shape=(batch_sz, 3, MAX_UINT8, MAX_UINT8),
        nwarmup=WARMUP,
        nruns=N_RUNS,
        print_step=N_PRINT,
        verbose=VERBOSE
    )
    records.append(
        {
            "time": avg,
            "platform": PLATFORM, 
            "batch_sz": batch_sz,
            "tool": "TorchTrace"
        }
    )
inference_records.extend(records)
pd.DataFrame(records)


Warm up ...
Start timing ...
Iteration 5/10, avg batch time 3.99 ± 0.01 ms.
Iteration 10/10, avg batch time 4.01 ± 0.02 ms.
Input shape: (1, 3, 255, 255)
Output features size: (17,)
Average throughput: 249.401 images/second
Warm up ...
Start timing ...
Iteration 5/10, avg batch time 3.99 ± 0.01 ms.
Iteration 10/10, avg batch time 4.00 ± 0.01 ms.
Input shape: (1, 3, 255, 255)
Output features size: (17,)
Average throughput: 250.245 images/second
Warm up ...
Start timing ...
Iteration 5/10, avg batch time 4.00 ± 0.01 ms.
Iteration 10/10, avg batch time 4.00 ± 0.01 ms.
Input shape: (1, 3, 255, 255)
Output features size: (17,)
Average throughput: 249.701 images/second
Warm up ...
Start timing ...
Iteration 5/10, avg batch time 2.74 ± 0.01 ms.
Iteration 10/10, avg batch time 2.73 ± 0.01 ms.
Input shape: (1, 3, 255, 255)
Output features size: (17,)
Average throughput: 365.654 images/second
Warm up ...
Start timing ...
Iteration 5/10, avg batch time 2.73 ± 0.01 ms.
Iteration 10/10, avg batch t

Unnamed: 0,time,platform,batch_sz,tool
0,0.00401,1GPU,1,TorchTrace
1,0.003996,1GPU,2,TorchTrace
2,0.004005,1GPU,4,TorchTrace
3,0.002735,1GPU,8,TorchTrace
4,0.002717,1GPU,16,TorchTrace


### ONNX Model

In [17]:
from src.model.onnx import ModelONNX

In [18]:
th.onnx.export(
    model_torch.model,
    th.rand(1, 3, MAX_UINT8, MAX_UINT8).to(th.float32),
    "weights/model.onnx",
    verbose=True,
    input_names=['input'],
    output_names=['output'],
    dynamic_axes = {
        'input': [0], 
        'output': [0]
    },
)



Exported graph: graph(%input : Float(*, 3, 255, 255, strides=[195075, 65025, 255, 1], requires_grad=0, device=cpu),
      %fc.weight : Float(17, 512, strides=[512, 1], requires_grad=1, device=cpu),
      %fc.bias : Float(17, strides=[1], requires_grad=1, device=cpu),
      %onnx::Conv_193 : Float(64, 3, 7, 7, strides=[147, 49, 7, 1], requires_grad=0, device=cpu),
      %onnx::Conv_194 : Float(64, strides=[1], requires_grad=0, device=cpu),
      %onnx::Conv_196 : Float(64, 64, 3, 3, strides=[576, 9, 3, 1], requires_grad=0, device=cpu),
      %onnx::Conv_197 : Float(64, strides=[1], requires_grad=0, device=cpu),
      %onnx::Conv_199 : Float(64, 64, 3, 3, strides=[576, 9, 3, 1], requires_grad=0, device=cpu),
      %onnx::Conv_200 : Float(64, strides=[1], requires_grad=0, device=cpu),
      %onnx::Conv_202 : Float(64, 64, 3, 3, strides=[576, 9, 3, 1], requires_grad=0, device=cpu),
      %onnx::Conv_203 : Float(64, strides=[1], requires_grad=0, device=cpu),
      %onnx::Conv_205 : Float(64

In [19]:
with open("./config/onnx_gpu.yaml") as fp:
    cfg_onnx = yaml.safe_load(fp)
cfg_onnx["device"] = DEVICE
model_onnx = ModelONNX(cfg_onnx)
records = []
for batch_sz in BATCH_SIZES:
    avg, _ = benchmark(
        model=model_onnx,
        input_shape=(batch_sz, 3, MAX_UINT8, MAX_UINT8),
        nwarmup=WARMUP,
        nruns=N_RUNS,
        print_step=N_PRINT,
        verbose=VERBOSE
    )
    records.append(
        {
            "time": avg,
            "platform": PLATFORM, 
            "batch_sz": batch_sz,
            "tool": "TorchTrace"
        }
    )
inference_records.extend(records)
pd.DataFrame(records)


2023-05-21 22:23:22.976463627 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:541 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Please reference https://onnxruntime.ai/docs/reference/execution-providers/CUDA-ExecutionProvider.html#requirements to ensure all dependencies are met.


Unnamed: 0,time,platform,batch_sz,tool
0,0.018136,1GPU,1,TorchTrace
1,0.018139,1GPU,2,TorchTrace
2,0.018288,1GPU,4,TorchTrace
3,0.018123,1GPU,8,TorchTrace
4,0.017804,1GPU,16,TorchTrace


### OpenVino Model

In [None]:
# bash command to convert ONNX -> OpenVino
!mo --input_model weights/model.onnx --output_dir weights/openvino

In [31]:
from src.model.open_vino import ModelOpenVino

ModuleNotFoundError: No module named 'openvino'

In [None]:
with open("./config/onnx_gpu.yaml") as fp:
    cfg_ov = yaml.safe_load(fp)
cfg_ov["device"] = DEVICE

model_ov = ModelOpenVino(cfg_ov)
records = []
for batch_sz in BATCH_SIZES:
    avg, _ = benchmark(
        model=model_ov,
        input_shape=(batch_sz, 3, MAX_UINT8, MAX_UINT8),
        nwarmup=WARMUP,
        nruns=N_RUNS,
        print_step=N_PRINT,
        verbose=VERBOSE
    )
    records.append(
        {
            "time": avg,
            "platform": PLATFORM, 
            "batch_sz": batch_sz,
            "tool": "TorchTrace"
        }
    )
inference_records.extend(records)
pd.DataFrame(records)


### TensorRT32 Model

In [37]:
import tensorrt as trt
import torch_tensorrt
from src.model.torch_jit import ModelTorchJIT

In [33]:
trt_model = torch_tensorrt.compile(
    model_torch.model,                                       
    inputs = [torch_tensorrt.Input((1, 3, MAX_UINT8, MAX_UINT8))], 
    enabled_precisions = th.float32,                # <- изменения здесь
    workspace_size = 1 << 30,                     
)
th.jit.save(trt_model, "weights/model_trt_fp32.ts")

In [44]:
with open("./config/tensorrt_fp32.yaml") as fp:
    cfg_trt = yaml.safe_load(fp)
cfg_trt["device"] = DEVICE
model_trt = ModelTorchJIT(cfg_trt)
records = []
for batch_sz in BATCH_SIZES:
    avg, _ = benchmark(
        model=model_trt,
        input_shape=(batch_sz, 3, MAX_UINT8, MAX_UINT8),
        nwarmup=WARMUP,
        nruns=N_RUNS,
        print_step=N_PRINT,
        verbose=VERBOSE
    )
    records.append(
        {
            "time": avg,
            "platform": PLATFORM, 
            "batch_sz": batch_sz,
            "tool": "TensorRT32"
        }
    )
inference_records.extend(records)
pd.DataFrame(records)

(0.004198403358459473, 0.00020665497439227944)

### TensorRT16 Model

In [37]:
import tensorrt as trt
import torch_tensorrt
from src.model.torch_jit import ModelTorchJIT

In [49]:
trt_model = torch_tensorrt.compile(
    model_torch.model,                                       
    inputs = [torch_tensorrt.Input((1, 3, MAX_UINT8, MAX_UINT8))], 
    enabled_precisions = th.float16,                # <- изменения здесь
    workspace_size = 1 << 30,                     
)
th.jit.save(trt_model, "weights/model_trt_fp16.ts")



In [50]:
with open("./config/tensorrt_fp16.yaml") as fp:
    cfg_trt = yaml.safe_load(fp)
cfg_trt["device"] = DEVICE
model_trt = ModelTorchJIT(cfg_trt)
records = []
for batch_sz in BATCH_SIZES:
    avg, _ = benchmark(
        model=model_trt,
        input_shape=(batch_sz, 3, MAX_UINT8, MAX_UINT8),
        nwarmup=WARMUP,
        nruns=N_RUNS,
        print_step=N_PRINT,
        verbose=VERBOSE
    )
    records.append(
        {
            "time": avg,
            "platform": PLATFORM, 
            "batch_sz": batch_sz,
            "tool": "TensorRT32"
        }
    )
inference_records.extend(records)
pd.DataFrame(records)

(0.0028227734565734863, 4.588646928421068e-05)

In [23]:
with open(
    "results/inference_results_1gpu.json", 
    "w",
    encoding="utf8"
) as fp:
    json.dump(fp=fp, obj=inference_records)