In [1]:

import numpy as np
import tensorrt as trt

from polygraphy.backend.onnx import modify_outputs, onnx_from_path, save_onnx
from polygraphy.backend.onnx.util import str_from_onnx, all_tensor_names
from polygraphy.backend.onnxrt import OnnxrtRunner, session_from_onnx
from polygraphy.backend.trt import CreateConfig, Profile, TrtRunner, TacticRecorder, TacticReplayer, TacticReplayData
from polygraphy.backend.trt import network_from_onnx_path, engine_from_network, create_config, save_engine
from polygraphy.common import TensorMetadata
from polygraphy.comparator import Comparator, CompareFunc, DataLoader
from polygraphy.json import load_json


# Path to the CRNN ONNX model that the notebook inspects, parses into a
# TensorRT network, and loads into an ONNX Runtime session.
ONNX_MODEL = "./models/crnn.onnx"

In [2]:
# Inspect the onnx model
# Load the ONNX protobuf once (it is reused when the outputs are modified) and
# print Polygraphy's text summary of it in 'attrs' mode.
onnx_proto = onnx_from_path(ONNX_MODEL)
onnx_str = str_from_onnx(
    onnx_proto,
    mode="attrs",
)
print(onnx_str)

[I] Loading model: ./models/crnn.onnx
Name: torch-jit-export | Opset: 11

---- 1 Graph Input(s) ----
{input_0 [dtype=float32, shape=('batch', 3, 32, 1024)]}

---- 1 Graph Output(s) ----
{output_0 [dtype=float32, shape=('batch', 256, 8103)]}

---- 86 Initializer(s) ----
{551 [dtype=float32, shape=(32, 3, 3, 3)],
 552 [dtype=float32, shape=(32,)],
 554 [dtype=float32, shape=(64, 32, 3, 3)],
 555 [dtype=float32, shape=(64,)],
 557 [dtype=float32, shape=(128, 64, 3, 3)],
 558 [dtype=float32, shape=(128,)],
 560 [dtype=float32, shape=(128, 64, 1, 1)],
 561 [dtype=float32, shape=(128,)],
 563 [dtype=float32, shape=(128, 128, 3, 3)],
 564 [dtype=float32, shape=(128,)],
 566 [dtype=float32, shape=(128, 128, 1, 1)],
 567 [dtype=float32, shape=(128,)],
 569 [dtype=float32, shape=(256, 128, 3, 3)],
 570 [dtype=float32, shape=(256,)],
 572 [dtype=float32, shape=(256, 128, 1, 1)],
 573 [dtype=float32, shape=(256,)],
 575 [dtype=float32, shape=(256, 256, 3, 3)],
 576 [dtype=float32, shape=(256,)],
 

In [3]:
# Modify the onnx model's outputs

# Mark tensors 646 and 351 as the graph outputs so their values can be
# inspected; use constants.MARK_ALL instead to mark every tensor as an output.
# NOTE(review): modify_outputs may edit onnx_proto in place rather than working
# on a copy -- confirm before reusing onnx_proto after this cell.
modified = modify_outputs(onnx_proto, ['646', '351'])
onnx_modified = str_from_onnx(modified, mode='attrs')
print(onnx_modified)
# The trailing semicolon hides the value returned by save_onnx without
# rebinding `_`, which IPython uses for the last cell output.
save_onnx(modified, "modified.onnx");

Name: torch-jit-export | Opset: 11

---- 1 Graph Input(s) ----
{input_0 [dtype=float32, shape=('batch', 3, 32, 1024)]}

---- 2 Graph Output(s) ----
{646 [shape=()],
 351 [shape=()]}

---- 86 Initializer(s) ----
{551 [dtype=float32, shape=(32, 3, 3, 3)],
 552 [dtype=float32, shape=(32,)],
 554 [dtype=float32, shape=(64, 32, 3, 3)],
 555 [dtype=float32, shape=(64,)],
 557 [dtype=float32, shape=(128, 64, 3, 3)],
 558 [dtype=float32, shape=(128,)],
 560 [dtype=float32, shape=(128, 64, 1, 1)],
 561 [dtype=float32, shape=(128,)],
 563 [dtype=float32, shape=(128, 128, 3, 3)],
 564 [dtype=float32, shape=(128,)],
 566 [dtype=float32, shape=(128, 128, 1, 1)],
 567 [dtype=float32, shape=(128,)],
 569 [dtype=float32, shape=(256, 128, 3, 3)],
 570 [dtype=float32, shape=(256,)],
 572 [dtype=float32, shape=(256, 128, 1, 1)],
 573 [dtype=float32, shape=(256,)],
 575 [dtype=float32, shape=(256, 256, 3, 3)],
 576 [dtype=float32, shape=(256,)],
 578 [dtype=float32, shape=(256, 256, 1, 1)],
 579 [dtype=fl

In [3]:
# Parse the ONNX file into a TensorRT network. The builder and network are
# reused by both engine builds further down.
(builder, network, parser) = network_from_onnx_path(ONNX_MODEL)

In [4]:
# Record the tactics chosen during this build to trt_tactics.json so the same
# build can be replayed later (see the commented recipe at the end of the cell).
recorder = TacticRecorder("trt_tactics.json")

# One optimization profile for the dynamic batch dimension of input_0:
# batch sizes 1..16 are allowed and 8 is the size the build is tuned for.
optimization_profiles = [
    Profile().add("input_0", min=(1, 3, 32, 1024), opt=(8, 3, 32, 1024), max=(16, 3, 32, 1024))
]

# FP16 build with no per-layer precision constraints (strict_types unset).
trt_config = CreateConfig(max_workspace_size=4096000000,
                          tf32=None,
                          fp16=True,
                          int8=None,
                          profiles=optimization_profiles,
                          calibrator=None,
                          strict_types=None,
                          load_timing_cache=None,
                          algorithm_selector=recorder,
                          sparse_weights=None,
                          # Named enums instead of the raw values 0, 1, 2.
                          tactic_sources=[trt.TacticSource.CUBLAS,
                                          trt.TacticSource.CUBLAS_LT,
                                          trt.TacticSource.CUDNN],
                          restricted=None)

engine = engine_from_network((builder, network), config=trt_config)
# Left as the last expression so the notebook displays the returned engine.
save_engine(engine, "crnn_fp16.plan")

# To ensure the reproducibility of the engine build, rebuild with a
# TacticReplayer that reads the file written by the recorder above. Every other
# setting (including the workspace size) must match the recording build.

# replayer = TacticReplayer("trt_tactics.json")
# optimization_profiles = [Profile().add("input_0", min=(1,3,32,1024), opt=(8,3,32,1024), max=(16,3,32,1024))]
# trt_config = CreateConfig(max_workspace_size=4096000000,
#                          tf32=None,
#                          fp16=True,
#                          int8=None,
#                          profiles=optimization_profiles,
#                          calibrator=None,
#                          strict_types=None,
#                          load_timing_cache=None,
#                          algorithm_selector=replayer,
#                          sparse_weights=None,
#                          tactic_sources=[trt.TacticSource.CUBLAS,
#                                          trt.TacticSource.CUBLAS_LT,
#                                          trt.TacticSource.CUDNN],
#                          restricted=None)

[I]     Configuring with profiles: [Profile().add(input_0, min=(1, 3, 32, 1024), opt=(8, 3, 32, 1024), max=(16, 3, 32, 1024))]
[38;5;12m[I] Building engine with configuration:
    Workspace            | 4096000000 bytes (3906.25 MiB)
    Precision            | TF32: False, FP16: True, INT8: False, Strict Types: False
    Tactic Sources       | ['CUBLAS', 'CUBLAS_LT', 'CUDNN']
    Safety Restricted   : False
    Profiles             | 1 profile(s)[0m
[I] Saving tactic replay file to trt_tactics.json
[38;5;10m[I] Finished engine building in 164.395 seconds[0m
[I] Saving engine to crnn_fp16.plan


<tensorrt.tensorrt.ICudaEngine at 0x7f97b81b9f30>

In [5]:
# Run inference with comparator

# Describe the single model input so the DataLoader can generate data for it.
# The batch size of 16 matches the maximum of the optimization profile used
# for the engine build. (numpy is imported in the notebook's top import cell.)
meta = TensorMetadata()
meta.add("input_0", np.float32, [16, 3, 32, 1024])
loader = DataLoader(input_metadata=meta)

onnx_session = session_from_onnx(ONNX_MODEL)

# TensorRT engine first, ONNX Runtime second; results come back in this order.
runners = [
    TrtRunner(engine),
    OnnxrtRunner(onnx_session),
]

# The generated inputs are saved to inputs.json so a later run can replay
# exactly the same data.
run_results = Comparator.run(runners, data_loader=loader, save_inputs_path="inputs.json")


[I] Will generate inference input data according to provided TensorMetadata: {input_0 [dtype=float32, shape=(16, 3, 32, 1024)]}
[38;5;12m[I] trt-runner-N0-07/23/21-16:58:58     | Activating and starting inference[0m
[I] trt-runner-N0-07/23/21-16:58:58    
    ---- Model Input(s) ----
    {input_0 [dtype=float32, shape=(-1, 3, 32, 1024)]}
[I] Saving inference input data to inputs.json
[I] trt-runner-N0-07/23/21-16:58:58    
    ---- Model Output(s) ----
    {output_0 [dtype=float32, shape=(16, 256, 8103)]}
[38;5;10m[I] trt-runner-N0-07/23/21-16:58:58     | Completed 1 iteration(s) in 55.41 ms | Average inference time: 55.41 ms.[0m
[38;5;12m[I] onnxrt-runner-N0-07/23/21-16:58:58  | Activating and starting inference[0m
[I] onnxrt-runner-N0-07/23/21-16:58:58 
    ---- Model Input(s) ----
    {input_0 [dtype=float32, shape=('batch', 3, 32, 1024)]}
[I] onnxrt-runner-N0-07/23/21-16:58:58 
    ---- Model Output(s) ----
    {output_0 [dtype=float32, shape=(16, 256, 8103)]}
[38;5;10m[I] o

In [6]:
# Compare results across runners

# Shape-checked comparison that judges each output by its maximum error.
# NOTE(review): rtol=10 is a very loose relative tolerance -- confirm that it
# is intended and not a placeholder.
compare = CompareFunc.basic_compare_func(
    check_shapes=True,
    rtol=10,
    atol=0.15,
    fail_fast=None,
    check_error_stat='max',
)

accuracy_result = Comparator.compare_accuracy(
    run_results,
    compare_func=compare,
)

[38;5;12m[I] Accuracy Comparison | trt-runner-N0-07/23/21-16:58:58 vs. onnxrt-runner-N0-07/23/21-16:58:58[0m
[38;5;12m[I]     Comparing Output: 'output_0' (dtype=float32, shape=(16, 256, 8103)) with 'output_0' (dtype=float32, shape=(16, 256, 8103)) | Tolerance: [abs=0.15, rel=10] | Checking max error[0m
[I]         trt-runner-N0-07/23/21-16:58:58: output_0 | Stats: mean=0.00012341, std-dev=0.009961, var=9.9221e-05, median=0, min=0 at (0, 0, 5), max=1 at (0, 128, 0)
[I]             ---- Histogram ----
                Bin Range  |  Num Elems | Visualization
                (0  , 0.1) |   33185792 | ########################################
                (0.1, 0.2) |          2 | 
                (0.2, 0.3) |         16 | 
                (0.3, 0.4) |         54 | 
                (0.4, 0.5) |        145 | 
                (0.5, 0.6) |        173 | 
                (0.6, 0.7) |        138 | 
                (0.7, 0.8) |        131 | 
                (0.8, 0.9) |        536 | 
       

In [16]:
# Pin the tail of the network to fp32 and rebuild the engine.
# (tensorrt is imported as `trt` in the notebook's top import cell.)

# Layer types that are skipped, i.e. left at whatever precision the builder picks.
# NOTE(review): presumably these are shape/index or pure data-movement layers
# for which forcing fp32 is unnecessary -- confirm.
EXCLUDE_LAYERS = [
    trt.LayerType.SHAPE,
    trt.LayerType.CONSTANT,
    trt.LayerType.CONCATENATION,
    trt.LayerType.GATHER,
    trt.LayerType.SLICE,
    trt.LayerType.SHUFFLE,
]

# Index of the first layer considered for fp32; every layer from here to the
# end of the network is visited.
# NOTE(review): 256 is specific to this model's layer numbering -- re-derive it
# if the ONNX graph changes.
FIRST_FP32_LAYER = 256

num_layer = network.num_layers
# Visit the layers from the last one back down to FIRST_FP32_LAYER.
for layer_index in reversed(range(FIRST_FP32_LAYER, num_layer)):
    layer = network.get_layer(layer_index)
    if layer.type not in EXCLUDE_LAYERS:
        print("setting layer_{} to fp32".format(layer_index))
        layer.reset_precision()
        layer.precision = trt.float32
        # Only the first output's type is constrained.
        layer.set_output_type(0, trt.float32)

# FP16 stays enabled for the rest of the network; strict_types=True asks
# TensorRT to honor the per-layer precisions set above. No algorithm selector
# is attached, so this build's tactics are not recorded.
trt_config = CreateConfig(max_workspace_size=4096000000,
                          tf32=None,
                          fp16=True,
                          int8=None,
                          profiles=optimization_profiles,
                          calibrator=None,
                          strict_types=True,
                          load_timing_cache=None,
                          algorithm_selector=None,
                          sparse_weights=None,
                          # Named enums instead of the raw values 0, 1, 2.
                          tactic_sources=[trt.TacticSource.CUBLAS,
                                          trt.TacticSource.CUBLAS_LT,
                                          trt.TacticSource.CUDNN],
                          restricted=None)

engine_v2 = engine_from_network((builder, network), config=trt_config)
# Left as the last expression so the notebook displays the returned engine.
save_engine(engine_v2, "crnn_fp16_v2.plan")

setting layer_264 to fp32
setting layer_260 to fp32
[I]     Configuring with profiles: [Profile().add(input_0, min=(1, 3, 32, 1024), opt=(8, 3, 32, 1024), max=(16, 3, 32, 1024))]
[38;5;12m[I] Building engine with configuration:
    Workspace            | 4096000000 bytes (3906.25 MiB)
    Precision            | TF32: False, FP16: True, INT8: False, Strict Types: True
    Tactic Sources       | ['CUBLAS', 'CUBLAS_LT', 'CUDNN']
    Safety Restricted   : False
    Profiles             | 1 profile(s)[0m
[38;5;10m[I] Finished engine building in 99.156 seconds[0m
[I] Saving engine to crnn_fp16_v2.plan


<tensorrt.tensorrt.ICudaEngine at 0x7f973442a6b0>

In [17]:

# Run the rebuilt engine and ONNX Runtime on the inputs stored in inputs.json
# instead of generating new data.
runners_v2 = [
    TrtRunner(engine_v2),
    OnnxrtRunner(onnx_session),
]

# Flatten the contents of every listed input file into a single list.
loader = [
    entry
    for input_data_path in ['inputs.json']
    for entry in load_json(input_data_path, description='input data')
]

run_results_v2 = Comparator.run(runners_v2, data_loader=loader)

[I] Loading input data from inputs.json
[38;5;12m[I] trt-runner-N4-07/23/21-17:12:21     | Activating and starting inference[0m
[I] trt-runner-N4-07/23/21-17:12:21    
    ---- Model Input(s) ----
    {input_0 [dtype=float32, shape=(-1, 3, 32, 1024)]}
[I] trt-runner-N4-07/23/21-17:12:21    
    ---- Model Output(s) ----
    {output_0 [dtype=float32, shape=(16, 256, 8103)]}
[38;5;10m[I] trt-runner-N4-07/23/21-17:12:21     | Completed 1 iteration(s) in 62.53 ms | Average inference time: 62.53 ms.[0m
[38;5;12m[I] onnxrt-runner-N4-07/23/21-17:12:21  | Activating and starting inference[0m
[I] onnxrt-runner-N4-07/23/21-17:12:21 
    ---- Model Input(s) ----
    {input_0 [dtype=float32, shape=('batch', 3, 32, 1024)]}
[I] onnxrt-runner-N4-07/23/21-17:12:21 
    ---- Model Output(s) ----
    {output_0 [dtype=float32, shape=(16, 256, 8103)]}
[38;5;10m[I] onnxrt-runner-N4-07/23/21-17:12:21  | Completed 1 iteration(s) in 766 ms | Average inference time: 766 ms.[0m


In [18]:
# Score the second run with the comparison function stored in `compare`.
accuracy_result = Comparator.compare_accuracy(
    run_results_v2,
    compare_func=compare,
)

[38;5;12m[I] Accuracy Comparison | trt-runner-N4-07/23/21-17:12:21 vs. onnxrt-runner-N4-07/23/21-17:12:21[0m
[38;5;12m[I]     Comparing Output: 'output_0' (dtype=float32, shape=(16, 256, 8103)) with 'output_0' (dtype=float32, shape=(16, 256, 8103)) | Tolerance: [abs=0.15, rel=10] | Checking max error[0m
[I]         trt-runner-N4-07/23/21-17:12:21: output_0 | Stats: mean=0.00012341, std-dev=0.0099673, var=9.9347e-05, median=0, min=0 at (0, 0, 5), max=1 at (0, 147, 0)
[I]         onnxrt-runner-N4-07/23/21-17:12:21: output_0 | Stats: mean=0.00012341, std-dev=0.0099966, var=9.9931e-05, median=1.6983e-08, min=1.2611e-15 at (13, 98, 5007), max=0.99999 at (13, 98, 0)
[I]         Error Metrics: output_0
[I]             Minimum Required Tolerance: max error | [abs=0.16535] OR [rel=8.1344]
[I]             Absolute Difference | Stats: mean=3.0931e-06, std-dev=0.00021703, var=4.7103e-08, median=1.1644e-08, min=0 at (1, 28, 3290), max=0.16535 at (8, 243, 0)
[I]             Relative Difference |

In [19]:
# Validate NaN and inf
# Kept as the cell's last expression so the returned flag is displayed.
Comparator.validate(
    run_results_v2,
    check_nan=True,
    check_inf=True,
)

[38;5;12m[I] Output Validation | Runners: ['trt-runner-N4-07/23/21-17:12:21', 'onnxrt-runner-N4-07/23/21-17:12:21'][0m
[38;5;12m[I]     trt-runner-N4-07/23/21-17:12:21     | Validating output: output_0 (check_inf=True, check_nan=True)[0m
[I]         mean=0.00012341, std-dev=0.0099673, var=9.9347e-05, median=0, min=0 at (0, 0, 5), max=1 at (0, 147, 0)
[38;5;10m[I]         PASSED | Output: output_0 is valid[0m
[38;5;12m[I]     onnxrt-runner-N4-07/23/21-17:12:21  | Validating output: output_0 (check_inf=True, check_nan=True)[0m
[I]         mean=0.00012341, std-dev=0.0099966, var=9.9931e-05, median=1.6983e-08, min=1.2611e-15 at (13, 98, 5007), max=0.99999 at (13, 98, 0)
[38;5;10m[I]         PASSED | Output: output_0 is valid[0m
[38;5;10m[I]     PASSED | Output Validation[0m


True

In [20]:
# direct access to the run_result of the TRTrunner
# Entry 0 corresponds to the first runner handed to Comparator.run (the
# TrtRunner). NOTE(review): element 1 of that entry is presumably the list of
# per-iteration outputs -- confirm against the Polygraphy RunResults docs.
trt_iterations = run_results_v2[0][1]
# Output tensor 'output_0' from the first iteration.
trt_iterations[0]['output_0']

array([[[9.5947266e-01, 4.1723251e-07, 1.7881393e-07, ...,
         3.5762787e-07, 7.1823597e-05, 5.5968761e-05],
        [9.3115234e-01, 8.9406967e-07, 4.1723251e-07, ...,
         5.9604645e-07, 5.4240227e-05, 3.5381317e-04],
        [8.8964844e-01, 5.4836273e-06, 1.6093254e-06, ...,
         1.6093254e-06, 1.4090538e-04, 1.0423660e-03],
        ...,
        [9.9023438e-01, 2.9802322e-07, 0.0000000e+00, ...,
         0.0000000e+00, 5.4788589e-04, 1.2218952e-04],
        [8.9843750e-01, 4.2915344e-06, 2.3841858e-07, ...,
         2.3841858e-07, 8.1872940e-04, 3.8695335e-04],
        [9.5507812e-01, 2.1457672e-06, 2.9802322e-07, ...,
         1.1920929e-07, 3.2043457e-04, 6.7806244e-04]],

       [[9.7509766e-01, 1.1920929e-07, 5.9604645e-08, ...,
         1.7881393e-07, 2.0921230e-05, 5.6862831e-05],
        [9.6582031e-01, 1.7881393e-07, 1.1920929e-07, ...,
         2.3841858e-07, 2.8312206e-05, 1.8465519e-04],
        [9.4189453e-01, 6.5565109e-07, 2.3841858e-07, ...,
         5.960