In [1]:
# Print the TensorRT version as seen by a separate `python3` process started from the shell.
# NOTE(review): `!` runs whichever `python3` is first on PATH, which is not necessarily the
# interpreter backing this kernel -- confirm they share the same environment.
!python3 -c 'import tensorrt; print("TensorRT version: {}".format(tensorrt.__version__))'

TensorRT version: 8.4.2.4


In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

import torchvision
import torch.utils.data as data
import torchvision.transforms as transforms

import os
import numpy as np
import time

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Evaluation settings consumed by the dataset / DataLoader cells below.
LOADER_WORKERS = 4           # passed to DataLoader as num_workers
BATCH_SIZE = 1               # passed to DataLoader as batch_size
IMAGENET_DIR = "/ImageNet"   # passed to torchvision's ImageNet dataset as root

In [4]:
# Path of the serialized TensorRT engine ("plan" file) evaluated in this notebook.
TRT_MODEL_FILE = "../Flask/Models/efficientnet_b0-trt_int8.plan"

# Logger handed to the TensorRT runtime. WARNING is the constructor's default
# minimum severity; it is spelled out here only for readability.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

In [5]:
# Read the serialized engine from disk and deserialize it with a TensorRT runtime.
trt_runtime = trt.Runtime(TRT_LOGGER)
with open(TRT_MODEL_FILE, 'rb') as f:
    engine_data = f.read()
engine = trt_runtime.deserialize_cuda_engine(engine_data)

# deserialize_cuda_engine signals failure by returning None (the details go to the
# logger). Fail here with a clear message instead of hitting an AttributeError on
# None in a later cell.
if engine is None:
    raise RuntimeError(f"Failed to deserialize TensorRT engine from {TRT_MODEL_FILE!r}")

In [6]:
# Iterating the engine yields its binding names; display them as a list.
[binding_name for binding_name in engine]

['input_0', 'output_0']

In [7]:
# Inspect the two bindings: name -> index, index -> name, then dtype and shape per index.
binding_names = ('input_0', 'output_0')
binding_indices = (0, 1)

print(*(engine.get_binding_index(name) for name in binding_names))
print(*(engine.get_binding_name(index) for index in binding_indices))
for index in binding_indices:
    print(engine.get_binding_dtype(index), engine.get_binding_shape(index))

0 1
input_0 output_0
DataType.FLOAT (1, 3, 224, 224)
DataType.FLOAT (1, 1000)


## Inference pipeline

Starting with a deserialized engine, the TensorRT inference pipeline consists of the following steps:

- Create an execution context and specify input shape (based on the image dimensions for inference).
- Allocate CUDA device memory for input and output.
- Allocate CUDA page-locked host memory to efficiently copy back the output.
- Transfer the processed image data into input memory using asynchronous host-to-device CUDA copy.
- Kick off the TensorRT inference pipeline using the asynchronous execute API.
- Transfer the classification output (one score per class) back into page-locked host memory using an asynchronous device-to-host CUDA copy.
- Synchronize the stream used for data transfers and inference execution to ensure all operations are complete.
- Finally, compare the predicted classes with the ground-truth labels to compute top-1 and top-5 accuracy.

In [8]:
# Cache the shape and dtype of each binding (index 0 = input, index 1 = output).
input_shape, input_dtype = engine.get_binding_shape(0), engine.get_binding_dtype(0)
output_shape, output_dtype = engine.get_binding_shape(1), engine.get_binding_dtype(1)

In [9]:
def _allocate_binding_buffers(engine, binding_index):
    """Allocate a host/device buffer pair sized for one engine binding.

    Args:
        engine: Deserialized TensorRT engine queried for the binding's shape and dtype.
        binding_index: Index of the binding to allocate buffers for.

    Returns:
        A ``(host_buffer, device_buffer)`` tuple: a flat page-locked NumPy array with
        one element per tensor element, and a device allocation of the same byte size.

    Raises:
        ValueError: If the binding shape contains a negative dimension, in which case
            a buffer size cannot be derived from it.
    """
    shape = engine.get_binding_shape(binding_index)
    if any(dim < 0 for dim in shape):
        raise ValueError(
            f"Binding {binding_index} has an unresolved shape {tuple(shape)}; "
            "cannot size its buffers"
        )
    element_count = trt.volume(shape)
    element_dtype = trt.nptype(engine.get_binding_dtype(binding_index))
    host_buffer = cuda.pagelocked_empty(element_count, element_dtype)
    device_buffer = cuda.mem_alloc(host_buffer.nbytes)
    return host_buffer, device_buffer


# Binding 0 is the input and binding 1 the output.
input_hmem, input_dmem = _allocate_binding_buffers(engine, 0)
output_hmem, output_dmem = _allocate_binding_buffers(engine, 1)

# Device addresses, listed in binding-index order, as expected by execute_async_v2.
bindings = [int(input_dmem), int(output_dmem)]

context = engine.create_execution_context()
stream = cuda.Stream()

In [10]:
def infer(input_tensor, context, bindings, input_hmem, input_dmem, output_hmem, output_dmem, stream):
    """Run one inference through a TensorRT execution context.

    The input is staged in ``input_hmem``, copied to ``input_dmem``, the engine is
    executed, and the result is copied from ``output_dmem`` into ``output_hmem``.
    All three steps are enqueued on ``stream``, which is synchronized before returning.

    Args:
        input_tensor: Array-like input data; copied into ``input_hmem`` with
            ``np.copyto``, so it must be compatible with that buffer.
        context: Execution context providing ``execute_async_v2``.
        bindings: Device addresses passed to ``execute_async_v2``.
        input_hmem: Host staging buffer for the input.
        input_dmem: Device buffer receiving the input.
        output_hmem: Host buffer receiving the output.
        output_dmem: Device buffer the output is copied from.
        stream: CUDA stream used for the copies and the execution.

    Returns:
        ``output_hmem`` itself (not a copy). Its contents are overwritten by the
        next call that uses the same buffer.

    Raises:
        RuntimeError: If ``execute_async_v2`` reports that the inference could not
            be enqueued.
    """
    np.copyto(input_hmem, input_tensor)
    cuda.memcpy_htod_async(input_dmem, input_hmem, stream)

    enqueued = context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    if not enqueued:
        # Wait for the already-enqueued input copy before reporting the failure, so
        # the caller is free to reuse the host buffer. Without this check a failed
        # enqueue would silently return whatever the output buffer held before.
        stream.synchronize()
        raise RuntimeError("TensorRT execute_async_v2 failed to enqueue inference")

    cuda.memcpy_dtoh_async(output_hmem, output_dmem, stream)
    stream.synchronize()
    return output_hmem

In [11]:
# Preprocessing applied to every image, in order: resize to 256, center-crop to
# 224 (the spatial size of the engine's input binding printed above), convert to a
# tensor, then normalize each channel with the mean/std below.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

In [12]:
# ImageNet validation split, preprocessed with `transform`.
dataset = torchvision.datasets.ImageNet(
    root=IMAGENET_DIR,
    split='val',
    transform=transform,
)

# Iterate the dataset in its stored order (no shuffling).
loader = data.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    num_workers=LOADER_WORKERS,
    shuffle=False,
)

In [13]:
# Evaluate the engine over the whole loader, accumulating top-1 / top-5 hit counts
# and the wall-clock time of each inference call.
#
# NOTE(review): the run recorded in this notebook printed top-1 0.0010 / top-5 0.0051,
# which is chance level for 1000 classes. The cause is not visible in this notebook --
# confirm the engine, the preprocessing and the label mapping before trusting the numbers.
n_top1 = 0
n_top5 = 0
cnt = 0

s = time.time()
pred_tms = []
for images, labels in loader:
    # Labels as a NumPy column vector, so every row of predictions is compared only
    # against its own label (np.isin against the whole batch would also count a hit
    # when a prediction matches some *other* sample's label).
    labels_np = np.asarray(labels).reshape(-1, 1)

    # Time only the inference call and the reshape of its result.
    ss = time.time()
    output = infer(images.ravel(),
                   context,
                   bindings,
                   input_hmem, input_dmem,
                   output_hmem, output_dmem,
                   stream)
    output = output.reshape(output_shape)
    pred_tms.append(time.time()-ss)

    cnt += output.shape[0]

    top1_id = output.argmax(1)
    # argsort is ascending: reverse it, then keep the five highest-scoring classes.
    top5_id = output.argsort(1)[:, ::-1][:, :5]

    n_top1 += np.equal(top1_id, labels_np[:, 0]).sum()
    n_top5 += (top5_id == labels_np).any(axis=1).sum()

    print(f"\rstep: {cnt}/{len(dataset)}", end='')

total_tm = time.time() - s
print()
print(f"top-1:  {n_top1/cnt:0.4f}")
print(f"top-5:  {n_top5/cnt:0.4f}")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Total Time: {total_tm:0.4f} ({total_tm/len(dataset):0.4f})")
print(f"Average Prediction Time: {np.mean(pred_tms):0.4f}")

step: 50000/50000
top-1:  0.0010
top-5:  0.0051
Batch Size: 1
Total Time: 124.2600 (0.0025)
Average Prediction Time: 0.0015
