In [None]:
from torchvision import models
import cv2
import torch
from torchvision.transforms import Resize, Compose, ToTensor, Normalize
import onnx

In [2]:
def preprocess_image(img_path):
    # transformations for the input data
    transforms = Compose([
        ToTensor(),
        Resize(224),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # read input image
    input_img = cv2.imread(img_path)
    # do transformations
    input_data = transforms(input_img)
    batch_data = torch.unsqueeze(input_data, 0)
    return batch_data

def postprocess(output_data):
    # get class names
    with open("imagenet_classes.txt") as f:
        classes = [line.strip() for line in f.readlines()]
    # calculate human-readable value by softmax
    confidences = torch.nn.functional.softmax(output_data, dim=1)[0] * 100
    # find top predicted classes
    _, indices = torch.sort(output_data, descending=True)
    i = 0
    # print the top classes predicted by the model
    while confidences[indices[0][i]] > 0.5:
        class_idx = indices[0][i]
        print(
            "class:",
            classes[class_idx],
            ", confidence:",
            confidences[class_idx].item(),
            "%, index:",
            class_idx.item(),
        )
        i += 1


In [3]:
input = preprocess_image("turkish_coffee.jpg").cuda()
model = models.resnet50(pretrained=True)
model.eval()
model.cuda()
output = model(input)

postprocess(output)


class: cup , confidence: 94.98273468017578 %, index: 968
class: espresso , confidence: 3.9471163749694824 %, index: 967
class: coffee mug , confidence: 0.6194139122962952 %, index: 504


In [5]:
ONNX_FILE_PATH = 'resnet50.onnx'
torch.onnx.export(model, input, ONNX_FILE_PATH, input_names=['input'],
                  output_names=['output'], export_params=True)

In [6]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt


#### 1. Create Builder
To create a builder, you must first create a logger. Then use the logger to create the builder.

In [7]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)

[02/28/2023-12:42:52] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars


#### 2. Create Network
After the builder has been created, the first step in optimizing a model is to create a network definition.
The EXPLICIT_BATCH flag is required in order to import models using the ONNX parser.

In [8]:
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

#### 3. Import model using ONNX Parser
Now, the network definition must be populated from the ONNX representation. You can create an ONNX parser to populate the network as follows:


In [9]:
parser = trt.OnnxParser(network, TRT_LOGGER)
success = parser.parse_from_file(ONNX_FILE_PATH)
for idx in range(parser.num_errors):
    print(parser.get_error(idx))

#### 4. Building an engine
The next step is to create a build configuration specifying how TensorRT should optimize the model. This interface has many properties that you can set in order to control how TensorRT optimizes the network. One important property is the maximum workspace size. Layer implementations often require a temporary workspace, and this parameter limits the maximum size that any layer in the network can use. If insufficient workspace is provided, it is possible that TensorRT will not be able to find an implementation for a layer:

In [10]:
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 22) # 1 MiB

After the configuration has been specified, the engine can be built and serialized with:

In [11]:
serialized_engine = builder.build_serialized_network(network, config)

It may be useful to save the engine to a file for future use. You can do that like so:

In [12]:
with open("sample.engine", "wb") as f:
    f.write(serialized_engine)

#### 5. Deserialize an Engine
To perform inference, deserialize the engine using the Runtime interface. Like the builder, the runtime requires an instance of the logger.

In [13]:
runtime = trt.Runtime(TRT_LOGGER)

First load the engine from a file. Then deserialize the engine from a memory buffer:

In [14]:
with open("sample.engine", "rb") as f:
    serialized_engine = f.read()

In [15]:
engine = runtime.deserialize_cuda_engine(serialized_engine)

#### 6. Performing Inference

The engine holds the optimized model, but to perform inference requires additional state for intermediate activations. An engine can have multiple execution contexts, allowing one set of weights to be used for multiple overlapping inference tasks. 

In [16]:
context = engine.create_execution_context()

[02/28/2023-12:43:10] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage. See `CUDA_MODULE_LOADING` in https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars


Allocate some host and device buffers for inputs and outputs:

In [17]:
# Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs.
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)
# Allocate device memory for inputs and outputs.
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()

  h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
  h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)


In [28]:
host_input = np.array(preprocess_image("turkish_coffee.jpg").numpy(), dtype=np.float32, order='C')
# Transfer input data to the GPU.
cuda.memcpy_htod_async(d_input, host_input, stream)
# Run inference.
context. execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
# Transfer predictions back from the GPU.
cuda.memcpy_dtoh_async(h_output, d_output, stream)
# Synchronize the stream
stream.synchronize()

Create some space to store intermediate activation values. Since the engine holds the network definition and trained parameters, additional space is necessary. 

In [29]:
output_data = torch.Tensor(h_output).unsqueeze(0)

In [31]:
postprocess(output_data)

class: cup , confidence: 94.98615264892578 %, index: 968
class: espresso , confidence: 3.944007635116577 %, index: 967
class: coffee mug , confidence: 0.6195164918899536 %, index: 504
