In [1]:
from torchvision import models
import cv2
import torch
from torchvision.transforms import Resize, Compose, ToTensor, Normalize

In [2]:
def preprocess_image(img_path):
    # transformations for the input data
    transforms = Compose([
        ToTensor(),
        Resize(224),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # read input image
    input_img = cv2.imread(img_path)
    # do transformations
    input_data = transforms(input_img)
    batch_data = torch.unsqueeze(input_data, 0)
    return batch_data

def postprocess(output_data):
    # get class names
    with open("../data/imagenet-classes.txt") as f:
        classes = [line.strip() for line in f.readlines()]
    # calculate human-readable value by softmax
    confidences = torch.nn.functional.softmax(output_data, dim=1)[0] * 100
    # find top predicted classes
    _, indices = torch.sort(output_data, descending=True)
    i = 0
    # print the top classes predicted by the model
    while confidences[indices[0][i]] > 50:
        class_idx = indices[0][i]
        print(
            "class:",
            classes[class_idx],
            ", confidence:",
            confidences[class_idx].item(),
            "%, index:",
            class_idx.item(),
        )
        i += 1

In [3]:
input = preprocess_image("../data/hotdog.jpg").cuda()
model = models.resnet50(pretrained=True)
model.eval()
model.cuda()
output = model(input)

postprocess(output)



class: hotdog, hot dog, red hot , confidence: 60.50566864013672 %, index: 934


In [8]:
ONNX_FILE_PATH = '../deploy_tools/mono3d.onnx'
# torch.onnx.export(model, input, ONNX_FILE_PATH, input_names=['input'],
#                   output_names=['output'], export_params=True)



In [3]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt


In [4]:
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)


[06/09/2023-22:11:35] [TRT] [W] CUDA lazy loading is not enabled. Enabling it can significantly reduce device memory usage and speed up TensorRT initialization. See "Lazy Loading" section of CUDA documentation https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#lazy-loading


In [5]:
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

In [9]:
parser = trt.OnnxParser(network, TRT_LOGGER)
success = parser.parse_from_file(ONNX_FILE_PATH)
for idx in range(parser.num_errors):
    print(parser.get_error(idx))


[06/09/2023-22:12:35] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[06/09/2023-22:12:35] [TRT] [W] onnx2trt_utils.cpp:400: One or more weights outside the range of INT32 was clamped
[06/09/2023-22:12:35] [TRT] [W] Tensor DataType is determined at build time for tensors not marked as input or output.


In [7]:
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 22) # 1 MiB


In [8]:
serialized_engine = builder.build_serialized_network(network, config)

In [11]:
with open("../deploy_tools/mono3d.engine", "wb") as f:
    f.write(serialized_engine)

In [10]:
runtime = trt.Runtime(TRT_LOGGER)

In [11]:
with open("../deploy_tools/mono3d.engine", "rb") as f:
    serialized_engine = f.read()


In [12]:
engine = runtime.deserialize_cuda_engine(serialized_engine)

In [13]:
engine

<tensorrt.tensorrt.ICudaEngine at 0x7fcb8c1162f0>

In [14]:
context = engine.create_execution_context()

[06/09/2023-22:13:27] [TRT] [E] 1: Unexpected exception vector<bool>::_M_range_check: __n (which is 0) >= this->size() (which is 0)


In [16]:
# Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs.
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)
# Allocate device memory for inputs and outputs.
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()

  h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
  h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)


In [17]:
host_input = np.array(preprocess_image("../data/hotdog.jpg").numpy(), dtype=np.float32, order='C')
# Transfer input data to the GPU.
cuda.memcpy_htod_async(d_input, host_input, stream)
# Run inference.
context. execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
# Transfer predictions back from the GPU.
cuda.memcpy_dtoh_async(h_output, d_output, stream)
# Synchronize the stream
stream.synchronize()


In [18]:
output_data = torch.Tensor(h_output).unsqueeze(0)

In [19]:
postprocess(output_data)

class: hotdog, hot dog, red hot , confidence: 60.50568771362305 %, index: 934
