In [33]:
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

def load_engine(engine_file_path):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open(engine_file_path, 'rb') as f:
        runtime = trt.Runtime(TRT_LOGGER)
        engine = runtime.deserialize_cuda_engine(f.read())
    return engine

def allocate_buffers(engine):
    # Allocate pagelocked host memory for inputs and outputs
    h_inputs = [cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(i)), dtype=np.float32) for i in range(3)]
    h_outputs = [cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(i + 3)), dtype=np.float32) for i in range(3)]

    # Allocate device memory for inputs and outputs
    d_inputs = [cuda.mem_alloc(h.nbytes) for h in h_inputs]
    d_outputs = [cuda.mem_alloc(h.nbytes) for h in h_outputs]

    # Create a CUDA stream for asynchronous execution
    stream = cuda.Stream()

    return h_inputs, h_outputs, d_inputs, d_outputs, stream

def do_inference(context, h_input, h_output, d_input, d_output, stream, batch_size=1):
    cuda.memcpy_htod_async(d_input, h_input, stream)
    context.execute_async(batch_size=batch_size, bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()
    return h_output

def main(engine_file_path, input_data):
    # Load engine
    engine = load_engine(engine_file_path)
    
    # Allocate buffers
    h_input, h_output, d_input, d_output, stream = allocate_buffers(engine)
    
    # Create context
    context = engine.create_execution_context()
    
    # Prepare input data
    np.copyto(h_input, input_data.ravel())
    
    # Perform inference
    output = do_inference(context, h_input, h_output, d_input, d_output, stream)
    
    print("Inference output:", output)




In [38]:
h_input = cuda.pagelocked_empty(trt.volume((16000,32,4)), dtype=np.float32)

In [87]:
cuda.mem_alloc(h_input.nbytes)

<pycuda._driver.DeviceAllocation at 0x758aaafa6020>

In [36]:
engine.get_binding_shape(0)

  engine.get_binding_shape(0)


(-1, 32, 4)

In [34]:
engine = load_engine("deployed_models/hydra/end2end_fp16.engine")

# Inputs 
engine.get_binding_shape(engine.get_binding_index("voxels"))
engine.get_binding_shape(engine.get_binding_index("num_points"))
engine.get_binding_shape(engine.get_binding_index("coors"))

# Outputs
engine.get_binding_shape(engine.get_binding_index("cls_score0"))
engine.get_binding_shape(engine.get_binding_index("bbox_pred0"))
engine.get_binding_shape(engine.get_binding_index("dir_cls_pred0"))

allocate_buffers(engine)

  engine.get_binding_shape(engine.get_binding_index("voxels"))
  engine.get_binding_shape(engine.get_binding_index("voxels"))
  engine.get_binding_shape(engine.get_binding_index("num_points"))
  engine.get_binding_shape(engine.get_binding_index("num_points"))
  engine.get_binding_shape(engine.get_binding_index("coors"))
  engine.get_binding_shape(engine.get_binding_index("coors"))
  engine.get_binding_shape(engine.get_binding_index("cls_score0"))
  engine.get_binding_shape(engine.get_binding_index("cls_score0"))
  engine.get_binding_shape(engine.get_binding_index("bbox_pred0"))
  engine.get_binding_shape(engine.get_binding_index("bbox_pred0"))
  engine.get_binding_shape(engine.get_binding_index("dir_cls_pred0"))
  engine.get_binding_shape(engine.get_binding_index("dir_cls_pred0"))
  h_inputs = [cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(i)), dtype=np.float32) for i in range(3)]


MemoryError: cuMemHostAlloc failed: out of memory