In [1]:
from __future__ import division

import os
import sys
import logging
import torch
import numpy as np

from thop import profile
sys.path.append("../")

from utils.darts_utils import create_exp_dir, plot_op, plot_path_width, objective_acc_lat
# Pick the latency timer: prefer the TensorRT implementation and fall back to
# the PyTorch one when the TensorRT variant cannot be imported.
try:
    from utils.darts_utils import compute_latency_ms_tensorrt as compute_latency
    print("use TensorRT for latency test")
except Exception:
    # `except Exception` (rather than a bare `except:`) keeps the best-effort
    # fallback for any import-time failure while still letting
    # KeyboardInterrupt / SystemExit propagate.
    from utils.darts_utils import compute_latency_ms_pytorch as compute_latency
    print("use PyTorch for latency test")


from model_stages_trt import BiSeNet

def main():
    """Load a trained STDC segmentation checkpoint and report its inference speed.

    Seeds NumPy and PyTorch, builds a ``BiSeNet`` model for the preset that is
    currently uncommented in the configuration section, restores its weights
    from the checkpoint directory, moves the model to the GPU and measures its
    latency with ``compute_latency``. The speed is printed and logged as
    frames per second, computed as ``1000 / latency``.

    NOTE(review): the checkpoint root is a hard-coded absolute path; confirm it
    exists on the machine running this notebook.
    """
    print("begin")
    # preparation ################
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    seed = 12345
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    # Configuration ##############
    use_boundary_2 = False
    use_boundary_4 = False
    use_boundary_8 = True
    use_boundary_16 = False
    use_conv_last = False
    n_classes = 19

    # Presets: keep exactly one of the blocks below uncommented.

    # # STDC1Seg-50 250.4FPS on NVIDIA GTX 1080Ti
    # backbone = 'STDCNet813'
    # methodName = 'STDC1-Seg'
    # inputSize = 512
    # inputScale = 50
    # inputDimension = (1, 3, 512, 1024)

    # # STDC1Seg-75 126.7FPS on NVIDIA GTX 1080Ti
    # backbone = 'STDCNet813'
    # methodName = 'STDC1-Seg'
    # inputSize = 768
    # inputScale = 75
    # inputDimension = (1, 3, 768, 1536)

    # # STDC2Seg-50 188.6FPS on NVIDIA GTX 1080Ti
    backbone = 'STDCNet1446'
    methodName = 'train_STDC2-Seg/pths'
    inputSize = 512
    inputScale = 50
    inputDimension = (1, 3, 512, 1024)

    # # STDC2Seg-75 97.0FPS on NVIDIA GTX 1080Ti
    # backbone = 'STDCNet1446'
    # methodName = 'train_STDC2-Seg/pths'
    # inputSize = 768
    # inputScale = 75
    # inputDimension = (1, 3, 768, 1536)

    model = BiSeNet(
        backbone=backbone,
        n_classes=n_classes,
        use_boundary_2=use_boundary_2,
        use_boundary_4=use_boundary_4,
        use_boundary_8=use_boundary_8,
        use_boundary_16=use_boundary_16,
        input_size=inputSize,
        use_conv_last=use_conv_last,
    )

    print('loading parameters...')
    respth = '/xiaoou/STDC-Seg-master/STDC-Seg-master/checkpoint/{}/'.format(methodName)
    save_pth = os.path.join(respth, 'model_maxmIOU{}.pth'.format(inputScale))
    # Deserialize onto the CPU first so loading does not depend on the device
    # the checkpoint was saved from; the model is moved to the GPU right after.
    state_dict = torch.load(save_pth, map_location='cpu')
    model.load_state_dict(state_dict)
    model = model.cuda()
    #####################################################

    latency = compute_latency(model, inputDimension)
    # Build the report once so the printed and the logged line cannot diverge.
    fps_message = "{}{} FPS:".format(methodName, inputScale) + str(1000./latency)
    print(fps_message)
    logging.info(fps_message)

    # calculate FLOPS and params (disabled)
    # model = model.cpu()
    # flops, params = profile(model, inputs=(torch.randn(inputDimension),), verbose=False)
    # print("params = {}MB, FLOPs = {}GB".format(params / 1e6, flops / 1e9))
    # logging.info("params = {}MB, FLOPs = {}GB".format(params / 1e6, flops / 1e9))


# Run the benchmark only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main() 


use TensorRT for latency test
begin
BiSeNet backbone:  STDCNet1446
backbone:  STDCNet1446
loading parameters...


  size_array = [int(s) for s in feat32.size()[2:]]
  size_array = [int(s) for s in feat.size()[2:]]
  size_array = [int(s) for s in feat.size()[2:]]
ONNX's Upsample/Resize operator did not match Pytorch's Interpolation until opset 11. Attributes to determine how to transform the input were added in onnx:Resize in opset 11 to support Pytorch's behavior (like coordinate_transformation_mode and nearest_mode).
We recommend using opset 11 and above for models using this operator. 
  "" + str(_export_onnx_opset_version) + ". "


graph(%input : Float(1, 3, 768, 1536),
      %cp.backbone.features.0.conv.weight : Float(32, 3, 3, 3),
      %cp.backbone.features.0.bn.weight : Float(32),
      %cp.backbone.features.0.bn.bias : Float(32),
      %cp.backbone.features.0.bn.running_mean : Float(32),
      %cp.backbone.features.0.bn.running_var : Float(32),
      %cp.backbone.features.0.bn.num_batches_tracked : Long(),
      %cp.backbone.features.1.conv.weight : Float(64, 32, 3, 3),
      %cp.backbone.features.1.bn.weight : Float(64),
      %cp.backbone.features.1.bn.bias : Float(64),
      %cp.backbone.features.1.bn.running_mean : Float(64),
      %cp.backbone.features.1.bn.running_var : Float(64),
      %cp.backbone.features.1.bn.num_batches_tracked : Long(),
      %cp.backbone.features.2.conv_list.0.conv.weight : Float(128, 64, 1, 1),
      %cp.backbone.features.2.conv_list.0.bn.weight : Float(128),
      %cp.backbone.features.2.conv_list.0.bn.bias : Float(128),
      %cp.backbone.features.2.conv_list.0.bn.running_mea

100%|██████████| 294/294 [00:06<00:00, 48.95it/s]

MAX_BATCH_SIZE:  1
train_STDC2-Seg/pths75 FPS:48.89856866671484





In [1]:
from __future__ import division

import os
import sys
import logging
import torch
import numpy as np

from thop import profile
sys.path.append("../")

from utils.darts_utils import create_exp_dir, plot_op, plot_path_width, objective_acc_lat
# Pick the latency timer: prefer the TensorRT implementation and fall back to
# the PyTorch one when the TensorRT variant cannot be imported.
try:
    from utils.darts_utils import compute_latency_ms_tensorrt as compute_latency
    print("use TensorRT for latency test")
except Exception:
    # `except Exception` (rather than a bare `except:`) keeps the best-effort
    # fallback for any import-time failure while still letting
    # KeyboardInterrupt / SystemExit propagate.
    from utils.darts_utils import compute_latency_ms_pytorch as compute_latency
    print("use PyTorch for latency test")


from model_stages_trt import BiSeNet

def main():
    """Load a trained STDC segmentation checkpoint and report its inference speed.

    Seeds NumPy and PyTorch, builds a ``BiSeNet`` model for the preset that is
    currently uncommented in the configuration section, restores its weights
    from the checkpoint directory, moves the model to the GPU and measures its
    latency with ``compute_latency``. The speed is printed and logged as
    frames per second, computed as ``1000 / latency``.

    NOTE(review): the checkpoint root is a hard-coded absolute path; confirm it
    exists on the machine running this notebook.
    """
    print("begin")
    # preparation ################
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    seed = 12345
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    # Configuration ##############
    use_boundary_2 = False
    use_boundary_4 = False
    use_boundary_8 = True
    use_boundary_16 = False
    use_conv_last = False
    n_classes = 19

    # Presets: keep exactly one of the blocks below uncommented.

    # # STDC1Seg-50 250.4FPS on NVIDIA GTX 1080Ti
    # backbone = 'STDCNet813'
    # methodName = 'STDC1-Seg'
    # inputSize = 512
    # inputScale = 50
    # inputDimension = (1, 3, 512, 1024)

    # # STDC1Seg-75 126.7FPS on NVIDIA GTX 1080Ti
    # backbone = 'STDCNet813'
    # methodName = 'STDC1-Seg'
    # inputSize = 768
    # inputScale = 75
    # inputDimension = (1, 3, 768, 1536)

    # # STDC2Seg-50 188.6FPS on NVIDIA GTX 1080Ti
    backbone = 'STDCNet1446'
    methodName = 'train_STDC2-Seg/pths'
    inputSize = 512
    inputScale = 50
    inputDimension = (1, 3, 512, 1024)

    # # STDC2Seg-75 97.0FPS on NVIDIA GTX 1080Ti
    # backbone = 'STDCNet1446'
    # methodName = 'train_STDC2-Seg/pths'
    # inputSize = 768
    # inputScale = 75
    # inputDimension = (1, 3, 768, 1536)

    model = BiSeNet(
        backbone=backbone,
        n_classes=n_classes,
        use_boundary_2=use_boundary_2,
        use_boundary_4=use_boundary_4,
        use_boundary_8=use_boundary_8,
        use_boundary_16=use_boundary_16,
        input_size=inputSize,
        use_conv_last=use_conv_last,
    )

    print('loading parameters...')
    respth = '/xiaoou/STDC-Seg-master/STDC-Seg-master/checkpoint/{}/'.format(methodName)
    save_pth = os.path.join(respth, 'model_maxmIOU{}.pth'.format(inputScale))
    # Deserialize onto the CPU first so loading does not depend on the device
    # the checkpoint was saved from; the model is moved to the GPU right after.
    state_dict = torch.load(save_pth, map_location='cpu')
    model.load_state_dict(state_dict)
    model = model.cuda()
    #####################################################

    latency = compute_latency(model, inputDimension)
    # Build the report once so the printed and the logged line cannot diverge.
    fps_message = "{}{} FPS:".format(methodName, inputScale) + str(1000./latency)
    print(fps_message)
    logging.info(fps_message)

    # calculate FLOPS and params (disabled)
    # model = model.cpu()
    # flops, params = profile(model, inputs=(torch.randn(inputDimension),), verbose=False)
    # print("params = {}MB, FLOPs = {}GB".format(params / 1e6, flops / 1e9))
    # logging.info("params = {}MB, FLOPs = {}GB".format(params / 1e6, flops / 1e9))


# Run the benchmark only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main() 


RuntimeError: cuInit failed: no CUDA-capable device is detected

In [17]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit


# Batch size used for the builder limit, the random input and every execute() call.
MAX_BATCH_SIZE = 1
# Value assigned to builder.max_workspace_size (1 << 30 == 2**30).
MAX_WORKSPACE_SIZE = 1 << 30
# Logger handed to both the TensorRT builder and the ONNX parser.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# TensorRT element type; converted with trt.nptype() for host buffers and inputs.
DTYPE = trt.float32

# Model
# NOTE(review): INPUT_NAME / OUTPUT_NAME are not referenced by the code in this
# cell; the ONNX export passes the literals "input" / "output" directly.
INPUT_NAME = 'input'
OUTPUT_NAME = 'output'

def allocate_buffers(engine):
    """Allocate host and device buffers for the engine's first two bindings.

    Binding 0 is used for the input and binding 1 for the output. Each host
    buffer is page-locked and holds ``volume(binding shape) * max_batch_size``
    elements of ``DTYPE``; each device buffer has the same size in bytes.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output)``.
    """
    def _pagelocked_for(binding_index):
        # One flat page-locked array sized for a full batch of this binding.
        element_count = trt.volume(engine.get_binding_shape(binding_index)) * engine.max_batch_size
        return cuda.pagelocked_empty(element_count, dtype=trt.nptype(DTYPE))

    host_in = _pagelocked_for(0)
    host_out = _pagelocked_for(1)
    device_in = cuda.mem_alloc(host_in.nbytes)
    device_out = cuda.mem_alloc(host_out.nbytes)
    return host_in, device_in, host_out, device_out


def build_engine(model_file):
    """Parse an ONNX file and build a TensorRT CUDA engine from it.

    Args:
        model_file: Path of the serialized ONNX model to read.

    Returns:
        Whatever ``builder.build_cuda_engine`` returns for the parsed network.

    Raises:
        RuntimeError: If the ONNX parser rejects the model. The message lists
            the errors reported by the parser.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network() as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = MAX_WORKSPACE_SIZE
        builder.max_batch_size = MAX_BATCH_SIZE
        with open(model_file, 'rb') as model:
            parsed = parser.parse(model.read())
        if not parsed:
            # Surface the parser diagnostics instead of silently building an
            # engine from an incomplete network.
            details = "; ".join(str(parser.get_error(i)) for i in range(parser.num_errors))
            raise RuntimeError(
                "failed to parse ONNX model {!r}: {}".format(model_file, details or "no parser error reported")
            )
        engine = builder.build_cuda_engine(network)
    return engine
            
        


def load_input(input_size, host_buffer):
    """Fill ``host_buffer`` with a flattened batch of random input data.

    Args:
        input_size: Four-element size ``(batch, channels, height, width)``.
            The batch entry is ignored; ``MAX_BATCH_SIZE`` is used instead.
        host_buffer: Destination array, overwritten in place via ``np.copyto``.
    """
    assert len(input_size) == 4
    _, channels, height, width = input_size
    element_type = trt.nptype(DTYPE)
    random_batch = np.random.randn(MAX_BATCH_SIZE, channels, height, width)
    flattened = random_batch.astype(element_type).ravel()
    np.copyto(host_buffer, flattened)
    
def do_inference(context, h_input, d_input, h_output, d_output, iterations=None):
    """Time repeated executions of a TensorRT context and return the mean latency.

    Args:
        context: Execution context; its ``execute(batch_size=..., bindings=...)``
            method is called for every iteration.
        h_input: Host buffer copied to ``d_input`` once before timing starts.
        d_input: Device input allocation (converted with ``int()`` for bindings).
        h_output: Unused here; results are never copied back to the host.
        d_output: Device output allocation (converted with ``int()`` for bindings).
        iterations: Number of timed executions. When ``None`` the count is
            calibrated so that the timed run lasts roughly three seconds.

    Returns:
        Mean wall-clock time per ``execute`` call, in milliseconds.

    Note:
        ``cuda``, ``tqdm`` and ``MAX_BATCH_SIZE`` must be available at module
        scope; ``tqdm`` in particular is not imported by this cell.
    """
    # Imported locally because no cell in this notebook imports `time`.
    import time

    bindings = [int(d_input), int(d_output)]

    # Transfer input data to the GPU.
    cuda.memcpy_htod(d_input, h_input)
    # warm-up
    for _ in range(10):
        context.execute(batch_size=MAX_BATCH_SIZE, bindings=bindings)
    # test proper iterations
    if iterations is None:
        trial_iterations = 100
        while True:
            t_start = time.time()
            for _ in range(trial_iterations):
                context.execute(batch_size=MAX_BATCH_SIZE, bindings=bindings)
            elapsed_time = time.time() - t_start
            if elapsed_time >= 1:
                break
            trial_iterations *= 2
        # Use the count that was actually timed; doubling it first (as the
        # previous code did) overestimates the rate by a factor of two.
        FPS = trial_iterations / elapsed_time
        # At least one iteration, so the division below can never be by zero.
        iterations = max(1, int(FPS * 3))
    # Run inference.
    t_start = time.time()
    for _ in tqdm(range(iterations)):
        context.execute(batch_size=MAX_BATCH_SIZE, bindings=bindings)
    elapsed_time = time.time() - t_start
    latency = elapsed_time / iterations * 1000
    return latency


def compute_latency_ms_tensorrt(model, input_size, iterations=None):
    """Export ``model`` to ONNX, build a TensorRT engine and time its inference.

    Args:
        model: PyTorch module; moved to the GPU and put in eval mode.
        input_size: Four-element size ``(batch, channels, height, width)``.
            The batch entry is ignored in favour of ``MAX_BATCH_SIZE``.
        iterations: Forwarded to ``do_inference``.

    Returns:
        The latency reported by ``do_inference`` divided by ``MAX_BATCH_SIZE``.

    Side effects:
        Writes ``model.onnx`` to the current working directory and prints the
        engine object and the batch size.
    """
    # print('input_size: ', input_size)
    onnx_path = "model.onnx"
    model = model.cuda()
    model.eval()
    _, channels, height, width = input_size
    dummy_input = torch.randn(MAX_BATCH_SIZE, channels, height, width, device='cuda')
    export_options = dict(
        verbose=True,
        input_names=["input"],
        output_names=["output"],
        export_params=True,
    )
    torch.onnx.export(model, dummy_input, onnx_path, **export_options)

    with build_engine(onnx_path) as engine:
        print('engine', engine)
        host_in, device_in, host_out, device_out = allocate_buffers(engine)
        load_input(input_size, host_in)
        with engine.create_execution_context() as context:
            per_batch_latency = do_inference(
                context, host_in, device_in, host_out, device_out, iterations=iterations
            )
    # FPS = 1000 / latency (in ms)
    print('MAX_BATCH_SIZE: ', MAX_BATCH_SIZE)
    return per_batch_latency / MAX_BATCH_SIZE

In [10]:
# def build_engine(model_file):
# #     TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
#     with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network,trt.OnnxParser(network, TRT_LOGGER) as parser:
#         builder.max_workspace_size = MAX_WORKSPACE_SIZE
#         builder.max_batch_size = MAX_BATCH_SIZE
#         with open(model_file, 'rb') as model:
#             parser.parse(model.read())
#             engine = builder.build_cuda_engine(network)
#     return engine
# with build_engine("model.onnx") as engine:
#     print('engine', engine)

In [13]:
import tensorrt as trt
# Logger handed to both the TensorRT builder and the ONNX parser.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
# Value assigned to builder.max_workspace_size (1 << 30 == 2**30).
MAX_WORKSPACE_SIZE = 1 << 30
# Value assigned to builder.max_batch_size.
MAX_BATCH_SIZE = 1
# TensorRT element type.
DTYPE = trt.float32
# NOTE(review): INPUT_NAME / OUTPUT_NAME are not referenced by the code in this cell.
INPUT_NAME = 'input'
OUTPUT_NAME = 'output'
# with trt.Builder(TRT_LOGGER) as builder:
#     builder.max_workspace_size = MAX_WORKSPACE_SIZE
# Bit mask with the EXPLICIT_BATCH creation flag set; passed to
# builder.create_network() by build_engine. `(int)(...)` is a plain int() call.
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
def build_engine(model_file):
    """Parse an ONNX file and build an explicit-batch TensorRT CUDA engine.

    The network is created with the ``EXPLICIT_BATCH`` flag.

    Args:
        model_file: Path of the serialized ONNX model to read.

    Returns:
        Whatever ``builder.build_cuda_engine`` returns for the parsed network.

    Raises:
        RuntimeError: If the ONNX parser rejects the model. The message lists
            the errors reported by the parser.
    """
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(EXPLICIT_BATCH) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = MAX_WORKSPACE_SIZE
        builder.max_batch_size = MAX_BATCH_SIZE
        with open(model_file, 'rb') as model:
            parsed = parser.parse(model.read())
        if not parsed:
            # Surface the parser diagnostics instead of silently building an
            # engine from an incomplete network.
            details = "; ".join(str(parser.get_error(i)) for i in range(parser.num_errors))
            raise RuntimeError(
                "failed to parse ONNX model {!r}: {}".format(model_file, details or "no parser error reported")
            )
        engine = builder.build_cuda_engine(network)
    return engine

In [14]:
import pycuda.driver as cuda

In [15]:
# Build an engine from "model.onnx" in the current working directory
# (the file name compute_latency_ms_tensorrt exports to).
engine = build_engine("model.onnx")
# h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0))* engine.max_batch_size, dtype=trt.nptype(DTYPE))

In [16]:
# Inspect what build_engine returned (presumably to confirm it is an engine and not None).
type(engine)

tensorrt.tensorrt.ICudaEngine