## Deploy model in TensorRT (engine)

### Convert

In [1]:
from io import BytesIO
import onnx
import torch
from ultralytics import YOLO
from TensorRT.models.common import PostDetect, optim
import onnxsim

pt_model = 'models/best.pt'

PostDetect.conf_thres = 0.25
PostDetect.iou_thres = 0.65
PostDetect.topk = 100

b = 1
YOLOv8 = YOLO(pt_model)
model = YOLOv8.model.fuse().eval()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
for m in model.modules():
    optim(m)
    m.to(device)
model.to(device)
fake_input = torch.randn((1,3,640,640)).to(device)
for _ in range(2):
    model(fake_input)
save_path = pt_model.replace('.pt', '.onnx')
with BytesIO() as f:
    torch.onnx.export(
        model,
        fake_input,
        f,
        opset_version=11,
        input_names=['images'],
        output_names=['num_dets', 'bboxes', 'scores', 'labels'])
    f.seek(0)
    onnx_model = onnx.load(f)
onnx.checker.check_model(onnx_model)
shapes = [b, 1, b, 100, 4, b, 100, b, 100]
for i in onnx_model.graph.output:
    for j in i.type.tensor_type.shape.dim:
        j.dim_param = str(shapes.pop(0))
try:
    onnx_model, check = onnxsim.simplify(onnx_model)
    assert check, 'assert check failed'
except Exception as e:
    print(f'Simplifier failure: {e}')
onnx.save(onnx_model, save_path)
print(f'ONNX export success, saved as {save_path}')

Model summary (fused): 268 layers, 43607379 parameters, 0 gradients


ONNX export success, saved as models/best.onnx


In [None]:
from TensorRT.models import EngineBuilder

onnx_model = 'models/best.onnx'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

builder = EngineBuilder(onnx_model, device)
builder.seg = True
builder.build(fp16=True,
              input_shape=[1, 3, 640, 640],
              iou_thres=0.65,
              conf_thres=0.25,
              topk=100)

In [3]:
import cv2
import numpy as np
import random
from TensorRT.models.utils import blob, det_postprocess, letterbox
from TensorRT.models.cudart_api import TRTEngine

engine_path = 'models/best.engine'
Engine = TRTEngine(engine_path)
H, W = Engine.inp_info[0].shape[-2:]

cap = cv2.VideoCapture('../vidl.mp4')

if (cap.isOpened()== False):
    print("Error opening video stream or file")

while(cap.isOpened()):
    ret, frame = cap.read()
    if ret == True:
        draw = frame.copy()
        bgr, ratio, dwdh = letterbox(frame, (W, H))
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        tensor = blob(rgb, return_seg=False)
        dwdh = np.array(dwdh * 2, dtype=np.float32)
        tensor = np.ascontiguousarray(tensor)
        # inference
        data = Engine(tensor)
        bboxes, scores, labels = det_postprocess(data)
        bboxes -= dwdh
        bboxes /= ratio

        for (bbox, score, label) in zip(bboxes, scores, labels):
            bbox = bbox.round().astype(np.int32).tolist()
            cls_id = int(label)
            cls = 'face'
            color = (255, 0, 0)
            cv2.rectangle(draw, bbox[:2], bbox[2:], color, 2)
            cv2.putText(draw, f'{cls}:{score:.3f}', (bbox[0], bbox[1] - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.75, [225, 255, 255], thickness=2)
        
        cv2.imshow('video', draw)

    # Press Q on keyboard to  exit
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

[05/22/2023-11:20:09] [TRT] [W] TensorRT was linked against cuDNN 8.9.0 but loaded cuDNN 8.3.2
[05/22/2023-11:20:09] [TRT] [W] TensorRT was linked against cuDNN 8.9.0 but loaded cuDNN 8.3.2


QObject::moveToThread: Current thread (0x563d0c80eb20) is not the object's thread (0x563e3a65ed60).
Cannot move to target thread (0x563d0c80eb20)

QObject::moveToThread: Current thread (0x563d0c80eb20) is not the object's thread (0x563e3a65ed60).
Cannot move to target thread (0x563d0c80eb20)

QObject::moveToThread: Current thread (0x563d0c80eb20) is not the object's thread (0x563e3a65ed60).
Cannot move to target thread (0x563d0c80eb20)

QObject::moveToThread: Current thread (0x563d0c80eb20) is not the object's thread (0x563e3a65ed60).
Cannot move to target thread (0x563d0c80eb20)

QObject::moveToThread: Current thread (0x563d0c80eb20) is not the object's thread (0x563e3a65ed60).
Cannot move to target thread (0x563d0c80eb20)

QObject::moveToThread: Current thread (0x563d0c80eb20) is not the object's thread (0x563e3a65ed60).
Cannot move to target thread (0x563d0c80eb20)

QObject::moveToThread: Current thread (0x563d0c80eb20) is not the object's thread (0x563e3a65ed60).
Cannot move to tar

## Deploy model in Tensorflow (TFlite)

### Convert

In [None]:
from ultralytics import YOLO

YOLOv8 = YOLO('models/best.pt')
model = YOLOv8.model.fuse().eval()

# Input to the model
x = torch.randn(1, 3, 640, 640, requires_grad=True)
torch_out = model(x)

onnx_model = "models/best.onnx"
# Export the model
torch.onnx.export(model,                      # model being run
                  x,                          # model input (or a tuple for multiple inputs)
                  onnx_model,                 # where to save the model (can be a file or file-like object)
                  export_params=True,         # store the trained parameter weights inside the model file
                  opset_version=11,           # the ONNX version to export the model to
                  do_constant_folding=True,   # whether to execute constant folding for optimization
                  input_names = ['images'],   # the model's input names
                  output_names = ['output0'], # the model's output names
                 )
print(f'ONNX export success, saved as {onnx_model}')

In [1]:
import subprocess

onnx_model = 'best.onnx'
tf_model = 'saved_model'

subprocess.run(f"onnx2tf -i {onnx_model} -o {tf_model} -nuo --non_verbose", shell=True)

/bin/sh: 1: onnx2tf: not found


CompletedProcess(args='onnx2tf -i best.onnx -o saved_model -nuo --non_verbose', returncode=127)

In [None]:
import subprocess

onnx_model = 'models/best.onnx'
tf_model = 'models/saved_model'
subprocess.run(f"onnx2tf -i {onnx_model} -o {tf_model} -nuo --non_verbose", shell=True)

### Tensorflow

In [None]:
import cv2
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

def box_iou_batch(
	boxes_a: np.ndarray, boxes_b: np.ndarray
) -> np.ndarray:

    def box_area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    area_a = box_area(boxes_a.T)
    area_b = box_area(boxes_b.T)

    top_left = np.maximum(boxes_a[:, None, :2], boxes_b[:, :2])
    bottom_right = np.minimum(boxes_a[:, None, 2:], boxes_b[:, 2:])

    area_inter = np.prod(
    	np.clip(bottom_right - top_left, a_min=0, a_max=None), 2)
        
    return area_inter / (area_a[:, None] + area_b - area_inter)

def non_max_suppression(
   predictions: np.ndarray, iou_threshold: float = 0.5
) -> np.ndarray:
    rows, columns = predictions.shape

    sort_index = np.flip(predictions[:, 4].argsort())
    predictions = predictions[sort_index]

    boxes = predictions[:, :4]
    categories = predictions[:, 5]
    ious = box_iou_batch(boxes, boxes)
    ious = ious - np.eye(rows)
    # print(ious)

    keep = np.ones(rows, dtype=bool)

    for index, (iou, category) in enumerate(zip(ious, categories)):
        if not keep[index]:
            continue

        condition = (iou > iou_threshold) & (categories == category)
        keep = keep & ~condition

    return keep[sort_index.argsort()]

def xywh2xyxy(x):
    # Convert bounding box (x, y, w, h) to bounding box (x1, y1, x2, y2)
    y = np.copy(x)
    y[..., 0] = x[..., 0] - x[..., 2] / 2
    y[..., 1] = x[..., 1] - x[..., 3] / 2
    y[..., 2] = x[..., 0] + x[..., 2] / 2
    y[..., 3] = x[..., 1] + x[..., 3] / 2
    return y

def letterbox(
    im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32
):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better val mAP)
        r = min(r, 1.0)

    # Compute padding
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding

    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    im = cv2.copyMakeBorder(
        im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color
    )  # add border
    return im, r, (dw, dh)

thresh = 0.7

# Load the TFLite model and allocate tensors.
model = tf.saved_model.load("models/saved_model")

cap = cv2.VideoCapture('video.mp4')

if (cap.isOpened()== False):
    print("Error opening video stream or file")

while(cap.isOpened()):
    ret, frame = cap.read()
    if ret == True:
        image, ratio, dwdh = letterbox(frame, auto=False)
        image = np.expand_dims(image, 0)
        image = np.ascontiguousarray(image)

        im = image.astype(np.float32)
        im /= 255

        predict = np.array(model(im))
        boxes = predict[0].T
        i = 0
        while i < len(boxes):
            if boxes[i][-1] < thresh:
                boxes = np.delete(boxes, i, 0)
                i -= 1
            i += 1
        boxes = xywh2xyxy(np.concatenate((boxes, np.zeros((boxes.shape[0], 1))), axis=1))
        indices = non_max_suppression(boxes, 0.3)
        for i in boxes[indices]:
            bbox = i[:4].copy()
            bbox[0] -= dwdh[0]
            bbox[1] -= dwdh[1]
            bbox[2] -= dwdh[0]
            bbox[3] -= dwdh[1]
            bbox /= ratio
            bbox = bbox.round().astype(np.int32).tolist()
            color = (0,255,0)
            cv2.rectangle(frame, tuple(bbox[:2]), tuple(bbox[2:]), color, 2)

            cv2.putText(frame,
                        f'face:{int(i[4]*100)}', (bbox[0], bbox[1] - 2),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.60, [225, 255, 255],
                        thickness=1)
        cv2.imshow("video", frame)
        # Press Q on keyboard to  exit
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break

# Destroys all the windows created
cv2.destroyAllWindows()

### TFlite

In [27]:
import cv2
import random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

def letterbox(
    im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32
):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better val mAP)
        r = min(r, 1.0)

    # Compute padding
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding

    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    im = cv2.copyMakeBorder(
        im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color
    )  # add border
    return im, r, (dw, dh)

thresh = 0.7

# Load the TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="models/best_saved_model/best_float32.tflite")

#  Allocate tensors.
interpreter.allocate_tensors()
# Get input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Test the model on random input data.
input_shape = input_details[0]["shape"]

cap = cv2.VideoCapture('../vids.mp4')

if (cap.isOpened()== False):
    print("Error opening video stream or file")

while(cap.isOpened()):
    ret, frame = cap.read()
    if ret == True:
        image, ratio, dwdh = letterbox(frame, auto=False)
        image = np.expand_dims(image, 0)
        image = np.ascontiguousarray(image)

        im = image.astype(np.float32)
        im /= 255
        
        interpreter.set_tensor(input_details[0]["index"], im)
        interpreter.invoke()
        # The function get_tensor() returns a copy of the tensor data.
        # Use tensor() in order to get a pointer to the tensor.
        output_data = interpreter.get_tensor(output_details[0]["index"])

        # for i, a in enumerate(output_data):
        #     print(i, a.shape)
        for i, (x0, y0, x1, y1, scores) in enumerate(output_data):
            for j in range(len(x0)):
                if scores[j] >= thresh:
                    box = np.array([x0[j], y0[j], x1[j], y1[j]])
                    box[0] -= dwdh[0]
                    box[1] -= dwdh[1]
                    box[2] -= dwdh[0]
                    box[3] -= dwdh[1]
                    box /= ratio
                    box = box.round().astype(np.int32).tolist()

                    score = round(float(scores[j]), 3)
                    name = "face " + str(score)
                    cv2.rectangle(frame, box[:2], box[2:], (255,0,0), 2)
                    cv2.putText(
                        frame,
                        name,
                        (box[0], box[1] + 20),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.75,
                        [225, 255, 255],
                        thickness=2,
                    )
        # plt.imshow(ori_images[0])
        cv2.imshow("video", frame)
        # Press Q on keyboard to  exit
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break

# Destroys all the windows created
cv2.destroyAllWindows()

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to tar

## Deploy model in ONNXRuntime(ONNX)

In [38]:
def nms(boxes, scores, iou_threshold):
    # Sort by score
    sorted_indices = np.argsort(scores)[::-1]

    keep_boxes = []
    while sorted_indices.size > 0:
        # Pick the last box
        box_id = sorted_indices[0]
        keep_boxes.append(box_id)

        # Compute IoU of the picked box with the rest
        ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])

        # Remove boxes with IoU over the threshold
        keep_indices = np.where(ious < iou_threshold)[0]

        # print(keep_indices.shape, sorted_indices.shape)
        sorted_indices = sorted_indices[keep_indices + 1]

    return keep_boxes

def compute_iou(box, boxes):
    # Compute xmin, ymin, xmax, ymax for both boxes
    xmin = np.maximum(box[0], boxes[:, 0])
    ymin = np.maximum(box[1], boxes[:, 1])
    xmax = np.minimum(box[2], boxes[:, 2])
    ymax = np.minimum(box[3], boxes[:, 3])

    # Compute intersection area
    intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)

    # Compute union area
    box_area = (box[2] - box[0]) * (box[3] - box[1])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union_area = box_area + boxes_area - intersection_area

    # Compute IoU
    iou = intersection_area / union_area

    return iou

def xywh2xyxy(x):
    # Convert bounding box (x, y, w, h) to bounding box (x1, y1, x2, y2)
    y = np.copy(x)
    y[..., 0] = x[..., 0] - x[..., 2] / 2
    y[..., 1] = x[..., 1] - x[..., 3] / 2
    y[..., 2] = x[..., 0] + x[..., 2] / 2
    y[..., 3] = x[..., 1] + x[..., 3] / 2
    return y

In [39]:
import onnxruntime

opt_session = onnxruntime.SessionOptions()
opt_session.enable_mem_pattern = False
opt_session.enable_cpu_mem_arena = False
opt_session.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL

model_path = 'models/best.onnx'
EP_list = ['CPUExecutionProvider','CUDAExecutionProvider']

ort_session = onnxruntime.InferenceSession(model_path, providers=EP_list)

model_inputs = ort_session.get_inputs()
input_names = [model_inputs[i].name for i in range(len(model_inputs))]
input_shape = model_inputs[0].shape

model_output = ort_session.get_outputs()
output_names = [model_output[i].name for i in range(len(model_output))]

2023-05-22 15:53:46.534036252 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:541 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Please reference https://onnxruntime.ai/docs/reference/execution-providers/CUDA-ExecutionProvider.html#requirements to ensure all dependencies are met.


In [42]:
import cv2
import numpy as np
from PIL import Image

cap = cv2.VideoCapture('../vidl.mp4')

if (cap.isOpened()== False):
    print("Error opening video stream or file")

while(cap.isOpened()):
    ret, frame = cap.read()
    if ret == True:
        image_height, image_width = frame.shape[:2]
#         Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        input_height, input_width = input_shape[2:]
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        resized = cv2.resize(image_rgb, (input_width, input_height))

        # Scale input pixel value to 0 to 1
        input_image = resized / 255.0
        input_image = input_image.transpose(2,0,1)
        input_tensor = input_image[np.newaxis, :, :, :].astype(np.float32)

        outputs = ort_session.run(output_names, {input_names[0]: input_tensor})[0]
        predictions = np.squeeze(outputs).T
        conf_thresold = 0.7
        # Filter out object confidence scores below threshold
        scores = np.max(predictions[:, 4:], axis=1)
        predictions = predictions[scores > conf_thresold, :]
        scores = scores[scores > conf_thresold]
        class_ids = np.argmax(predictions[:, 4:], axis=1)

        # Get bounding boxes for each object
        boxes = predictions[:, :4]

        #rescale box
        input_shape = np.array([input_width, input_height, input_width, input_height])
        boxes = np.divide(boxes, input_shape, dtype=np.float32)
        boxes *= np.array([image_width, image_height, image_width, image_height])
        boxes = boxes.astype(np.int32)
        
        indices = nms(boxes, scores, 0.3)
        for (bbox, score, label) in zip(xywh2xyxy(boxes[indices]), scores[indices], class_ids[indices]):
            bbox = bbox.round().astype(np.int32).tolist()
            cls = 'face'
            color = (0,255,0)
            cv2.rectangle(frame, tuple(bbox[:2]), tuple(bbox[2:]), color, 2)
            cv2.putText(frame,
                        f'{cls}:{int(score*100)}', (bbox[0], bbox[1] - 2),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.60, [225, 255, 255],
                        thickness=1)
        
        cv2.imshow("video", frame)
        # Press Q on keyboard to  exit
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break

# Destroys all the windows created
cv2.destroyAllWindows()

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to target thread (0x557398bd53a0)

QObject::moveToThread: Current thread (0x557398bd53a0) is not the object's thread (0x5573b950b8c0).
Cannot move to tar