In [1]:
import ast
import time
import warnings

import cv2
import numpy as np
import onnxruntime as ort
import torch, torchvision

In [4]:
# --- Configuration: asset locations and inference settings -------------------
model_dir = "./Assets/Models"
model_file = f"{model_dir}/yv5s-11c.onnx"
# Path to the label list. Deliberately not called `names`: the cell that creates the
# ONNX session binds `names` to the class labels read from the model metadata, and
# re-running this configuration cell would otherwise overwrite those labels with a path.
names_file = f"{model_dir}/11c_names.txt"
camera = "/dev/video0"
test_image = f"{model_dir}/test_box.jpeg"
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
test_video = "/home/sergio-de/Movies/TestVideoLabBoxesDisplays.mp4"

net_input_size = 416  # side length (pixels) of the square image fed to the network
device = torch.device('cpu')

In [3]:
# Create the ONNX Runtime session. The execution provider is named explicitly because
# onnxruntime builds that ship additional providers require an explicit choice.
session = ort.InferenceSession(model_file, providers=["CPUExecutionProvider"])
inputs = [x.name for x in session.get_inputs()]
outputs = [x.name for x in session.get_outputs()]

# Custom metadata stored in the model file: stride and class labels.
meta = session.get_modelmeta().custom_metadata_map
# The labels are stored as the text of a Python literal. Parse it with literal_eval
# rather than eval so a crafted model file cannot execute arbitrary code.
names = ast.literal_eval(meta['names'])
# Accept either an {index: label} mapping or a plain sequence of labels.
names = list(names.values()) if isinstance(names, dict) else list(names)
stride = int(meta['stride'])
print(f"Stride: {stride} | Inputs: {inputs} | Outputs: {outputs}")
print(f"Labels: {names}")

Stride: 32 | Inputs: ['images'] | Outputs: ['output']
Labels: ['box', 'monitor,', 'refrigerator', 'microwave oven', 'television', 'door', 'bed', 'humidifier', 'printer', 'drawer', 'pc']


In [5]:
# https://raw.githubusercontent.com/ultralytics/yolov5/master/utils/augmentations.py

def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    """Resize an image to fit ``new_shape`` and pad the remainder with a constant colour.

    Args:
        im: Image array whose first two dimensions are (height, width).
        new_shape: Target (height, width); a single int means a square target.
        color: Border colour passed to ``cv2.copyMakeBorder``.
        auto: If true, reduce the padding modulo ``stride`` (minimum rectangle).
        scaleFill: If true (and ``auto`` is false), stretch to the target with no padding.
        scaleup: If false, the image is only ever scaled down, never up.
        stride: Modulus applied to the padding when ``auto`` is true.

    Returns:
        Tuple ``(image, (width_ratio, height_ratio), (dw, dh))`` where ``dw``/``dh`` are
        the per-side paddings (half of the total padding along each axis).
    """
    src_h, src_w = im.shape[0], im.shape[1]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    target_h, target_w = new_shape[0], new_shape[1]

    # Uniform scale factor (new / old) that makes the image fit inside the target
    scale = min(target_h / src_h, target_w / src_w)
    if not scaleup:
        scale = min(scale, 1.0)

    ratio = (scale, scale)  # width, height ratios
    new_unpad = (int(round(src_w * scale)), int(round(src_h * scale)))  # (width, height)
    pad_w = target_w - new_unpad[0]
    pad_h = target_h - new_unpad[1]
    if auto:
        # Minimum rectangle: keep only the padding needed to reach a stride multiple
        pad_w, pad_h = np.mod(pad_w, stride), np.mod(pad_h, stride)
    elif scaleFill:
        # Stretch: no padding, independent ratios per axis
        pad_w, pad_h = 0.0, 0.0
        new_unpad = (target_w, target_h)
        ratio = (target_w / src_w, target_h / src_h)

    # Split the padding evenly between the two sides of each axis
    pad_w, pad_h = pad_w / 2, pad_h / 2

    if (src_w, src_h) != new_unpad:
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)

    # The +/-0.1 offsets send an odd total padding's extra pixel to bottom/right
    top = int(round(pad_h - 0.1))
    bottom = int(round(pad_h + 0.1))
    left = int(round(pad_w - 0.1))
    right = int(round(pad_w + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (pad_w, pad_h)

def showim(name, im):
    """Show ``im`` in an OpenCV window titled ``name``, then close all windows.

    Args:
        name: Window name passed to the OpenCV window functions.
        im: Image passed to ``cv2.imshow``.
    """
    cv2.namedWindow(name, cv2.WINDOW_AUTOSIZE)
    cv2.startWindowThread()
    # Request that the window stays on top of other windows.
    cv2.setWindowProperty(name, cv2.WND_PROP_TOPMOST, 1)
    cv2.imshow(name, im)
    # Wait for a key event before tearing the window down.
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # NOTE(review): presumably lets the GUI process the close request so the window
    # actually disappears when run from a notebook kernel - confirm on the target backend.
    cv2.waitKey()

In [6]:
# Load Image
im0 = cv2.imread(test_image)
# cv2.imread signals failure by returning None instead of raising; fail here with a
# clear message rather than with an AttributeError inside letterbox().
if im0 is None:
    raise FileNotFoundError(f"Could not read test image: {test_image}")
shape = (net_input_size, net_input_size)
# scaleFill=True with auto=False stretches the image to exactly `shape` (no padding).
im = letterbox(im0, new_shape=shape, stride=stride, scaleFill=True, auto=False)[0]
print(im.shape)
# showim("Resized", im)
im = im.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
print(im.shape)
im = np.ascontiguousarray(im)  # contiguous attribute
print(im.shape)
im = im.astype(np.float32)
im /= 255  # scale pixel values from 0-255 to 0.0-1.0
im = im[None]  # add a leading batch dimension
print(im.shape)

(416, 416, 3)
(3, 416, 416)
(3, 416, 416)
(1, 3, 416, 416)


In [9]:
# Run the ONNX session on the preprocessed image and keep the first returned output.
y = session.run(outputs, {inputs[0]: im})[0]
# Wrap the NumPy result in a torch tensor placed on the configured device.
pred = torch.from_numpy(y).to(device)
print(pred.shape)
print(pred)

torch.Size([1, 10647, 16])
tensor([[[5.6869e+00, 5.0823e+00, 1.1613e+01,  ..., 7.7610e-03,
          3.7501e-02, 1.0390e-02],
         [1.4888e+01, 5.9196e+00, 2.6811e+01,  ..., 1.1135e-02,
          3.7146e-02, 1.2119e-02],
         [2.0410e+01, 4.3510e+00, 3.3332e+01,  ..., 1.0906e-02,
          5.0582e-02, 8.5611e-03],
         ...,
         [3.3549e+02, 3.8409e+02, 2.7865e+02,  ..., 2.4195e-02,
          3.1685e-01, 1.2342e-01],
         [3.5968e+02, 3.8287e+02, 2.0404e+02,  ..., 3.7138e-02,
          3.1146e-01, 9.4836e-02],
         [3.8760e+02, 3.8950e+02, 1.8522e+02,  ..., 6.2329e-02,
          2.7714e-01, 6.7724e-02]]])


In [12]:
# Inspect which device the prediction tensor is stored on.
pred.device

device(type='cpu')

In [None]:
def xywh2xyxy(x):
    """Convert boxes from centre format to corner format.

    Args:
        x: ``torch.Tensor`` or array-like whose last dimension holds
            ``[center_x, center_y, width, height]``. Any number of leading
            dimensions is accepted (a single box, an ``(n, 4)`` batch, ...).

    Returns:
        A new tensor/array of the same shape holding ``[x1, y1, x2, y2]``, where
        ``(x1, y1)`` is the top-left and ``(x2, y2)`` the bottom-right corner.
        The input is not modified.
    """
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    # Index with `...` so the conversion works on the last axis regardless of rank.
    y[..., 0] = x[..., 0] - x[..., 2] / 2  # top left x
    y[..., 1] = x[..., 1] - x[..., 3] / 2  # top left y
    y[..., 2] = x[..., 0] + x[..., 2] / 2  # bottom right x
    y[..., 3] = x[..., 1] + x[..., 3] / 2  # bottom right y
    return y

def non_max_suppression(
        prediction,
        conf_thres=0.25,
        iou_thres=0.45,
        classes=None,
        multi_label=False,
        labels=(),
        max_det=300,
        nm=0,  # number of masks
):
    """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections.

    Args:
        prediction: Tensor of shape (batch, boxes, 5 + nc + nm) laid out per box as
            [center_x, center_y, width, height, objectness, class scores..., mask coeffs...].
            A list/tuple is also accepted, in which case its first element is used.
        conf_thres: Minimum objectness, and minimum objectness * class score, to keep a box.
        iou_thres: IoU threshold passed to ``torchvision.ops.nms``.
        classes: Optional sequence of class indices; detections of other classes are dropped.
        multi_label: If true (and there is more than one class), a box may yield one
            detection per class whose score exceeds ``conf_thres``.
        labels: Optional per-image a-priori labels, rows of [cls, x, y, w, h], appended
            to the candidates with confidence 1.0.
        max_det: Maximum number of detections kept per image.
        nm: Number of trailing mask-coefficient columns in ``prediction``.

    Returns:
        List with one (n, 6 + nm) tensor per image: [x1, y1, x2, y2, conf, cls, masks...].

    Raises:
        ValueError: If ``conf_thres`` or ``iou_thres`` is outside [0, 1].
    """
    # Checks
    if not 0 <= conf_thres <= 1:
        raise ValueError(f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0')
    if not 0 <= iou_thres <= 1:
        raise ValueError(f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0')
    if isinstance(prediction, (list, tuple)):  # e.g. (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    device = prediction.device
    mps = 'mps' in device.type  # Apple MPS
    if mps:  # run NMS on CPU, results are moved back to `device` below
        prediction = prediction.cpu()

    bs = prediction.shape[0]  # batch size
    nc = prediction.shape[2] - nm - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Settings
    # min_wh = 2  # (pixels) minimum box width and height
    max_wh = 7680  # (pixels) maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 0.5 + 0.05 * bs  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    mi = 5 + nc  # mask start index
    output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence (boolean indexing copies, so `prediction` is not mutated below)

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + nm + 5), device=x.device)
            v[:, :4] = lb[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(lb)), lb[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box/Mask
        box = xywh2xyxy(x[:, :4])  # (center_x, center_y, width, height) to (x1, y1, x2, y2)
        mask = x[:, mi:]  # zero columns if no masks

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:mi] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, 5 + j, None], j[:, None].float(), mask[i]), 1)
        else:  # best class only
            conf, j = x[:, 5:mi].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence
        else:
            x = x[x[:, 4].argsort(descending=True)]  # sort by confidence

        # Batched NMS: offsetting boxes by class * max_wh keeps different classes from overlapping
        c = x[:, 5:6] * max_wh  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = torchvision.ops.box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if mps:
            output[xi] = output[xi].to(device)
        if (time.time() - t) > time_limit:
            warnings.warn(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
            break  # time limit exceeded

    return output