In [1]:
import ast
import time, json
from os.path import basename, dirname

import cv2
import onnxruntime as ort
import numpy as np
import torch, torchvision

In [69]:
# Locations of the model assets used by the cells below.
model_dir = "./Assets/Models"
model_file = f"{model_dir}/yv5s-11c.onnx"
# NOTE(review): `names` is reassigned from the model metadata when the inference
# session is created, so this path is never read in the visible cells.
names = f"{model_dir}/11c_names.txt"
# NOTE(review): `camera` is not referenced in the visible cells.
camera = "/dev/video0"
#test_image = f"{model_dir}/test_bed.jpeg"
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
test_image = "/Users/sergio-de/Movies/testvid/frames/frame-0594.jpeg"
# Text file read one path per line when `video` is enabled.
video_frames = "/Users/sergio-de/Movies/testvid/frames/framepaths.txt"

# Side length of the square input passed to prepare_input().
net_input_size = 416
# Device the raw predictions are moved to before NMS.
device = torch.device('cpu')

In [70]:
# Create the ONNX Runtime session and collect the graph's input/output names.
session = ort.InferenceSession(model_file)
inputs = [node.name for node in session.get_inputs()]
outputs = [node.name for node in session.get_outputs()]

# The model's custom metadata carries the stride and the class-id -> label
# mapping, both stored as strings.
meta = session.get_modelmeta().custom_metadata_map
# SECURITY: the mapping used to be parsed with eval(), which would execute any
# expression embedded in a tampered model file. literal_eval only accepts
# Python literals (the mapping is a plain dict literal).
names_by_id = ast.literal_eval(meta['names'])
stride = int(meta['stride'])
# Flatten the mapping into a list of labels, in the mapping's own key order.
names = [names_by_id[class_id] for class_id in names_by_id]
print(f"Stride: {stride} | Inputs: {inputs} | Outputs: {outputs}")
print(f"Labels: {names}")

Stride: 32 | Inputs: ['images'] | Outputs: ['output']
Labels: ['box', 'monitor,', 'refrigerator', 'microwave oven', 'television', 'door', 'bed', 'humidifier', 'printer', 'drawer', 'pc']


In [71]:
# https://raw.githubusercontent.com/ultralytics/yolov5/master/utils/augmentations.py

def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    """Resize `im` to fit `new_shape` and pad the remainder with `color`.

    Args:
        im: image array whose first two axes are (height, width).
        new_shape: target (height, width), or a single int for a square target.
        color: border colour used for the padding.
        auto: pad only up to the next multiple of `stride` (minimum rectangle).
        scaleFill: stretch to `new_shape` without padding (only consulted when
            `auto` is False).
        scaleup: allow enlarging the image; when False it is only ever shrunk.
        stride: stride multiple used by the `auto` padding.

    Returns:
        (image, ratio, (dw, dh)) where `ratio` is the (width, height) scale
        applied and (dw, dh) is the padding added per side, already halved.
    """
    shape = im.shape[:2]  # current shape [height, width]
    src_h, src_w = shape[0], shape[1]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Uniform scale factor (new / old), optionally capped so we never enlarge
    scale = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        scale = min(scale, 1.0)

    ratio = (scale, scale)  # width, height ratios
    new_unpad = (int(round(src_w * scale)), int(round(src_h * scale)))  # (width, height)
    dw = dst_w - new_unpad[0]  # total horizontal padding
    dh = dst_h - new_unpad[1]  # total vertical padding

    if auto:
        # minimum rectangle: keep only the remainder modulo the stride
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)
    elif scaleFill:
        # stretch: no padding, independent ratios per axis
        dw, dh = 0.0, 0.0
        new_unpad = (dst_w, dst_h)
        ratio = (dst_w / src_w, dst_h / src_h)

    # split the padding between the two sides
    dw /= 2
    dh /= 2

    if (src_w, src_h) != new_unpad:
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)

    # The +/-0.1 nudges send an odd padding pixel to the bottom/right side
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (dw, dh)

def showim(name, im):
    """Show `im` in an OpenCV window titled `name` and block until a key is pressed.

    All OpenCV windows are destroyed afterwards.
    """
    cv2.namedWindow(name, cv2.WINDOW_AUTOSIZE)
    cv2.startWindowThread()
    cv2.setWindowProperty(name, cv2.WND_PROP_TOPMOST, 1)  # request always-on-top
    cv2.imshow(name, im)
    cv2.waitKey(0)  # block until any key is pressed
    cv2.destroyAllWindows()
    # NOTE(review): presumably gives the GUI event loop a chance to actually
    # close the window — confirm this does not block on the target platform.
    cv2.waitKey()

def xywh2xyxy(x):
    """Convert nx4 boxes from [cx, cy, w, h] to [x1, y1, x2, y2].

    (x1, y1) is the top-left corner and (x2, y2) the bottom-right corner.
    Accepts a torch.Tensor or a numpy array and returns a new object of the
    same kind; the input is left untouched.
    """
    converted = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    converted[:, 0] = x[:, 0] - half_w  # top left x
    converted[:, 1] = x[:, 1] - half_h  # top left y
    converted[:, 2] = x[:, 0] + half_w  # bottom right x
    converted[:, 3] = x[:, 1] + half_h  # bottom right y
    return converted

def non_max_suppression(
        prediction,
        conf_thres=0.25,
        iou_thres=0.45,
        max_det=300,
):
    """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections

    Args:
        prediction: tensor indexed as [image, candidate, values], where the values
            of one candidate are (cx, cy, w, h, obj_conf, cls_conf_0, ...).
        conf_thres: minimum objectness, and minimum obj_conf * cls_conf, a
            candidate needs to be kept.
        iou_thres: IoU above which the lower-scoring box of a class is suppressed.
        max_det: maximum number of detections kept per image.

    Returns:
         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
    """

    bs = prediction.shape[0]  # batch size
    xc = prediction[..., 4] > conf_thres  # candidates

    # Settings
    max_wh = 7680  # (pixels) maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 0.5 + 0.05 * bs  # seconds to quit after
    redundant = True  # require redundant detections
    merge = False  # use merge-NMS

    t = time.time()
    # One independent empty tensor per image (a `[tensor] * bs` list would make
    # every entry alias the same tensor object).
    output = [torch.zeros((0, 6), device=prediction.device) for _ in range(bs)]
    for xi, x in enumerate(prediction):  # image index, image inference
        x = x[xc[xi]]  # keep candidates above the objectness threshold (boolean indexing copies)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # (center_x, center_y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls), best class only
        conf, j = x[:, 5:].max(1, keepdim=True)
        x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        # sort by confidence and drop anything beyond max_nms boxes
        x = x[x[:, 4].argsort(descending=True)[:max_nms]]

        # Batched NMS: shifting each box by class * max_wh keeps boxes of
        # different classes from overlapping, so one nms() call works per class.
        c = x[:, 5:6] * max_wh  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = torchvision.ops.box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            # Images not yet processed keep their empty (0, 6) result.
            print(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
            break  # time limit exceeded

    return output

def clip_boxes(boxes, shape):
    """Clip xyxy boxes in place so they lie inside an image of `shape` (height, width).

    Works on torch tensors (clamped column by column) and numpy arrays
    (clipped two columns at a time). Returns nothing.
    """
    max_y, max_x = shape[0], shape[1]
    if isinstance(boxes, torch.Tensor):
        # x1, y1, x2, y2 each clamped against the matching image extent
        for column, upper in ((0, max_x), (1, max_y), (2, max_x), (3, max_y)):
            boxes[:, column].clamp_(0, upper)
    else:
        x_cols, y_cols = [0, 2], [1, 3]
        boxes[:, x_cols] = boxes[:, x_cols].clip(0, max_x)  # x1, x2
        boxes[:, y_cols] = boxes[:, y_cols].clip(0, max_y)  # y1, y2

def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
    """Rescale xyxy boxes in place from `img1_shape` back to `img0_shape`.

    Args:
        img1_shape: (height, width) of the image the boxes currently refer to.
        boxes: nx4 (or wider) boxes; the first four columns are modified in place.
        img0_shape: shape of the target image, (height, width, ...).
        ratio_pad: optional ((gain, ...), (pad_w, pad_h)); when omitted the gain
            and padding are derived from the two shapes assuming a single
            uniform scale with centred padding.

    Returns:
        The same `boxes` object, rescaled and clipped to `img0_shape`.
    """
    if ratio_pad is not None:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    else:
        # gain = old / new, using the tighter of the two axes
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])
        pad_w = (img1_shape[1] - img0_shape[1] * gain) / 2
        pad_h = (img1_shape[0] - img0_shape[0] * gain) / 2
        pad = pad_w, pad_h  # wh padding

    boxes[:, [0, 2]] -= pad[0]  # remove x padding
    boxes[:, [1, 3]] -= pad[1]  # remove y padding
    boxes[:, :4] /= gain
    clip_boxes(boxes, img0_shape)
    return boxes

def xyxy2xywh(x):
    """Convert nx4 boxes from [x1, y1, x2, y2] to [cx, cy, w, h].

    (x1, y1) is the top-left corner and (x2, y2) the bottom-right corner.
    Accepts a torch.Tensor or a numpy array and returns a new object of the
    same kind; the input is left untouched.
    """
    converted = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    left, top, right, bottom = x[:, 0], x[:, 1], x[:, 2], x[:, 3]
    converted[:, 0] = (left + right) / 2  # x center
    converted[:, 1] = (top + bottom) / 2  # y center
    converted[:, 2] = right - left  # width
    converted[:, 3] = bottom - top  # height
    return converted

def prepare_input(im0, net_size, stride):
    """Turn a BGR HWC image into the float32 batch the network expects.

    The image is stretched to (net_size, net_size) via letterbox() with
    scaleFill=True (no padding), reordered to RGB CHW, scaled to [0, 1] and
    given a leading batch axis. The first three pixels are printed after each
    stage for debugging.

    Returns:
        numpy float32 array of shape (1, c, net_size, net_size).
    """
    im, _, _ = letterbox(im0, new_shape=(net_size, net_size), stride=stride, scaleFill=True, auto=False)  # to (sz, sz, c)
    print(f"Original {im.shape}: first 3 pixels {im[0][0]}, {im[0][1]}, {im[0][2]}")

    # HWC to CHW, then reverse the channel axis: BGR to RGB [to (c, sz, sz)]
    im = im.transpose((2, 0, 1))[::-1]
    print((f"Transposed {im.shape}: first 3 pixels "+
          f"[{im[0][0][0]} {im[1][0][0]} {im[2][0][0]}], "+
          f"[{im[0][0][1]} {im[1][0][1]} {im[2][0][1]}], "+
          f"[{im[0][0][2]} {im[1][0][2]} {im[2][0][2]}]"))

    # The reversed view has negative strides; make it contiguous before casting
    im = np.ascontiguousarray(im).astype(np.float32)
    im /= 255
    im = im[None]  # to (batch, c, sz, sz)

    # im[0, :, 0, k] holds the channel values of the k-th pixel of the first row
    print((f"Normalized and batched {im.shape}:\nfirst 3 pixels\n"+
          "[{0:.3f} {1:.3f} {2:.3f}],\n".format(*im[0, :, 0, 0])+
          "[{0:.3f} {1:.3f} {2:.3f}],\n".format(*im[0, :, 0, 1])+
          "[{0:.3f} {1:.3f} {2:.3f}]".format(*im[0, :, 0, 2])))
    return im

In [72]:
# Load Images
# video=True processes every path listed in `video_frames`;
# video=False processes only `test_image`.
video = False

if video:
    with open(video_frames, "r", encoding="utf-8") as fh:
        # Skip blank lines so an empty entry is never treated as a frame path.
        frames = [path for path in fh.read().splitlines() if path.strip()]
else:
    frames = [test_image]

if not frames:
    raise ValueError(f"No frame paths found in {video_frames}")

# dirname() returns "" for a bare filename; fall back to the working directory
# so the annotations file is not written to the filesystem root.
output_path = dirname(frames[0]) or "."
output_file = f"{output_path}/annotations.json"

In [74]:
# Run the model on every frame and collect the detections per frame as
# JSON-serialisable dicts in `output`.
output = []

for idx, frame in enumerate(frames, start=1):  # frame ids are 1-based
    im0 = cv2.imread(frame)
    if im0 is None:
        # cv2.imread reports a missing/unreadable file by returning None
        raise FileNotFoundError(f"Could not read image: {frame}")
    im = prepare_input(im0, net_input_size, stride)

    y = session.run(outputs, {inputs[0]: im})[0]

    # Dump the raw candidates above the objectness threshold: box + objectness
    # with 3 decimals, then one score per class with 2 decimals (any class count).
    print("## Check for weird output ##")
    raw = y[0]
    for r in raw[raw[:, 4] > 0.25]:
        box_part = ",".join(f"{v:.3f}" for v in r[:5])
        cls_part = ",".join(f"{v:.2f}" for v in r[5:])
        print(f"[{box_part}  {cls_part}]")
    print("## ###################### ##")

    pred = torch.from_numpy(y).to(device)
    det = non_max_suppression(pred)[0]
    print(f"Results filtered by NMS: {det}")

    h0, w0 = im0.shape[:2]
    net_h, net_w = im.shape[2:]
    gn = torch.tensor([w0, h0, w0, h0])  # normalization gain whwh

    print(f"Boxes before scaling: {det[:, :4]}")
    # prepare_input() stretches the frame to the network size (scaleFill=True, no
    # padding), so each axis has its own gain. The letterbox-style scale_boxes()
    # (single gain + centred padding) would misplace boxes on non-square frames.
    det[:, [0, 2]] /= net_w / w0
    det[:, [1, 3]] /= net_h / h0
    clip_boxes(det[:, :4], im0.shape)  # det[:, :4] is a view, so det is clipped in place
    det[:, :4] = det[:, :4].round()
    print(f"Boxes after scaling: {det}") # all boxes in format [x1, y1, x2, y2, conf, class]

    # One-line summary: "<file>: <count> <label>[s], ..."
    s = f"{basename(frame)}: "
    for c in det[:, 5].unique():
        n = int((det[:, 5] == c).sum())  # detections per class
        s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string

    print(s)

    objects = []
    for *xyxy, conf, cls in reversed(det):
        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
        class_id = int(cls)
        objects.append({
            "class_id": class_id,
            "name": names[class_id],
            "relative_coordinates": {
                "center_x": xywh[0],
                "center_y": xywh[1],
                "width": xywh[2],
                "height": xywh[3],
            },
            "confidence": conf.item()
        })

    output.append({
        "frame_id": idx,
        "filename": frame,
        "objects": objects,
    })

if not video:
    print(json.dumps(output, indent=4))

Original (416, 416, 3): first 3 pixels [43 46 50], [43 46 50], [44 47 51]
Transposed (3, 416, 416): first 3 pixels [50 46 43], [50 46 43], [51 47 44]
Normalized and batched (1, 3, 416, 416):
first 3 pixels
[0.196 0.180 0.169],
[0.196 0.180 0.169],
[0.200 0.184 0.173]
## Check for weird output ##
[9.964,67.483,18.444,42.888,0.588  1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00]
[10.023,68.307,18.277,43.461,0.767  1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00]
[29.339,75.108,25.289,29.450,0.394  1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00]
[9.941,69.520,18.406,44.371,0.339  1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00]
[29.847,76.057,26.518,30.891,0.872  1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00]
[30.280,76.822,27.385,30.472,0.834  1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00]
[84.791,83.191,28.855,32.350,0.350  1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00]
[110.484,89.734,25.294,27.920,0.820  1.00,0.00,0.00,0.00,0.00

In [None]:
# save output
# Serialize before opening the file so that a serialization error cannot leave a
# truncated annotations file behind.
annotations_json = json.dumps(output, ensure_ascii=False, indent=4)
with open(output_file, 'w', encoding="utf-8") as fh:
    fh.write(annotations_json)