In [1]:
import os
import cv2
import pynq
import time
import copy
import numpy as np
from pynq import MMIO

ERROR: Could not find a version that satisfies the requirement onnxruntime==0.14.1 (from versions: 1.12.0, 1.12.1, 1.13.1, 1.14.0, 1.14.1, 1.15.0)
ERROR: No matching distribution found for onnxruntime==0.14.1

In [2]:
import sys
import os
import time

sys.path.append(os.path.abspath("../common"))

import math
import time
import numpy as np
from PIL import Image
from matplotlib import pyplot
import cv2
from datetime import datetime

import pynq
import dac_sdc
from IPython.display import display

# Harness configuration. dac_sdc.BATCH_SIZE is overridden before the Team is
# created; later cells read the same attribute to size the DMA buffers and the
# input/output tensor shapes.
team_name = 'fpgaconvnet'
dac_sdc.BATCH_SIZE = 64
# NOTE(review): what Team() sets up is defined in the dac_sdc module
# (presumably under ../common, which is appended to sys.path above) - it is
# not visible in this notebook.
team = dac_sdc.Team(team_name)

In [3]:
import math
from typing import List
from dataclasses import dataclass

import numpy as np

import pynq

@dataclass
class Partition:
    """One hardware partition: an overlay bitstream plus its DMAs and buffers.

    Constructing a Partition loads ``bitstreams/p{index}.bit``, looks up the
    overlay attributes ``dma_0`` .. ``dma_{n_dma-1}``, maps a 0x1000-byte
    register window at 0xA0020000, allocates int16 input/output buffers and
    programs/starts the hardware.

    Fields:
        index: partition number, used to build the bitstream path.
        n_dma: number of DMA engines to look up on the overlay.
        input_sizes: per-input sample shape; one buffer is allocated per entry.
        output_sizes: per-output sample shape; one buffer is allocated per entry.
        output_streams: per-output divisor applied to the element count that
            is written to the output count registers (see setup_hardware).
        input_bp: per-input binary point, read by callers when quantising.
        output_bp: per-output binary point, read by callers when de-quantising.
        batch_size: number of samples each buffer holds initially.

    Register offsets written by this class:
        0x0         control word (values 0x0, 0x1, 0x2, 0x4 are written)
        0x4         weight index
        0x8 + 4*i   element count for output i
    """
    index: int
    n_dma: int
    input_sizes: List[tuple[int,int,int]]
    output_sizes: List[tuple[int,int,int]]
    output_streams: List[int]
    input_bp: List[int]
    output_bp: List[int]
    batch_size: int = 64

    def __post_init__(self):
        """Load the overlay, map registers, allocate buffers and start the hardware."""
        # create an overlay
        self.overlay = pynq.Overlay(f"bitstreams/p{self.index}.bit")

        # get all DMA
        self.dma = [getattr(self.overlay, f"dma_{i}") for i in range(self.n_dma)]

        # initialise partition register file; MMIO is reached through the
        # pynq module imported by this cell rather than a name from another cell
        self.baseaddr = 0xA0020000
        self.regfile = pynq.MMIO(self.baseaddr, 0x1000)

        # allocate input and output buffers for the configured batch size
        self.update_input_buffers(self.batch_size)
        self.update_output_buffers(self.batch_size)

        # empty set of weight buffers
        self.weight_buffers = {}

        # setup the hardware
        self.setup_hardware()

    @staticmethod
    def _allocate_buffers(sizes, batch_size):
        """Allocate one flat int16 buffer of batch_size*prod(size) elements per size."""
        return [pynq.allocate(shape=(batch_size * math.prod(s)), dtype=np.int16)
                for s in sizes]

    def update_input_buffers(self, batch_size):
        """Replace the input buffers with new ones sized for ``batch_size`` samples.

        NOTE(review): the output counts programmed by setup_hardware() still
        use self.batch_size after this call - confirm the hardware tolerates
        a smaller batch.
        """
        self.input_buffers = self._allocate_buffers(self.input_sizes, batch_size)

    def update_output_buffers(self, batch_size):
        """Replace the output buffers with new ones sized for ``batch_size`` samples."""
        self.output_buffers = self._allocate_buffers(self.output_sizes, batch_size)

    def setup_hardware(self):
        """Clear the control word, program the per-output counts, then start."""
        # initialise the hardware
        self.regfile.write(0x0, 0)
        for (i, (width, size)) in enumerate(zip(self.output_streams, self.output_sizes)):
            self.regfile.write(0x8+i*4,
                self.batch_size*math.prod(size)//width)
        self.start_hardware()

    def reset_hardware(self):
        """Write 0x2 and then 0x0 to the control word."""
        self.regfile.write(0x0, 0x2)
        self.regfile.write(0x0, 0x0)

    def start_hardware(self):
        """Write 0x4 to the control word."""
        self.regfile.write(0x0, 0x4)

    def stop_hardware(self):
        """Write 0x0 to the control word."""
        self.regfile.write(0x0, 0x0)

    def allocate_weights(self, index: int, weights_filepath: str):
        """Read one hexadecimal word per line and stage it in a uint32 buffer.

        The buffer is stored in ``self.weight_buffers[index]``. Blank lines
        (for example a trailing newline at the end of the file) are skipped
        instead of raising ValueError.
        """
        # load the weights into a numpy array
        with open(weights_filepath, "r") as f:
            weights = np.array([int(line, base=16) for line in f if line.strip()],
                               dtype=np.uint32)

        # allocate a pynq buffer for the weights
        self.weight_buffers[index] = pynq.allocate(
                shape=weights.shape, dtype=np.uint32)

        # get the values of weights
        self.weight_buffers[index][:] = weights

    def reload_weights(self, index: int):
        """Stream ``self.weight_buffers[index]`` to the hardware over ``dma[1]``.

        Raises KeyError if allocate_weights() has not been called for ``index``.
        """
        # set to update mode
        self.regfile.write(0x0, 0x1)

        # set the weight index
        self.regfile.write(0x4, index)

        # transfer the weights (always on the second DMA engine)
        self.dma[1].sendchannel.transfer(self.weight_buffers[index])

        # wait for transfer to finish
        self.dma[1].sendchannel.wait()

        # end update mode
        self.regfile.write(0x0, 0x0)

        self.reset_hardware()

        # set the weight index somewhere else
        self.regfile.write(0x4, 0xFFFF)

        self.start_hardware()

    def download(self):
        """Re-download the bitstream and re-program the hardware registers."""
        # download the bitstream
        self.overlay.download()

        # setup the hardware
        self.setup_hardware()

    def send_dma(self, index: int):
        """Start the hardware and begin sending ``input_buffers[index]``."""
        self.start_hardware()
        self.dma[index].sendchannel.transfer(self.input_buffers[index])

    def recv_dma(self, index: int):
        """Begin receiving into ``output_buffers[index]``."""
        self.dma[index].recvchannel.transfer(self.output_buffers[index])

    def wait_dma(self, index: int):
        """Wait for the receive channel, then the send channel, of ``dma[index]``.

        Errors raised by either wait() are reported as a warning and do not
        abort the call (best effort, as before). Only ``Exception`` is caught,
        so KeyboardInterrupt/SystemExit propagate and a hung transfer can be
        interrupted from the notebook.
        """
        # wait to receive
        try:
            self.dma[index].recvchannel.wait()
        except Exception:
            print("WARNING: recv channel finished")

        # wait to send
        try:
            self.dma[index].sendchannel.wait()
        except Exception:
            print("WARNING: send channel finished")
    

In [4]:
# Partition 0: two DMA engines, one 320x320x3 input and one 40x40x64 output,
# sized for the harness batch size.
p0 = Partition(
    index=0,
    n_dma=2,
    input_sizes=[[320, 320, 3]],
    output_sizes=[[40, 40, 64]],
    output_streams=[4],
    input_bp=[13],
    output_bp=[11],
    batch_size=dac_sdc.BATCH_SIZE,
)

In [5]:
def xywh2xyxy(x):
    """Convert boxes from centre form [x, y, w, h] to corner form [x1, y1, x2, y2].

    The last axis of ``x`` holds the four box values; (x1, y1) is the top-left
    corner and (x2, y2) the bottom-right. ``x`` is not modified; a copy with
    the same shape and dtype is returned.
    """
    corners = np.copy(x)
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2
    corners[..., 0] = x[..., 0] - half_w  # left
    corners[..., 1] = x[..., 1] - half_h  # top
    corners[..., 2] = x[..., 0] + half_w  # right
    corners[..., 3] = x[..., 1] + half_h  # bottom
    return corners

def box_iou_batch(boxes_a: np.ndarray, boxes_b: np.ndarray) -> np.ndarray:
    """Pairwise intersection-over-union between two sets of corner-form boxes.

    Args:
        boxes_a: array of shape (A, 4) holding [x1, y1, x2, y2] rows.
        boxes_b: array of shape (B, 4) holding [x1, y1, x2, y2] rows.

    Returns:
        Array of shape (A, B) where entry [i, j] is the IoU of boxes_a[i]
        and boxes_b[j].
    """
    area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])
    area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])

    # corners of the intersection rectangle for every (a, b) pair
    inter_top_left = np.maximum(boxes_a[:, None, :2], boxes_b[:, :2])
    inter_bottom_right = np.minimum(boxes_a[:, None, 2:], boxes_b[:, 2:])

    # negative extents mean no overlap, so clamp them to zero
    inter_wh = np.clip(inter_bottom_right - inter_top_left, a_min=0, a_max=None)
    area_inter = np.prod(inter_wh, 2)

    area_union = area_a[:, None] + area_b - area_inter
    return area_inter / area_union

def non_max_suppression(predictions: np.ndarray, scores: np.ndarray,
        categories: np.ndarray, iou_threshold: float = 0.5) -> np.ndarray:
    """Greedy per-category non-maximum suppression.

    Boxes are visited from highest to lowest score. Each box that is still
    kept suppresses every other box of the same category whose IoU with it
    exceeds ``iou_threshold``.

    Args:
        predictions: (N, 4) boxes as [x1, y1, x2, y2].
        scores: (N,) confidence per box.
        categories: (N,) category id per box.
        iou_threshold: overlap above which a lower-scored box is dropped.

    Returns:
        Boolean mask of shape (N,), in the same order as the inputs, that is
        True for the boxes to keep.
    """
    rows = predictions.shape[0]

    # Highest score first; a stable sort keeps tied boxes in input order so
    # the result is deterministic.
    sort_index = np.argsort(-scores, kind="stable")

    # The suppression loop below must walk the boxes in score order, so the
    # boxes and categories are reordered before the IoU matrix is built.
    sorted_boxes = predictions[sort_index]
    sorted_categories = categories[sort_index]

    ious = box_iou_batch(sorted_boxes, sorted_boxes)
    ious = ious - np.eye(rows)  # a box must not suppress itself

    keep = np.ones(rows, dtype=bool)

    for index, (iou, category) in enumerate(zip(ious, sorted_categories)):
        if not keep[index]:
            continue

        condition = (iou > iou_threshold) & (sorted_categories == category)
        keep = keep & ~condition

    # undo the sort so the mask lines up with the caller's ordering
    return keep[sort_index.argsort()]

def yolo_nms(
        prediction,
        conf_thres=0.25,
        iou_thres=0.5,
        classes=None,
        agnostic=False,
        multi_label=False,
        labels=(),
        max_det=300,
        nm=0,  # number of masks
):
    """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections.

    Args:
        prediction: numpy array of shape (batch, boxes, 5 + nc + nm) whose
            last axis is [cx, cy, w, h, obj_conf, class scores..., masks...].
        conf_thres: minimum objectness, and minimum obj_conf * cls_conf, to keep.
        iou_thres: IoU threshold passed to non_max_suppression.
        classes: optional iterable of class ids; other classes are dropped.
        agnostic: when True boxes of different classes can suppress each other.
        multi_label: emit one detection per (box, class) above conf_thres
            instead of only the best class (only used when nc > 1).
        labels: optional per-image a-priori labels with rows
            [cls, cx, cy, w, h] that are appended with confidence 1.0.
        max_det: maximum number of detections returned per image.
        nm: number of trailing mask columns.

    Returns:
         list of detections, one (n, 6 + nm) array per image [xyxy, conf, cls, masks...]

    Raises:
        AssertionError: if conf_thres or iou_thres is outside [0, 1].
    """

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

    bs = prediction.shape[0]  # batch size
    nc = prediction.shape[2] - nm - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Settings
    max_wh = 7680  # (pixels) maximum box width and height
    max_nms = 30000  # maximum number of boxes passed to NMS
    multi_label &= nc > 1  # multiple labels per box

    mi = 5 + nc  # mask start index
    # one independent empty result per image (not the same array repeated)
    output = [np.zeros((0, 6 + nm)) for _ in range(bs)]
    for xi, x in enumerate(prediction):  # image index, image inference
        x = x[xc[xi]]  # confidence filter; boolean indexing returns a copy

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = np.asarray(labels[xi])
            v = np.zeros((len(lb), nc + nm + 5), dtype=x.dtype)
            v[:, :4] = lb[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[np.arange(len(lb)), lb[:, 0].astype(int) + 5] = 1.0  # cls
            x = np.concatenate((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box/Mask
        box = xywh2xyxy(x[:, :4])  # (center_x, center_y, width, height) to (x1, y1, x2, y2)

        mask = x[:, mi:]  # zero columns if no masks

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = np.nonzero(x[:, 5:mi] > conf_thres)
            x = np.concatenate(
                (box[i], x[i, 5 + j][:, None], j[:, None].astype(np.float32), mask[i]), 1)
        else:  # best class only
            conf = x[:, 5:mi].max(axis=1, keepdims=True)
            j = x[:, 5:mi].argmax(axis=1).reshape(-1, 1)
            x = np.concatenate((box, conf, j.astype(np.float32), mask), 1)[conf.reshape(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == np.array(classes)).any(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue

        x = x[x[:, 4].argsort()[::-1][:max_nms]]  # sort by confidence and remove excess boxes

        # Batched NMS: offsetting boxes by class keeps classes from overlapping
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores, categories = x[:, :4] + c, x[:, 4], x[:, 5]  # boxes (offset by class), scores

        keep = non_max_suppression(boxes, scores, categories, iou_thres)  # boolean mask over x

        # Apply the mask first, then cap the number of detections; slicing the
        # mask itself would make it shorter than x.
        output[xi] = x[keep][:max_det]

    return output

In [6]:
def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleFill=False, scaleup=True, stride=32):
    """Resize an image to fit ``new_shape`` and pad the remainder with ``color``.

    Args:
        im: image array whose first two axes are (height, width).
        new_shape: target (height, width), or a single int for a square.
        color: border value passed to cv2.copyMakeBorder.
        auto: pad only up to the next multiple of ``stride``.
        scaleFill: stretch to ``new_shape`` without padding (ignored when
            ``auto`` is set).
        scaleup: allow enlarging; when False the image is only ever shrunk.
        stride: stride used by ``auto``.

    Returns:
        (image, (width_ratio, height_ratio), (dw, dh)) where dw/dh are the
        per-side paddings before rounding.
    """
    src_h, src_w = im.shape[:2]

    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old), limited by the tighter dimension
    scale = min(new_shape[0] / src_h, new_shape[1] / src_w)
    if not scaleup:
        scale = min(scale, 1.0)

    ratio = scale, scale  # width, height ratios
    new_unpad = int(round(src_w * scale)), int(round(src_h * scale))  # (width, height)
    dw = new_shape[1] - new_unpad[0]
    dh = new_shape[0] - new_unpad[1]
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / src_w, new_shape[0] / src_h

    # split the padding evenly between the two sides
    dw = dw / 2
    dh = dh / 2

    if (src_w, src_h) != new_unpad:
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_NEAREST)

    # the +/-0.1 sends an odd leftover pixel to the bottom/right side
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (dw, dh)

def img_preprocess(img, imgsz, stride):
    """Letterbox an image and convert it to a float32 batch scaled to [0, 1].

    Args:
        img: image array laid out as (height, width, channels).
        imgsz: target (height, width) passed to letterbox.
        stride: model stride, forwarded to letterbox's ``stride`` argument.

    Returns:
        float32 array; a leading batch axis is added when the letterboxed
        image is 3-dimensional. The layout stays channels-last.
    """
    # stride must be passed by keyword: the third positional parameter of
    # letterbox is the border colour, not the stride.
    im, _, _ = letterbox(img, imgsz, stride=stride)  # padded resize
    im = im.astype(np.float32)
    im /= 255  # 0 - 255 to 0.0 - 1.0
    if im.ndim == 3:
        im = im[None]  # expand for batch dim
    return im

In [7]:
!pip install onnxruntime==1.14.1
!pip install tqdm

import onnxruntime
from tqdm import tqdm

# import onnx
# from onnxsim import simplify
# from onnxruntime.tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed


def load_model(model_path, gpu=False):
    """Open an ONNX model and read its stride / class names.

    Args:
        model_path: path to the .onnx file.
        gpu: when True the CUDA execution provider is tried before the CPU one.

    Returns:
        (stride, names, session, output_names). ``stride`` and ``names`` come
        from the model's custom metadata when it contains a 'stride' entry;
        otherwise stride is 32 and a built-in 7-class name table is used.

    Raises:
        ValueError / SyntaxError: if the 'names' metadata is not a plain
            Python literal.
    """
    import ast  # local import: only needed to parse the metadata literal

    providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if gpu else ['CPUExecutionProvider']
    session = onnxruntime.InferenceSession(model_path, providers=providers)
    output_names = [out.name for out in session.get_outputs()]
    meta = session.get_modelmeta().custom_metadata_map  # metadata
    if 'stride' in meta:
        stride = int(meta['stride'])
        # The metadata is a string stored inside the model file; parse it as
        # a literal instead of executing it with eval().
        names = ast.literal_eval(meta['names'])
    else:
        stride = 32
        names = {0: 'motor vehicle', 1: 'non-motor vehicle', 2: 'pedestrian', 3: 'red light', 4: 'yellow light', 5: 'green light', 6: 'off light'}
    return stride, names, session, output_names

def model_inf(img, session, output_names):
    """Run one inference pass and return the requested outputs.

    ``img`` is fed to the session's first input. A single output is returned
    as-is; several outputs are returned as a new list.
    """
    input_name = session.get_inputs()[0].name
    outputs = session.run(output_names, {input_name: img})

    if len(outputs) == 1:
        return outputs[0]
    return list(outputs)

# Square input resolution; also interpolated into the model file name below.
img_size = 320
# NOTE(review): not referenced by any other cell visible in this notebook.
floating_point_precision = 32

# path of the ONNX model that is run on the CPU through onnxruntime
onnx_model_path = f'onnx_models/yolov5n-imgsz-{img_size}-hs-trained_dynamic_batch_part_1.onnx'

# Disabled experiment: fix the model's input shape, simplify it and save an
# optimised copy. Kept for reference only; nothing below depends on it.
# onnx_model_opt_path = f'onnx_models/yolov5n-imgsz-{img_size}-hs-trained_dynamic_batch_part_1_opt.onnx'
# model = onnx.load(onnx_model_path)
# make_input_shape_fixed(model.graph, ", args.input_shape)
# fix_output_shapes(model)
# model, _ = simplify(model)

# onnx.save(model, onnx_model_opt_path)

# Create the inference session once; stride, session and output_names are
# read as globals by my_callback.
stride, names, session, output_names = load_model(onnx_model_path)



In [8]:
# images = os.listdir('images')
# rgb_imgs = []
# for img in images:
#     if img.endswith('jpg'):
#         img_path = os.path.join('images', img)
#         bgr_img = cv2.imread(img_path)
#         assert bgr_img is not None, f'Image Not Found {img_path}'
#         rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)
#         rgb_imgs.append((img_path, rgb_img))
# team.img_list = team.img_list[:54]

In [9]:
def clip_boxes(boxes, shape):
    """Clamp corner-form boxes to an image, in place.

    ``boxes`` holds [x1, y1, x2, y2] on its last axis and ``shape`` starts
    with (height, width). Nothing is returned; ``boxes`` itself is modified.
    """
    height, width = shape[0], shape[1]
    x_columns, y_columns = [0, 2], [1, 3]
    boxes[..., x_columns] = np.clip(boxes[..., x_columns], 0, width)  # x1, x2
    boxes[..., y_columns] = np.clip(boxes[..., y_columns], 0, height)  # y1, y2

def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
    """Map corner-form boxes from a letterboxed image back to the source image.

    Args:
        img1_shape: (height, width) of the letterboxed image the boxes refer to.
        boxes: array with [x1, y1, x2, y2] on its last axis; modified in place.
        img0_shape: shape of the source image, starting with (height, width).
        ratio_pad: optional ((gain, ...), (pad_x, pad_y)); when omitted both
            are derived from the two shapes.

    Returns:
        ``boxes`` (the same object), with padding removed, rescaled by the
        gain and clipped to ``img0_shape``.
    """
    if ratio_pad is not None:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    else:
        # gain = letterboxed size / source size, limited by the tighter axis
        height_gain = img1_shape[0] / img0_shape[0]
        width_gain = img1_shape[1] / img0_shape[1]
        gain = min(height_gain, width_gain)
        pad_x = (img1_shape[1] - img0_shape[1] * gain) / 2
        pad_y = (img1_shape[0] - img0_shape[0] * gain) / 2
        pad = pad_x, pad_y

    boxes[..., [0, 2]] -= pad[0]  # remove horizontal padding
    boxes[..., [1, 3]] -= pad[1]  # remove vertical padding
    boxes[..., :4] /= gain
    clip_boxes(boxes, img0_shape)
    return boxes

In [10]:
def format_input(data, binary_point = 8):
    """Convert an array to int16 fixed point with ``binary_point`` fractional bits.

    Values are multiplied by 2**binary_point and cast with ``astype``, which
    drops the fractional part rather than rounding.
    """
    scale = 2 ** binary_point
    return (data * scale).astype(np.int16)

def format_output(data, binary_point = 8, transpose = True):
    """Convert fixed-point values to float32 by dividing by 2**binary_point.

    When ``transpose`` is True the last axis is moved to position 1
    (for a 4-D array: channels-last to channels-first).
    """
    scaled = data.astype(np.float32) / float(2 ** binary_point)
    return np.moveaxis(scaled, -1, 1) if transpose else scaled

In [11]:
# Tensor shapes for one full batch (channels-last), as moved over DMA.
input_shape = (dac_sdc.BATCH_SIZE, 320, 320, 3)
output_shape = (dac_sdc.BATCH_SIZE, 40, 40, 64)

# Post-processing settings read by my_callback.
visualize = False
conf_thres = 0.25
iou_thres = 0.5
classes = None

# Letterbox target as [height, width]; the model input is square.
imgsz = [img_size, img_size]

visualise_path = "images_result_p0_only/"

def my_callback(rgb_imgs):
    """Run detection on one batch of images and return the boxes per file name.

    Pipeline: letterbox + normalise each image, quantise and send the batch to
    partition ``p0`` over DMA, de-quantise the result, run the remaining
    layers with onnxruntime, apply NMS and scale the boxes back to each source
    image.

    Args:
        rgb_imgs: iterable of (path, image) pairs; ``path`` must expose
            ``.name`` and ``image`` must expose ``.shape``.

    Returns:
        dict mapping ``path.name`` to a list of
        {"type", "x", "y", "width", "height"} dicts, where "type" is the
        class index plus one. An empty batch returns an empty dict.
    """
    object_locations_by_image = {}

    rgb_imgs = list(rgb_imgs)
    if not rgb_imgs:
        # np.vstack rejects an empty list; there is nothing to run anyway
        return object_locations_by_image

    source_info = []  # (path, original image shape) for each image
    batch_img = []
    for path, img in rgb_imgs:
        source_info.append((path, img.shape))
        batch_img.append(img_preprocess(img, imgsz, stride))

    batch_img = np.vstack(batch_img)
    curr_batch_size = batch_img.shape[0]

    sample_output_shape = tuple(output_shape[1:])
    output_elements = curr_batch_size * int(np.prod(sample_output_shape))

    # Resize the DMA buffers whenever they do not match this batch. Comparing
    # against the buffers' current size (rather than the nominal batch size)
    # also restores full-size buffers after a short final batch.
    if p0.input_buffers[0].size != batch_img.size:
        p0.update_input_buffers(batch_size = curr_batch_size)
    if p0.output_buffers[0].size != output_elements:
        p0.update_output_buffers(batch_size = curr_batch_size)

    # Put resized images into the DMA input buffer
    p0.input_buffers[0][:] = format_input(batch_img.flatten(), binary_point=p0.input_bp[0])

    p0.send_dma(0)
    p0.recv_dma(0)
    p0.wait_dma(0)

    # Copy the result out of the DMA buffer before it is reused
    raw_output = np.array(p0.output_buffers[0]).reshape((curr_batch_size, *sample_output_shape))
    out_buffer_float = format_output(raw_output, binary_point=p0.output_bp[0])

    # run the remaining layers on the CPU
    batch_pred = model_inf(out_buffer_float, session, output_names)

    batch_nms_pred = yolo_nms(batch_pred, conf_thres, iou_thres, classes)

    for pred, (path, source_shape) in zip(batch_nms_pred, source_info):
        # boxes are in letterboxed coordinates; map them back to the source image
        pred[:, :4] = scale_boxes(imgsz, pred[:, :4], source_shape).round()

        object_locations = []
        for p in pred:
            x1, y1, x2, y2 = int(p[0]), int(p[1]), int(p[2]), int(p[3])
            object_locations.append({"type":int(p[5]) + 1, "x":x1, "y":y1, "width":x2-x1, "height":y2-y1})

        # Save to dictionary by image filename
        object_locations_by_image[path.name] = object_locations

    return object_locations_by_image

In [12]:
# Hand the batch callback to the harness. The per-batch "starting"/"done"
# lines below are printed by the harness (presumably because debug=True).
team.run(my_callback, debug=True)

Batch 1 starting. 64 images.
Batch 1 done. Runtime = 6.324365139007568 seconds.
Batch 2 starting. 64 images.
Batch 2 done. Runtime = 6.199946880340576 seconds.
Batch 3 starting. 64 images.
Batch 3 done. Runtime = 6.071592092514038 seconds.
Batch 4 starting. 64 images.
Batch 4 done. Runtime = 6.160435914993286 seconds.
Batch 5 starting. 64 images.
Batch 5 done. Runtime = 6.030292987823486 seconds.
Batch 6 starting. 64 images.
Batch 6 done. Runtime = 6.057433843612671 seconds.
Batch 7 starting. 64 images.
Batch 7 done. Runtime = 6.121445894241333 seconds.
Batch 8 starting. 64 images.
Batch 8 done. Runtime = 6.090564966201782 seconds.
Batch 9 starting. 64 images.
Batch 9 done. Runtime = 6.111550807952881 seconds.
Batch 10 starting. 64 images.
Batch 10 done. Runtime = 6.091575622558594 seconds.
Batch 11 starting. 64 images.
Batch 11 done. Runtime = 6.100207090377808 seconds.
Batch 12 starting. 64 images.
Batch 12 done. Runtime = 6.11053991317749 seconds.
Batch 13 starting. 64 images.
Batch

In [13]:
# Reset the harness's batch counter so team.run() can be executed again.
# NOTE(review): exact effect is defined in dac_sdc, not in this notebook.
team.reset_batch_count()

In [14]:
# Disabled: re-allocate full-size DMA buffers after a run that ended on a
# short batch.
# p0.update_input_buffers(batch_size = dac_sdc.BATCH_SIZE)
# p0.update_output_buffers(batch_size = dac_sdc.BATCH_SIZE)

# Drop the notebook's reference to the partition. Cells that use p0
# (including my_callback) need the instantiation cell re-run afterwards.
del p0