In [16]:
import pynq
import time
import numpy as np
from pynq import MMIO
# Scratch calculation: element count of a 40x40x64 feature map divided by 32.
# NOTE(review): presumably a transfer/word count for that tensor -- confirm what
# the divisor 32 represents; the result is not used by any later cell shown here.
40*40*64/32

3200.0

In [21]:
import math
from typing import List
from dataclasses import dataclass

from multiprocessing import Process, Queue

import numpy as np

import pynq

@dataclass
class Partition:
    """Host-side handle for one accelerator partition loaded from a bitstream.

    On construction the overlay is created from ``bitstream_path``, the
    ``dma_0 .. dma_{n_dma-1}`` attributes are looked up on it, and a 0x1000-byte
    register window is mapped at ``baseaddr``. Buffers are then registered
    through ``add_input_buffer`` / ``add_output_buffer`` / ``add_fifo`` /
    ``allocate_weights``, each keyed by a caller-chosen index.

    Attributes:
        bitstream_path: path of the bitstream handed to ``pynq.Overlay``.
        n_dma: number of DMA engines, expected as ``dma_<i>`` on the overlay.
        weight_dma_index: position in ``self.dma`` of the engine whose send
            channel is used by ``reload_weights``.
        baseaddr: base address of the partition register file.
    """

    bitstream_path: str
    n_dma: int
    weight_dma_index: int = 4
    baseaddr: int = 0xA0070000

    # Register map as exercised by the methods below (byte offsets into the
    # MMIO window). Names are derived from how this class uses each location
    # -- TODO confirm against the hardware design.
    _REG_CTRL = 0x0          # control word
    _REG_WEIGHT_INDEX = 0x4  # weight set addressed during an update
    _REG_PORT_SIZE = 0x8     # + 4 * port index: transfer length for that port
    _CTRL_IDLE = 0x0
    _CTRL_UPDATE = 0x1
    _CTRL_RESET = 0x2
    _CTRL_START = 0x4

    def __post_init__(self):
        """Create the overlay, collect its DMA engines and initialise bookkeeping."""

        # create an overlay
        self.overlay = pynq.Overlay(self.bitstream_path)

        # get all DMA
        self.dma = [getattr(self.overlay, f"dma_{i}")
                    for i in range(self.n_dma)]

        # initialise partition register file; accessed through the ``pynq``
        # module so this cell does not depend on a ``from pynq import MMIO``
        # executed in some other cell
        self.regfile = pynq.MMIO(self.baseaddr, 0x1000)

        # buffers, keyed by port / weight / fifo index
        self.input_buffers = {}
        self.output_buffers = {}
        self.weight_buffers = {}
        self.fifo_buffers = {}

        # binary points (stored only; not interpreted by this class)
        self.input_bp = {}
        self.output_bp = {}

        # shapes
        self.input_shape = {}
        self.output_shape = {}

        # DMA indices
        self.input_dma = {}
        self.output_dma = {}
        self.fifo_dma = {}

        # output streams
        self.output_streams = {}

        # fifo geometry
        self.fifo_depth = {}
        self.fifo_burst = {}

    def add_input_buffer(self, index, dma_index, shape, bp=8):
        """Allocate a flat int16 input buffer and register it under ``index``.

        Args:
            index: key under which the buffer and its metadata are stored.
            dma_index: position in ``self.dma`` used by ``send_dma(index)``.
            shape: iterable of dimensions; the buffer holds their product.
            bp: binary point recorded for this input.
        """
        self.input_buffers[index] = pynq.allocate(
            shape=math.prod(shape), dtype=np.int16)
        self.input_dma[index] = dma_index
        self.input_shape[index] = shape
        self.input_bp[index] = bp

    def add_output_buffer(self, index, dma_index, shape, bp=8, streams=1):
        """Allocate a flat int16 output buffer, register it and re-run setup.

        Args:
            index: key under which the buffer and its metadata are stored; it
                also selects the size register written by ``setup_hardware``.
            dma_index: position in ``self.dma`` used by ``recv_dma(index)``.
            shape: iterable of dimensions; the buffer holds their product.
            bp: binary point recorded for this output.
            streams: divisor applied to the element count when programming
                the size register.
        """
        self.output_buffers[index] = pynq.allocate(
            shape=math.prod(shape), dtype=np.int16)
        self.output_dma[index] = dma_index
        self.output_shape[index] = shape
        self.output_bp[index] = bp
        self.output_streams[index] = streams

        # setup the hardware again with new output shapes
        self.setup_hardware()

    def add_fifo(self, index, dma_in, dma_out, depth, burst=64, streams=1):
        """Register a host-memory FIFO between two DMA engines and re-run setup.

        ``depth // burst`` int16 buffers of ``burst`` elements are allocated;
        ``start_fifo`` receives into them from ``dma_in`` and sends them on
        ``dma_out``.

        Args:
            index: fifo key; also selects the size register written by
                ``setup_hardware``.
            dma_in: position in ``self.dma`` whose receive channel fills the fifo.
            dma_out: position in ``self.dma`` whose send channel drains the fifo.
            depth: total number of elements moved per ``start_fifo`` call.
            burst: number of elements per DMA transfer.
            streams: accepted for symmetry with ``add_output_buffer``; unused.

        Raises:
            ValueError: if ``burst`` is not positive or ``depth < burst``,
                either of which would leave the fifo without any buffer.
        """
        # validate before touching any state so a bad call leaves no residue
        if burst <= 0 or depth < burst:
            raise ValueError(
                f"fifo {index}: need 0 < burst <= depth, "
                f"got depth={depth}, burst={burst}")
        if depth % burst:
            print(f"WARNING: fifo {index} depth {depth} is not a multiple of "
                  f"burst {burst}; the last {depth % burst} elements are not buffered")

        # save the fifo depth and burst size
        self.fifo_depth[index] = depth
        self.fifo_burst[index] = burst

        # add dma for fifo
        self.fifo_dma[index] = (dma_in, dma_out)

        # buffers for the fifo
        self.fifo_buffers[index] = [
            pynq.allocate(shape=(burst,), dtype=np.int16)
            for _ in range(depth // burst)]

        # setup hardware after adding fifo
        self.setup_hardware()

    def setup_hardware(self):
        """Program the per-port size registers and start the hardware."""
        # reset hardware, turn off updating, etc
        self.regfile.write(self._REG_CTRL, self._CTRL_IDLE)

        # initialise regular output ports
        for idx in self.output_buffers.keys():
            self.regfile.write(
                self._REG_PORT_SIZE + idx * 4,
                math.prod(self.output_shape[idx]) // self.output_streams[idx])

        # initialise fifo output ports; these share the register bank with the
        # regular outputs, so a fifo index must not equal an output index
        for idx in self.fifo_buffers.keys():
            self.regfile.write(
                self._REG_PORT_SIZE + idx * 4,
                self.fifo_burst[idx])  # TODO include streams

        # get the hardware started
        self.start_hardware()

    def reset_hardware(self):
        """Pulse the reset value on the control register, then return to idle."""
        self.regfile.write(self._REG_CTRL, self._CTRL_RESET)
        self.regfile.write(self._REG_CTRL, self._CTRL_IDLE)

    def start_hardware(self):
        """Write the start value to the control register."""
        self.regfile.write(self._REG_CTRL, self._CTRL_START)

    def stop_hardware(self):
        """Write the idle value to the control register."""
        self.regfile.write(self._REG_CTRL, self._CTRL_IDLE)

    def allocate_weights(self, index: int, weights_filepath: str):
        """Load hexadecimal weight words from a text file into a uint32 buffer.

        Args:
            index: key under which the buffer is stored in ``weight_buffers``.
            weights_filepath: text file with one hexadecimal word per line.
                Blank lines are ignored.

        Raises:
            ValueError: if a non-blank line is not valid hexadecimal.
        """
        # load the weights into a numpy array, skipping blank lines (e.g. a
        # trailing empty line) which int() would otherwise reject
        with open(weights_filepath, "r") as f:
            words = [int(line, base=16) for line in f if line.strip()]
        weights = np.array(words, dtype=np.uint32)

        # allocate a pynq buffer for the weights
        self.weight_buffers[index] = pynq.allocate(
            shape=weights.shape, dtype=np.uint32)

        # copy the values into the DMA-able buffer
        self.weight_buffers[index][:] = weights

    def reload_weights(self, index: int):
        """Send weight buffer ``index`` to the hardware and restart it.

        The control register is put into update mode for the duration of the
        transfer, then the hardware is reset, the weight index register is
        parked at 0xFFFF and the hardware is started again.
        """
        weight_channel = self.dma[self.weight_dma_index].sendchannel

        # set to update mode
        self.regfile.write(self._REG_CTRL, self._CTRL_UPDATE)

        # set the weight index
        self.regfile.write(self._REG_WEIGHT_INDEX, index)

        # transfer the weights and wait for the transfer to finish
        weight_channel.transfer(self.weight_buffers[index])
        weight_channel.wait()

        # end update mode
        self.regfile.write(self._REG_CTRL, self._CTRL_IDLE)

        self.reset_hardware()

        # set the weight index somewhere else
        self.regfile.write(self._REG_WEIGHT_INDEX, 0xFFFF)

        self.start_hardware()

    def download(self):
        """Download the bitstream and program the hardware registers again."""
        # download the bitstream
        self.overlay.download()

        # setup the hardware
        self.setup_hardware()

    def send_dma(self, index: int):
        """Start sending input buffer ``index`` on its DMA send channel (non-blocking)."""
        self.dma[self.input_dma[index]].sendchannel.transfer(self.input_buffers[index])

    def recv_dma(self, index: int):
        """Start receiving into output buffer ``index`` on its DMA receive channel (non-blocking)."""
        self.dma[self.output_dma[index]].recvchannel.transfer(self.output_buffers[index])

    def wait_dma(self, index: int, send: bool = True, recv: bool = True):
        """Wait for the receive and/or send channel of DMA engine ``index``.

        NOTE(review): ``index`` is used directly as a position in ``self.dma``,
        whereas ``send_dma`` / ``recv_dma`` translate a port index through
        ``input_dma`` / ``output_dma``. The two coincide only when ports are
        registered with ``dma_index == index``.

        A failing ``wait()`` is reported as a warning and otherwise ignored.
        Only ``Exception`` subclasses are caught, so Ctrl-C still interrupts.
        """
        # wait to receive
        if recv:
            try:
                self.dma[index].recvchannel.wait()
            except Exception:
                print("WARNING: recv channel finished")

        # wait to send
        if send:
            try:
                self.dma[index].sendchannel.wait()
            except Exception:
                print("WARNING: send channel finished")

    def start_fifo(self, index):
        """Move ``fifo_depth[index]`` elements from one DMA to another via host memory.

        Two child processes are started and joined: one receives burst after
        burst from the ``dma_in`` engine into the fifo buffers, the other sends
        each buffer on the ``dma_out`` engine once it has been filled. The
        number of filled buffers is passed between them through a queue.

        The workers are nested functions, so this relies on a process start
        method that does not pickle the target (i.e. ``fork``).
        """
        # number of bursts to move
        n_bursts = self.fifo_depth[index] // self.fifo_burst[index]

        # channels and buffers used by the two workers
        dma_from_idx, dma_to_idx = self.fifo_dma[index]
        recv_channel = self.dma[dma_from_idx].recvchannel
        send_channel = self.dma[dma_to_idx].sendchannel
        bursts = self.fifo_buffers[index]

        def run_fifo_in(q):
            # publish the running count of filled buffers after every burst
            filled = 0
            q.put(filled)
            recv_channel.transfer(bursts[0])
            while filled < n_bursts:
                recv_channel.wait()
                # queue the next receive before announcing the finished one
                if filled < n_bursts - 1:
                    recv_channel.transfer(bursts[filled + 1])
                filled += 1
                q.put(filled)

        def run_fifo_out(q):
            drained = 0
            filled = 0
            while drained < n_bursts:
                if drained >= filled:
                    # nothing ready yet: block until the producer reports
                    # progress instead of spinning on a non-blocking get
                    filled = q.get()
                    continue
                send_channel.transfer(bursts[drained])
                send_channel.wait()
                drained += 1

        # create processes
        q = Queue()
        proc_in = Process(target=run_fifo_in, args=(q,))
        proc_out = Process(target=run_fifo_out, args=(q,))

        # start in parallel
        proc_in.start()
        proc_out.start()

        # wait for both to finish
        proc_out.join()
        proc_in.join()

In [27]:
# Tensor shapes on the partition's input ports, indexed by input port number.
# NOTE(review): axis order looks like [height, width, channels] -- confirm.
input_sizes = [
    [320, 320, 3],
    [20, 20, 128],
    [40, 40, 64],
    [10, 10, 128],
]

# Tensor shapes on the partition's output ports, indexed by output port number.
output_sizes = [
    [40, 40, 64],
    [20, 20, 128],
    [10, 10, 128],
    [40, 40, 256],
    [20, 20, 256],
    [10, 10, 256],
]

# initialise partition with six DMA engines (dma_0 .. dma_5)
p = Partition("hardware/yolov5n.bit", 6)

# Host-fed input: only input port 0 gets a host buffer. Input ports 1-3 are
# driven by the FIFOs created below instead of by host buffers.
p.add_input_buffer(0, 0, list(input_sizes[0]), bp=13)

# Host-read outputs: ports 3-5, each on the DMA engine with the same number.
# Output ports 0-2 are drained by the FIFOs below instead of by host buffers.
for port in (3, 4, 5):
    p.add_output_buffer(port, port, list(output_sizes[port]), bp=11, streams=2)

# FIFOs looping intermediate tensors back into the partition:
# (fifo index, DMA received from, DMA sent to, burst size in elements).
# The depth of each FIFO is the full tensor on the output port it drains.
fifo_config = [
    (0, 0, 2, 51200),
    (1, 1, 1, 25600),
    (2, 2, 3, 512),
]
for fifo_index, dma_in, dma_out, burst in fifo_config:
    p.add_fifo(fifo_index, dma_in, dma_out,
               math.prod(output_sizes[dma_in]), burst=burst)

# No explicit p.download() here: the bitstream is loaded when the overlay is
# constructed (pynq.Overlay downloads by default). Just reset and restart.
p.reset_hardware()
p.start_hardware()

In [None]:
# load all the weights

In [28]:
# dma | write | read
# 0   | in 0  | out 0
# 1   | in 1  | out 1
# 2   | in 2  | out 2
# 3   | in 3  | out 3
# 4   | weight| out 4
# 5   |       | out 5

In [29]:
# Time one pass through the partition, in milliseconds.
# Only input port 0 and output ports 3-5 go through host buffers; the other
# ports are serviced by the three FIFO worker processes.

start_time = time.perf_counter()

# queue the host-side transfers (non-blocking)
p.send_dma(0)
for out_port in (3, 4, 5):
    p.recv_dma(out_port)

# one worker process per FIFO, all created before any is started
fifo_0, fifo_1, fifo_2 = (
    Process(target=p.start_fifo, args=(fifo_index,)) for fifo_index in range(3)
)
fifo_workers = (fifo_0, fifo_1, fifo_2)

for worker in fifo_workers:
    worker.start()

for worker in fifo_workers:
    worker.join()

# block until the host-side transfers have completed
p.wait_dma(0, recv=False)
for out_port in (3, 4, 5):
    p.wait_dma(out_port, send=False)

pred_time = (time.perf_counter() - start_time)*1000
print(pred_time)


267.33151997905225
