### Load overlay

In [None]:
from pynq import Overlay
from pynq import MMIO
from pynq import allocate
import numpy as np
import asyncio
import cv2
import pandas as pd
# Load the accelerator overlay from the bitstream next to this notebook.
# `ol` is used below to look up IP addresses (ol.ip_dict) and interrupt pins.
ol = Overlay("./design_1.bit")

### Read input image

In [None]:
# Load the input image: the text file holds one integer pixel per non-blank line.
pixels = []
with open("./input/image.txt", "r") as f:
    for raw_line in f:
        text = raw_line.strip()
        if text != "":
            pixels.append(int(text))

# `img` (256x256, unpadded) is kept for the image-recovery step at the end.
img = np.array(pixels).reshape(256, 256)

# Add one row of value 128 above and below, then narrow to bytes
# for the first layer's input.
input_layer = np.pad(
    img, ((1, 1), (0, 0)), mode='constant', constant_values=128
).astype(np.uint8)
print(input_layer.shape)

### Read weights

In [None]:
# read weight
def _load_hex_words(path):
    """Return the unsigned integer value of every non-blank line of *path*.

    Each line is a big-endian hex string (e.g. 8 hex digits -> one uint32).
    Blank lines are skipped.
    """
    words = []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            words.append(int.from_bytes(bytes.fromhex(line), byteorder='big', signed=False))
    return words


# kernel0: 64 filters of 3x3, stored as int8.
# The file values are parsed as unsigned, so anything above 127 does not fit
# an int8.  Passing such Python ints straight to np.array(..., dtype=np.int8)
# raises OverflowError on NumPy >= 2 (and was deprecated in 1.24).  Going
# through uint32 and narrowing with astype keeps the low 8 bits
# (two's-complement wrap), i.e. the result older NumPy produced implicitly.
kernel0 = np.array(_load_hex_words("./weights/layer0_weight.hex"), dtype=np.uint32)
kernel0 = kernel0.astype(np.int8).reshape(64, 3, 3)

# kernel1..kernel3: one uint32 word per entry, shaped (64, 16, 9).
kernel1 = np.array(_load_hex_words("./weights/layer1_weight.hex"), dtype=np.uint32)
kernel1 = kernel1.reshape(64, 16, 9)

kernel2 = np.array(_load_hex_words("./weights/layer2_weight.hex"), dtype=np.uint32)
kernel2 = kernel2.reshape(64, 16, 9)

kernel3 = np.array(_load_hex_words("./weights/layer3_weight.hex"), dtype=np.uint32)
kernel3 = kernel3.reshape(64, 16, 9)

# kernel4: single output filter, shaped (16, 9).
kernel4 = np.array(_load_hex_words("./weights/layer4_weight.hex"), dtype=np.uint32)
kernel4 = kernel4.reshape(16, 9)

### Read bias

In [None]:
# Load the bias table from CSV and transpose it so that the first index
# selects the row used by a layer (the layer loops read bias[0], bias[1], bias[2])
# and the second index selects the output channel.
BIAS_BASE = 1
csv_path = "./bias/bias_int32.csv"
df = pd.read_csv(csv_path)
np_array = np.transpose(df.to_numpy())
bias = np_array.astype(np.int32)

### Timer

In [None]:
# Start of the timed region; the matching read is in the cell after layer 4,
# so the measurement covers MMIO/buffer setup as well as all five layers.
import time
start = time.perf_counter()

### Setting & Config

In [None]:
# Physical base addresses of the two IPs driven directly through MMIO.
ip_table = ol.ip_dict
gpio0_addr = ip_table['axi_gpio_0']['phys_addr']
cdma_addr = ip_table['axi_cdma_0']['phys_addr']

# The accelerator configuration word sits at offset 0 of the GPIO block.
config_addr = gpio0_addr

# MMIO windows: config register (GPIO) and the CDMA register file.
config_in = MMIO(config_addr, 0xFFFF)
cdma = MMIO(cdma_addr, 0x7FFFF)

In [None]:
# BRAM memory map: four regions packed back to back starting at bram_addr,
# in the order ifmap | kernel | psum | bias.  Sizes are in bytes
# (word count * 4 bytes per 32-bit word).
_WORD_BYTES = 4
bram_addr = 0xC0000000

ifmap_bram_usage = 8192 * _WORD_BYTES
kernel_bram_usage = 288 * _WORD_BYTES
psum_bram_usage = 98304 * _WORD_BYTES
bias_bram_usage = 16 * _WORD_BYTES

# Each region starts where the previous one ends.
ifmap_bram_addr = bram_addr
kernel_bram_addr = ifmap_bram_addr + ifmap_bram_usage
psum_bram_addr = kernel_bram_addr + kernel_bram_usage
bias_bram_addr = psum_bram_addr + psum_bram_usage

for _region, _base, _size in (
    ("ifmap", ifmap_bram_addr, ifmap_bram_usage),
    ("kernel", kernel_bram_addr, kernel_bram_usage),
    ("psum", psum_bram_addr, psum_bram_usage),
    ("bias", bias_bram_addr, bias_bram_usage),
):
    print(f"bram_{_region}_addr", hex(_base), f"  {_region}_bram_usage", _size)

In [None]:
# DDR-side staging buffers whose physical addresses are handed to the CDMA.
# ifmap/kernel/bias hold the same word counts as the BRAM regions sized in the
# previous cell (8192, 288 and 16 words).
ifmap_buffer = allocate(shape=(8192, ), dtype=np.uint32)
kernel_buffer = allocate(shape=(288,), dtype=np.uint32)
# 16 rows of 65536 (= 256*256) words; later reshaped to (16, 256, 256).
ofmap_buffer = allocate(shape=(16,65536), dtype=np.uint32)
bias_buffer = allocate(shape=(16,), dtype=np.int32)

In [None]:
# Byte offsets of the CDMA registers used in this notebook, relative to the
# `cdma` MMIO window.  Descriptions follow the register names; confirm against
# the AXI CDMA product guide (PG034).
CDMA_CONTROL = 0x00  # control register (reset, interrupt enable)
CDMA_STATUS  = 0x04  # status register (interrupt flags are cleared by writing here)
CDMA_SRC     = 0x18  # source address
CDMA_DST     = 0x20  # destination address
CDMA_BTT     = 0x28  # bytes to transfer; always written last in each transfer below

## Start inference

### Set CDMA 

In [None]:
# RESET CDMA: write bit 2 (0x04) of the control register.
cdma.write(CDMA_CONTROL, 0x04) # CDMA rst
# INTERRUPT ENABLE: write bit 12 (0x1000) of the control register; the layer
# loops below wait on ol.axi_cdma_0.cdma_introut after every transfer.
# NOTE(review): this write is issued immediately after the reset without
# polling for the reset to complete — confirm the core accepts it.
cdma.write(CDMA_CONTROL, 0x1000)

### Layer 0

In [None]:
# layer 0
E = 256
F = 256
C = 1
M = 64
e = 6
qr = 1
pt = 16
pass_count = 0


for e_tile in range(0,E+2,e): # e=6
    ifmap_this_pass = input_layer[e_tile:e_tile+8,:]
#     print(ifmap_this_pass)
    for qr_tile in range(0,C,qr): # qr=1
        for pt_tile in range(0,M,pt): # pt=16
            kernel_this_pass = kernel0[pt_tile:pt_tile+16,:,:]
            # place to buffer
            ifmap_buffer[0:ifmap_this_pass.shape[0]*ifmap_this_pass.shape[1]] = ifmap_this_pass.reshape(-1)
            kernel_buffer[0:kernel_this_pass.shape[0]*3*3] = kernel_this_pass.reshape(-1)
            # set layer enable
            config_in.write(0x00, 0b00010) # 4'b0010 layer0 layer_enable=1
            # transfer data from ddr to bram

            cdma.write(CDMA_SRC, ifmap_buffer.physical_address) # src
            cdma.write(CDMA_DST, ifmap_bram_addr) # dist
            cdma.write(CDMA_BTT, ifmap_bram_usage) # ? bytes

            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            cdma.write(CDMA_SRC, kernel_buffer.physical_address)
            cdma.write(CDMA_DST, kernel_bram_addr)
            cdma.write(CDMA_BTT, kernel_bram_usage)

            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            # reset layer enable
            config_in.write(0x00, 0) # 4'b0000 layer0 layer_enable=0
            # set pass enable
            config_in.write(0x00, 1) # 4'b0001 pass_enable=1 and reset layer enable
#             time.sleep(20*10**-9)
            config_in.write(0x00, 0) # 4'b0000 pass_enable=0

            await ol.Controller_u.pass_done.wait()
            # transfer data from bram to ddr

            row_to_transfer = 0
            if(e_tile==252):
                row_to_transfer = 4
            else:
                row_to_transfer = 6

            cdma.write(CDMA_SRC, psum_bram_addr + row_to_transfer*F*(pt_tile)*4)
            cdma.write(CDMA_DST, ofmap_buffer[(pt_tile//4)].physical_address + e_tile*F*4)
            cdma.write(CDMA_BTT, row_to_transfer*F*4)

            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            cdma.write(CDMA_SRC, psum_bram_addr + row_to_transfer*F*(pt_tile+1)*4)
            cdma.write(CDMA_DST, ofmap_buffer[((pt_tile//4)+1)].physical_address + e_tile*F*4)
            cdma.write(CDMA_BTT, row_to_transfer*F*4)

            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            cdma.write(CDMA_SRC, psum_bram_addr + row_to_transfer*F*(pt_tile+2)*4)
            cdma.write(CDMA_DST, ofmap_buffer[((pt_tile//4)+2)].physical_address + e_tile*F*4)
            cdma.write(CDMA_BTT, row_to_transfer*F*4)

            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            cdma.write(CDMA_SRC, psum_bram_addr + row_to_transfer*F*(pt_tile+3)*4)
            cdma.write(CDMA_DST, ofmap_buffer[((pt_tile//4)+3)].physical_address + e_tile*F*4)
            cdma.write(CDMA_BTT, row_to_transfer*F*4)

            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            # count pass
            pass_count = pass_count + 1
#             print(f"pass {pass_count-1}")

In [None]:
# Turn the layer-0 output into the next layer's input: snapshot the DMA
# buffer, view it as 16 planes of 256x256 words, and add one row of
# 0x80808080 above and below every plane.
# NOTE(review): if ofmap_buffer is a cacheable allocation, an invalidate()
# may be needed before this copy — confirm for the target board.
pad_word = 0x80808080  # == 2155905152
layer_output = ofmap_buffer.copy().reshape(16, 256, 256)
input_layer = np.pad(
    layer_output,
    ((0, 0), (1, 1), (0, 0)),
    mode='constant',
    constant_values=pad_word,
)
print(input_layer.shape)

### Layer 1

In [None]:
# layer 1
E = 256
F = 256
C = 64
M = 64
e = 3
qr = 16
pt = 8
pass_count = 0

for e_tile in range(0,E,e): # e=3
    if(e_tile == 255):
        OPSUM_ROW_THIS_PASS =  1
    else:
        OPSUM_ROW_THIS_PASS =  3

    for qr_tile in range(0,C,qr): # qr=16

        ifmap_this_pass = input_layer[(qr_tile//4):(qr_tile//4)+4,e_tile:e_tile+OPSUM_ROW_THIS_PASS+2,:]
        ifmap_buffer[0:ifmap_this_pass.shape[0]*ifmap_this_pass.shape[1]*ifmap_this_pass.shape[2]] = ifmap_this_pass.reshape(-1)
#         if(e_tile == 0):
#             print("ifmap_this_pass",ifmap_buffer)
        # put_bram_ifmap
        cdma.write(CDMA_SRC, ifmap_buffer.physical_address) # src
        cdma.write(CDMA_DST, ifmap_bram_addr) # dist
        cdma.write(CDMA_BTT, ifmap_bram_usage) # ? bytes
        await ol.axi_cdma_0.cdma_introut.wait()
        await asyncio.sleep(20e-9)
        cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

        for pt_tile in range(0,M,pt): # pt=8
            if(qr_tile == 0):
                bias_this_pass = bias[0, pt_tile:pt_tile+8]
                bias_buffer[:bias_this_pass.shape[0]] = bias_this_pass
#                 if(e_tile == 0):
#                     print("bias_buffer",bias_buffer)
                # put_bram_bias
                cdma.write(CDMA_SRC, bias_buffer.physical_address) # src
                cdma.write(CDMA_DST, bias_bram_addr) # dist
                cdma.write(CDMA_BTT, bias_bram_usage) # ? bytes
                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            kernel_this_pass = kernel1[pt_tile:pt_tile+8,(qr_tile//4):(qr_tile//4)+4,:]
            kernel_buffer[0:kernel_this_pass.shape[0]*kernel_this_pass.shape[1]*3*3] = kernel_this_pass.reshape(-1)
#             if(e_tile == 0):
#                     print("kernel_this_pass",kernel_buffer)

            # set layer enable
            config_in.write(0x00, 6) # 5'b00110 layer1 layer_enable=1

            # put_bram_weight
            cdma.write(CDMA_SRC, kernel_buffer.physical_address) # src
            cdma.write(CDMA_DST, kernel_bram_addr) # dist
            cdma.write(CDMA_BTT, kernel_bram_usage) # ? bytes
            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            # reset layer enable
            config_in.write(0x00, 4) # 5'b00100 layer1 layer_enable=0

            # set pass enable
            config_in.write(0x00, 5) # 5'b00101 pass_enable=1
            await asyncio.sleep(20e-9)
            config_in.write(0x00, 4) # 5'b00100 pass_enable=0

            await ol.Controller_u.pass_done.wait()

            # transfer data from bram to ddr
            if(qr_tile==48):
                row_to_transfer = 0
                if(e_tile==255):
                    row_to_transfer = 1
                else:
                    row_to_transfer = 3

                cdma.write(0x18, psum_bram_addr + row_to_transfer*F*(pt_tile)*4)
                cdma.write(0x20, ofmap_buffer[(pt_tile//4)].physical_address + e_tile*F*4)
                cdma.write(0x28, row_to_transfer*F*4)

                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

                cdma.write(0x18, psum_bram_addr + row_to_transfer*F*(pt_tile+1)*4)
                cdma.write(0x20, ofmap_buffer[((pt_tile//4)+1)].physical_address + e_tile*F*4)
                cdma.write(0x28, row_to_transfer*F*4)

                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt


            # count pass
            pass_count = pass_count + 1
#             print(f"pass {pass_count-1}")

In [None]:
# Turn the layer-1 output into the next layer's input: snapshot the DMA
# buffer, view it as 16 planes of 256x256 words, and add one row of
# 0x80808080 above and below every plane.
# NOTE(review): if ofmap_buffer is a cacheable allocation, an invalidate()
# may be needed before this copy — confirm for the target board.
pad_word = 0x80808080  # == 2155905152
layer_output = ofmap_buffer.copy().reshape(16, 256, 256)
input_layer = np.pad(
    layer_output,
    ((0, 0), (1, 1), (0, 0)),
    mode='constant',
    constant_values=pad_word,
)
print(input_layer.shape)

### Layer 2

In [None]:
# layer 2
E = 256
F = 256
C = 64
M = 64
e = 3
qr = 16
pt = 8
pass_count = 0

for e_tile in range(0,E,e): # e=3
    if(e_tile == 255):
        OPSUM_ROW_THIS_PASS =  1
    else:
        OPSUM_ROW_THIS_PASS =  3

    for qr_tile in range(0,C,qr): # qr=16

        ifmap_this_pass = input_layer[(qr_tile//4):(qr_tile//4)+4,e_tile:e_tile+OPSUM_ROW_THIS_PASS+2,:]
        ifmap_buffer[0:ifmap_this_pass.shape[0]*ifmap_this_pass.shape[1]*ifmap_this_pass.shape[2]] = ifmap_this_pass.reshape(-1)
#         if(e_tile == 0):
#             print("ifmap_this_pass",ifmap_buffer)
        # put_bram_ifmap
        cdma.write(CDMA_SRC, ifmap_buffer.physical_address) # src
        cdma.write(CDMA_DST, ifmap_bram_addr) # dist
        cdma.write(CDMA_BTT, ifmap_bram_usage) # ? bytes
        await ol.axi_cdma_0.cdma_introut.wait()
        await asyncio.sleep(20e-9)
        cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

        for pt_tile in range(0,M,pt): # pt=8
            if(qr_tile == 0):
                bias_this_pass = bias[1, pt_tile:pt_tile+8]
                bias_buffer[:bias_this_pass.shape[0]] = bias_this_pass
#                 if(e_tile == 0):
#                     print("bias_buffer",bias_buffer)
                # put_bram_bias
                cdma.write(CDMA_SRC, bias_buffer.physical_address) # src
                cdma.write(CDMA_DST, bias_bram_addr) # dist
                cdma.write(CDMA_BTT, bias_bram_usage) # ? bytes
                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            kernel_this_pass = kernel2[pt_tile:pt_tile+8,(qr_tile//4):(qr_tile//4)+4,:]
            kernel_buffer[0:kernel_this_pass.shape[0]*kernel_this_pass.shape[1]*3*3] = kernel_this_pass.reshape(-1)
#             if(e_tile == 0):
#                     print("kernel_this_pass",kernel_buffer)

            # set layer enable
            config_in.write(0x00, 10) # 5'b01010 layer2 layer_enable=1

            # put_bram_weight
            cdma.write(CDMA_SRC, kernel_buffer.physical_address) # src
            cdma.write(CDMA_DST, kernel_bram_addr) # dist
            cdma.write(CDMA_BTT, kernel_bram_usage) # ? bytes
            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            # reset layer enable
            config_in.write(0x00, 8) # 5'b01000 layer3 layer_enable=0

            # set pass enable
            config_in.write(0x00, 9) # 5'b01001 pass_enable=1
            await asyncio.sleep(20e-9)
            config_in.write(0x00, 8) # 5'b01000 pass_enable=0

            await ol.Controller_u.pass_done.wait()

            # transfer data from bram to ddr
            if(qr_tile==48):
                row_to_transfer = 0
                if(e_tile==255):
                    row_to_transfer = 1
                else:
                    row_to_transfer = 3

                cdma.write(0x18, psum_bram_addr + row_to_transfer*F*(pt_tile)*4)
                cdma.write(0x20, ofmap_buffer[(pt_tile//4)].physical_address + e_tile*F*4)
                cdma.write(0x28, row_to_transfer*F*4)

                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

                cdma.write(0x18, psum_bram_addr + row_to_transfer*F*(pt_tile+1)*4)
                cdma.write(0x20, ofmap_buffer[((pt_tile//4)+1)].physical_address + e_tile*F*4)
                cdma.write(0x28, row_to_transfer*F*4)

                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt


            # count pass
            pass_count = pass_count + 1
#             print(f"pass {pass_count-1}")

In [None]:
# Turn the layer-2 output into the next layer's input: snapshot the DMA
# buffer, view it as 16 planes of 256x256 words, and add one row of
# 0x80808080 above and below every plane.
# NOTE(review): if ofmap_buffer is a cacheable allocation, an invalidate()
# may be needed before this copy — confirm for the target board.
pad_word = 0x80808080  # == 2155905152
layer_output = ofmap_buffer.copy().reshape(16, 256, 256)
input_layer = np.pad(
    layer_output,
    ((0, 0), (1, 1), (0, 0)),
    mode='constant',
    constant_values=pad_word,
)
print(input_layer.shape)

### Layer 3

In [None]:
# layer 3
E = 256
F = 256
C = 64
M = 64
e = 3
qr = 16
pt = 8
pass_count = 0

for e_tile in range(0,E,e): # e=3
    if(e_tile == 255):
        OPSUM_ROW_THIS_PASS =  1
    else:
        OPSUM_ROW_THIS_PASS =  3

    for qr_tile in range(0,C,qr): # qr=16

        ifmap_this_pass = input_layer[(qr_tile//4):(qr_tile//4)+4,e_tile:e_tile+OPSUM_ROW_THIS_PASS+2,:]
        ifmap_buffer[0:ifmap_this_pass.shape[0]*ifmap_this_pass.shape[1]*ifmap_this_pass.shape[2]] = ifmap_this_pass.reshape(-1)
#         if(e_tile == 0):
#             print("ifmap_this_pass",ifmap_buffer)
        # put_bram_ifmap
        cdma.write(CDMA_SRC, ifmap_buffer.physical_address) # src
        cdma.write(CDMA_DST, ifmap_bram_addr) # dist
        cdma.write(CDMA_BTT, ifmap_bram_usage) # ? bytes
        await ol.axi_cdma_0.cdma_introut.wait()
        await asyncio.sleep(20e-9)
        cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

        for pt_tile in range(0,M,pt): # pt=8

            if(qr_tile == 0):
                bias_this_pass = bias[2, pt_tile:pt_tile+8]
                bias_buffer[:bias_this_pass.shape[0]] = bias_this_pass
#                 if(e_tile == 0):
#                     print("bias_buffer",bias_buffer)
                # put_bram_bias
                cdma.write(CDMA_SRC, bias_buffer.physical_address) # src
                cdma.write(CDMA_DST, bias_bram_addr) # dist
                cdma.write(CDMA_BTT, bias_bram_usage) # ? bytes
                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            kernel_this_pass = kernel3[pt_tile:pt_tile+8,(qr_tile//4):(qr_tile//4)+4,:]
            kernel_buffer[0:kernel_this_pass.shape[0]*kernel_this_pass.shape[1]*3*3] = kernel_this_pass.reshape(-1)
#             if(e_tile == 0):
#                     print("kernel_this_pass",kernel_buffer)

            # set layer enable
            config_in.write(0x00, 14) # 5'b01110 layer3 layer_enable=1

            # put_bram_weight
            cdma.write(CDMA_SRC, kernel_buffer.physical_address) # src
            cdma.write(CDMA_DST, kernel_bram_addr) # dist
            cdma.write(CDMA_BTT, kernel_bram_usage) # ? bytes
            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            # reset layer enable
            config_in.write(0x00, 12) # 5'b01100 layer3 layer_enable=0

            # set pass enable
            config_in.write(0x00, 13) # 5'b01101 pass_enable=1
            await asyncio.sleep(20e-9)
            config_in.write(0x00, 12) # 5'b01100 pass_enable=0

            await ol.Controller_u.pass_done.wait()

            # transfer data from bram to ddr
            if(qr_tile==48):
                row_to_transfer = 0
                if(e_tile==255):
                    row_to_transfer = 1
                else:
                    row_to_transfer = 3

                cdma.write(0x18, psum_bram_addr + row_to_transfer*F*(pt_tile)*4)
                cdma.write(0x20, ofmap_buffer[(pt_tile//4)].physical_address + e_tile*F*4)
                cdma.write(0x28, row_to_transfer*F*4)

                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

                cdma.write(0x18, psum_bram_addr + row_to_transfer*F*(pt_tile+1)*4)
                cdma.write(0x20, ofmap_buffer[((pt_tile//4)+1)].physical_address + e_tile*F*4)
                cdma.write(0x28, row_to_transfer*F*4)

                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt


            # count pass
            pass_count = pass_count + 1
#             print(f"pass {pass_count-1}")

In [None]:
# Turn the layer-3 output into the next layer's input: snapshot the DMA
# buffer, view it as 16 planes of 256x256 words, and add one row of
# 0x80808080 above and below every plane.
# NOTE(review): if ofmap_buffer is a cacheable allocation, an invalidate()
# may be needed before this copy — confirm for the target board.
pad_word = 0x80808080  # == 2155905152
layer_output = ofmap_buffer.copy().reshape(16, 256, 256)
input_layer = np.pad(
    layer_output,
    ((0, 0), (1, 1), (0, 0)),
    mode='constant',
    constant_values=pad_word,
)
print(input_layer.shape)

### Layer 4

In [None]:
# layer 4
E = 256
F = 256
C = 64
M = 1
e = 6
qr = 16
pt = 1
pass_count = 0

for e_tile in range(0,E,e): # e=6
    if(e_tile == 252):
        OPSUM_ROW_THIS_PASS =  4
    else:
        OPSUM_ROW_THIS_PASS =  6
    for qr_tile in range(0,C,qr): # qr=16
        ifmap_this_pass = input_layer[(qr_tile//4):(qr_tile//4)+4,e_tile:e_tile+OPSUM_ROW_THIS_PASS+2,:]
        ifmap_buffer[0:ifmap_this_pass.shape[0]*ifmap_this_pass.shape[1]*ifmap_this_pass.shape[2]] = ifmap_this_pass.reshape(-1)

        cdma.write(CDMA_SRC, ifmap_buffer.physical_address) # src
        cdma.write(CDMA_DST, ifmap_bram_addr) # dist
        cdma.write(CDMA_BTT, ifmap_bram_usage) # ? bytes
        await ol.axi_cdma_0.cdma_introut.wait()
        await asyncio.sleep(20e-9)
        cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

        for pt_tile in range(0,M,pt): # pt=1

            kernel_this_pass = kernel4[(qr_tile//4):(qr_tile//4)+4,:]
            kernel_buffer[0:kernel_this_pass.shape[0]*3*3] = kernel_this_pass.reshape(-1)

            # set layer enable
            config_in.write(0x00, 18) # 5'b10010 layer4 layer_enable=1

            # put_bram_weight
            cdma.write(CDMA_SRC, kernel_buffer.physical_address) # src
            cdma.write(CDMA_DST, kernel_bram_addr) # dist
            cdma.write(CDMA_BTT, kernel_bram_usage) # ? bytes
            await ol.axi_cdma_0.cdma_introut.wait()
            await asyncio.sleep(20e-9)
            cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt

            # reset layer enable
            config_in.write(0x00, 16) # 5'b10000 layer4 layer_enable=0

            # set pass enable
            config_in.write(0x00, 17) # 5'b10001 pass_enable=1
            await asyncio.sleep(20e-9)
            config_in.write(0x00, 16) # 5'b10000 pass_enable=0

            await ol.Controller_u.pass_done.wait()

            # transfer data from bram to ddr
            if(qr_tile==48):
                row_to_transfer = 0
                if(e_tile==252):
                    row_to_transfer = 4
                else:
                    row_to_transfer = 6

                cdma.write(0x18, psum_bram_addr)
                cdma.write(0x20, ofmap_buffer[0].physical_address + e_tile*F*4)
                cdma.write(0x28, row_to_transfer*F*4)

                await ol.axi_cdma_0.cdma_introut.wait()
                await asyncio.sleep(20e-9)
                cdma.write(CDMA_STATUS, (1 << 12) | (1 << 4) | (1 << 5) | (1 << 6)) # clear interrupt


            # count pass
            pass_count = pass_count + 1
#             print(f"pass {pass_count-1}")

In [None]:
# Stop the timer started in the "Timer" cell and report the elapsed seconds.
end = time.perf_counter()
print(f"execution time: {end - start:.6f} s")

### Image recovery

In [None]:
import os

# Dequantise the original input image: (value - zero point) * scale.
zp = 128
scale = 0.0078125
print(img)
img = (img.astype(np.float32) - zp) * scale

# Dequantise the network output with its own scale.
zp = 128
scale = 0.00390625

# Only the low byte of each 32-bit word in plane 0 is used.
denoise = ofmap_buffer[0] % 256
denoise = denoise.reshape(256,256)
denoise = (denoise.astype(np.float32) - zp) * scale
denoise = np.array(denoise)

# The output image is input minus network output, clipped to [0, 1] and
# rescaled to 8-bit.
out = np.clip(img - denoise, 0.0, 1.0)
out = (out*255).round().astype(np.uint8)

# cv2.imwrite reports failure through its return value instead of raising,
# so a missing ./result directory used to drop the result silently.
os.makedirs("./result", exist_ok=True)
if not cv2.imwrite("./result/output.png",out):
    raise IOError("cv2.imwrite failed to write ./result/output.png")