In [1]:
import pynq
import time
import numpy as np

In [2]:
# Instantiate the PYNQ overlay for the tiled CIM convolution bitstream.
overlay = pynq.Overlay("tiled_cim_conv_64_11.bit")

In [3]:
# Interactive IPython help on the overlay object (development aid only).
overlay?

[0;31mType:[0m            Overlay
[0;31mString form:[0m     <pynq.overlay.Overlay object at 0xb3dc4100>
[0;31mFile:[0m            /usr/local/share/pynq-venv/lib/python3.8/site-packages/pynq/overlay.py
[0;31mDocstring:[0m      
Default documentation for overlay tiled_cim_conv_64_11.bit. The following
attributes are available on this overlay:

IP Blocks
----------
tiled_cim_conv_0     : pynq.overlay.DefaultIP
processing_system7_0 : pynq.overlay.DefaultIP

Hierarchies
-----------
None

Interrupts
----------
None

GPIO Outputs
------------
None

Memories
------------
PSDDR                : Memory
[0;31mClass docstring:[0m
This class keeps track of a single bitstream's state and contents.

The overlay class holds the state of the bitstream and enables run-time
protection of bindings.

Our definition of overlay is: "post-bitstream configurable design".
Hence, this class must expose configurability through content discovery
and runtime protection.

The overlay class exposes the IP a

In [4]:
# Handle to the accelerator IP block exposed by the overlay (device under test).
dut = overlay.tiled_cim_conv_0

In [5]:
def to_fixed_point(dst, src, *, width=None, iwidth, signed=True):
    """Encode real values as fixed-point words and store them in ``dst``.

    The format has ``width`` total bits, of which ``iwidth`` are integer bits
    (including the sign bit when ``signed``) and ``width - iwidth`` are
    fractional bits. Negative values are stored in two's complement.
    Out-of-range values saturate to the nearest representable code.

    Args:
        dst: Destination array, written in place. Its shape defines the shape
            ``src`` is reshaped to, and its dtype the stored word type.
        src: Array-like of real values; it is never modified.
        width: Total word width in bits (1..64). Defaults to the bit width of
            ``dst.dtype``.
        iwidth: Number of integer bits.
        signed: Whether the format is signed (two's complement).

    Raises:
        ValueError: If ``width`` is outside 1..64.
    """
    if width is None:
        width = dst.dtype.itemsize * 8
    if not 1 <= width <= 64:
        raise ValueError(f"width must be between 1 and 64 bits, got {width}")

    fwidth = width - iwidth
    magnitude_bits = width - 1 if signed else width
    word_mask = (1 << width) - 1
    max_code = (1 << magnitude_bits) - 1
    min_code = -(1 << magnitude_bits) if signed else 0

    # Work in float64 regardless of the input dtype so that integer inputs are
    # not truncated, and quantise BEFORE saturating/wrapping: rounding after
    # adding the two's-complement offset can produce the out-of-range code
    # 2**width for tiny negative inputs.
    values = np.asarray(src, dtype=np.float64).reshape(dst.shape)
    codes = np.around(values * (2.0**fwidth))

    # Saturation limits are compared as exact powers of two; max_code itself
    # is not representable in float64 once width exceeds 53 bits.
    too_high = codes >= 2.0**magnitude_bits
    too_low = codes < float(min_code)
    in_range = np.where(too_high | too_low, 0.0, codes)

    if signed:
        # Reinterpreting int64 as uint64 yields the two's-complement pattern.
        raw = in_range.astype(np.int64).view(np.uint64)
    else:
        raw = in_range.astype(np.uint64)
    raw[too_high] = np.uint64(max_code)
    raw[too_low] = np.uint64(min_code & word_mask)
    # Drop sign-extension bits above the requested word width.
    raw &= np.uint64(word_mask)

    dst[:] = raw.astype(dst.dtype)


def from_fixed_point(src, *, width=None, iwidth, signed=True):
    """Decode fixed-point words into floating-point values.

    Args:
        src: Array of raw fixed-point words.
        width: Total word width in bits. Defaults to the bit width of
            ``src.dtype``.
        iwidth: Number of integer bits (including the sign bit when
            ``signed``); the remaining ``width - iwidth`` bits are fractional.
        signed: Whether words are two's-complement signed.

    Returns:
        A float array with the same shape as ``src``.
    """
    if width is None:
        width = src.dtype.itemsize * 8

    fwidth = width - iwidth
    raw = np.asarray(src)
    if not signed and raw.dtype.kind == "u":
        # Keep unsigned words unsigned: routing them through int64 would make
        # 64-bit values >= 2**63 come out negative.
        codes = raw.astype(np.uint64)
    else:
        # astype always copies, so the caller's array is never modified.
        # For 64-bit words the uint64 -> int64 cast already performs the
        # two's-complement wrap.
        codes = raw.astype(np.int64)
        if signed and width < 64:
            # Words with the sign bit set represent value - 2**width.
            codes[codes >= (1 << (width - 1))] -= 1 << width
    return codes / (2.0**fwidth)

In [6]:
# Shape of the input buffer: (IN_ROWS, IN_COLS).
IN_ROWS, IN_COLS = 12545, 147
# Weight dimensions; the output buffer is (IN_ROWS, WT_COLS).
WT_ROWS, WT_COLS = 147, 65
# Column count of the weight buffer: 8 buffer columns per weight column.
WT_BIN_COLS = 8 * WT_COLS
# Number of entries in the reference-voltage buffer.
ADC_LEVELS = 32
# Scalar kernel arguments: NUM_ARGS values, packed as [VDD, RES_DIVIDER].
NUM_ARGS = 2
VDD = 1
RES_DIVIDER = 1352

In [7]:
# Pass the paths straight to np.loadtxt so it opens AND closes each file;
# wrapping them in a bare open() left four file handles unclosed.
inputs = np.loadtxt("input2d.csv", delimiter=",", dtype="int32")
weights = np.loadtxt("weight2d_cond.csv", delimiter=",", dtype="float64")
v_ref = np.loadtxt("v_ref.csv", delimiter=",", dtype="float64")
correct_output = np.loadtxt("correct_output.csv", delimiter=",", dtype="int32")

In [8]:
# Scalar kernel arguments packed as 32-bit unsigned words: [VDD, RES_DIVIDER].
cim_args = np.array((VDD, RES_DIVIDER), dtype=np.uint32)

In [9]:
# Allocate the device-visible input buffer, copy the CSV data into it, and
# push the contents to the device.
# NOTE(review): `inputs` is loaded as int32 but stored as uint32 here, so any
# negative entry would wrap; confirm the inputs are non-negative.
input_buf = pynq.allocate((IN_ROWS, IN_COLS), dtype="u4")
input_buf[:] = inputs
input_buf.sync_to_device()

In [10]:
# Weights are stored as 64-bit signed fixed-point words with 11 integer bits
# (width defaults to the buffer's 64-bit dtype); to_fixed_point reshapes the
# CSV data to the buffer's (WT_ROWS, WT_BIN_COLS) shape before writing.
weight_buf = pynq.allocate((WT_ROWS, WT_BIN_COLS), dtype="u8")
to_fixed_point(weight_buf, weights, iwidth=11, signed=True)
weight_buf.sync_to_device()

In [11]:
# Reference voltages use the same encoding as the weights: 64-bit signed
# fixed-point words with 11 integer bits, one word per ADC level.
v_ref_buf = pynq.allocate(ADC_LEVELS, dtype="u8")
to_fixed_point(v_ref_buf, v_ref, iwidth=11, signed=True)
v_ref_buf.sync_to_device()

In [12]:
# Device-visible copy of the scalar kernel arguments (NUM_ARGS 32-bit words).
cim_args_buf = pynq.allocate(NUM_ARGS, dtype="u4")
cim_args_buf[:] = cim_args
cim_args_buf.sync_to_device()

In [13]:
# Result buffer of 32-bit unsigned words; it is only read back after the
# kernel has run, so it is not synced to the device here.
output_buf = pynq.allocate((IN_ROWS, WT_COLS), dtype="u4")

In [14]:
# Show the IP's register map: control/interrupt registers plus a pair of
# 32-bit address registers (_1 / _2) for each buffer argument.
dut.register_map

RegisterMap {
  CTRL = Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0, INTERRUPT=0, RESERVED_3=0),
  GIER = Register(Enable=0, RESERVED=0),
  IP_IER = Register(CHAN0_INT_EN=0, CHAN1_INT_EN=0, RESERVED_0=0),
  IP_ISR = Register(CHAN0_INT_ST=0, CHAN1_INT_ST=0, RESERVED_0=0),
  input2d_1 = Register(input2d=write-only),
  input2d_2 = Register(input2d=write-only),
  weight2d_cond_1 = Register(weight2d_cond=write-only),
  weight2d_cond_2 = Register(weight2d_cond=write-only),
  v_ref_1 = Register(v_ref=write-only),
  v_ref_2 = Register(v_ref=write-only),
  output_r_1 = Register(output_r=write-only),
  output_r_2 = Register(output_r=write-only),
  cim_args_1 = Register(cim_args=write-only),
  cim_args_2 = Register(cim_args=write-only)
}

In [15]:
# Program each buffer's device address into the kernel. Every address is
# split across two 32-bit registers: `_1` takes the low 32 bits and `_2` the
# bits above them.
dut.register_map.input2d_1 = input_buf.device_address & 0xFFFFFFFF
dut.register_map.input2d_2 = input_buf.device_address >> 32

dut.register_map.weight2d_cond_1 = weight_buf.device_address & 0xFFFFFFFF
dut.register_map.weight2d_cond_2 = weight_buf.device_address >> 32

dut.register_map.v_ref_1 = v_ref_buf.device_address & 0xFFFFFFFF
dut.register_map.v_ref_2 = v_ref_buf.device_address >> 32

dut.register_map.cim_args_1 = cim_args_buf.device_address & 0xFFFFFFFF
dut.register_map.cim_args_2 = cim_args_buf.device_address >> 32

dut.register_map.output_r_1 = output_buf.device_address & 0xFFFFFFFF
dut.register_map.output_r_2 = output_buf.device_address >> 32

In [16]:
# Time the run with a monotonic, high-resolution clock. time.time() follows
# the wall clock and can jump (e.g. when the board's clock is adjusted),
# which would corrupt a measurement that lasts tens of seconds.
start_time = time.perf_counter()

# Kick off the kernel.
dut.register_map.CTRL.AP_START = 1
# NOTE(review): the register map reports CTRL bit 4 as RESERVED_1; presumably
# this write is meant as a "continue" handshake. Confirm it is needed.
dut.register_map.CTRL[4] = 1
# Busy-wait until the kernel flags completion.
while not dut.register_map.CTRL.AP_DONE:
    pass

end_time = time.perf_counter()
duration = end_time - start_time
print(f"Kernel completed in {duration * 1000:.2f}ms")

Kernel completed in 31439.17ms


In [17]:
# Pull the kernel's results from the device before reading them on the host.
output_buf.sync_from_device()

In [18]:
# Quick sanity check: the (abbreviated) result matrix and its value range.
print(output_buf)
print(np.max(output_buf), np.min(output_buf))

[[1520565 1506235 1528064 ... 1520977 1522650 1523492]
 [1566390 1518860 1567446 ... 1566757 1564184 1560957]
 [1601721 1520574 1605774 ... 1597674 1602548 1598295]
 ...
 [1695595 1687566 1695153 ... 1692669 1701364 1696847]
 [1620349 1619268 1623021 ... 1625326 1608517 1624711]
 [1512432 1509840 1510002 ... 1512270 1511217 1512189]]
3512491 302385


In [19]:
# Mean squared error between the reference result and the board's output.
on_board_mse = np.mean(np.square(correct_output - output_buf))

In [20]:
# Display the error; 0 means every board output matched the reference exactly.
on_board_mse

PynqBuffer(0.)