# Testing Kernel-V

This notebook will test an IP written in Vivado HLS.

In [1]:
from pynq import Overlay
import pynq.lib.dma
from pynq import allocate
import numpy as np
from pynq import DefaultIP
import timeit

Program FPGA and inspect Overlay.

In [31]:
overlay = Overlay('overlay/kernel_v.bit')
print(overlay.device)
overlay?

<pynq.pl_server.device.XlnkDevice object at 0xafbe7350>


Get the kernel register map.

In [32]:
kernel = overlay.HlsKernelV_0
kernel.register_map
# print("stream size: ", adder.stream_size)
# accel_state = adder.get_state()
# print("accelerator state: ", accel_state)
# dma = overlay.axi_dma_0
# dma.register_map.MM2S_DMASR
# dma.register_map.S2MM_DMACR

RegisterMap {
  CTRL = Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0),
  GIER = Register(Enable=0, RESERVED=0),
  IP_IER = Register(CHAN0_INT_EN=0, CHAN1_INT_EN=0, RESERVED=0),
  IP_ISR = Register(CHAN0_INT_ST=0, CHAN1_INT_ST=0, RESERVED=0),
  num_active_inputs = Register(num_active_inputs=0),
  output_size = Register(output_size=0),
  num_refinements_1 = Register(num_refinements=0),
  num_refinements_2 = Register(num_refinements=0)
}

## Kernel IP

The kernel IP can be automatically bound by first creating our Kernel class. Then, the overlay can be instantiated again.

In [51]:
class KernelDriver(DefaultIP):
    def __init__(self, description):
        super().__init__(description=description)
    
    bindto = ['xilinx.com:hls:HlsKernelV:1.0']

    def start_accel(self):
        self.register_map.CTRL.AP_START = 1
        self.write(0x0, 1)
        self.write(0x0, 1)
        while(self.read(0x0) % 2 == 0):
            self.write(0x0, 1)
            pass # Wait until start, i.e. bit 0, is set.

    def set_state(self, state):
        # self.register_map.CTRL = state
        # return self.register_map.CTRL
        self.write(0x0, state)
        return self.read(0x0)

    def get_state(self):
        return self.register_map.CTRL
        # return self.read(0x0)

    @property
    def num_refinements(self):
        return (self.register_map.num_refinements_1, self.register_map.num_refinements_2)
        # return self.read(0x10)

    @num_refinements.setter
    def num_refinements(self, R):
        self.register_map.num_refinements_1 = R[0]
        self.register_map.num_refinements_2 = R[1]
        # self.write(0x10, R)

    @property
    def num_active_inputs(self):
        return self.register_map.num_active_inputs
        # return self.read(0x10)

    @num_active_inputs.setter
    def num_active_inputs(self, N):
        self.register_map.num_active_inputs = N
        # self.write(0x10, R)

    @property
    def output_size(self):
        return self.register_map.output_size
        # return self.read(0x10)

    @output_size.setter
    def output_size(self, H):
        self.register_map.output_size = H
        # self.write(0x10, R)

overlay = Overlay("overlay/kernel_v.bit")

Let's check again the kernel:

In [23]:
kernel_v = overlay.HlsKernelV_0
kernel_v.get_state()

Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0)

In [24]:
kernel_v.read(0x10)

0

To show the class is working, we setup the `num_refinements` using the setter method. We then read its corresponding register.

In [25]:
print(kernel_v.num_refinements)
kernel_v.num_refinements = (1, 1)
print(kernel_v.num_refinements)

(Register(num_refinements=0), Register(num_refinements=0))
(Register(num_refinements=1), Register(num_refinements=1))


In [26]:
print(kernel_v.get_state())
# kernel_u.start_accel()
print(kernel_v.get_state())
kernel_v.get_state()

0x4
0x4


Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0)

In [35]:
print(kernel_v.num_active_inputs)
kernel_v.num_active_inputs = 2
print(kernel_v.num_active_inputs)

0x0
0x2


## Data Allocation and Run

The data structures must be contiguosly allocated.

In [52]:
# The following parameters are fixed in hardware and cannot be changed:
# - The maximum output size H
# - The number of gates G
# - The tile size Tv
H = 128
G = 4
Tv = 4
data_t = np.int16
# The following parameters are customizeable in hardware and can be changed:
# - The number of refinements R
# - The output_size <= H
# - The number of active_inputs <= N
R = 16
N = 1
output_size = 8 % 128
# NOTE: Working with (R, N, out) == (16, 2, 8) 

xus_buffer = pynq.allocate(shape=(R, N, G), dtype=data_t)
v_buffer = pynq.allocate(shape=(R, output_size // Tv, G, Tv), dtype=data_t)
y_buffer = pynq.allocate(shape=(output_size // Tv, N, Tv, G), dtype=data_t)

# for i in range(N):
#     for j in range(I):
#         # for ii in range(R):
#         x_buffer[i, j] = data_t(np.random.uniform(low=-2**15, high=2**15))

# for i in range(R):
#     for j in range(I // Tu):
#         for k in range(G):
#             for ii in range(Tu):
#                 u_buffer[i, j, k, ii] = data_t(np.random.uniform(low=-2**15, high=2**15))

# for i in range(R):
#     for j in range(G):
#         for k in range(N):
#             xu_buffer[i, j, k] = 0

print('Buffers setup completed.')
print(f'xus_buffer.shape: {xus_buffer.shape} - Bytes: {xus_buffer.nbytes}')
print(f'v_buffer.shape: {v_buffer.shape} - Bytes: {v_buffer.nbytes}')
print(f'y_buffer.shape: {y_buffer.shape} - Bytes: {y_buffer.nbytes}')

Buffers setup completed.
xus_buffer.shape: (16, 1, 4) - Bytes: 128
v_buffer.shape: (16, 2, 4, 4) - Bytes: 1024
y_buffer.shape: (2, 1, 4, 4) - Bytes: 64


Setup the kernel and then send the data through the DMAs.

In [53]:
kernel_v.num_refinements = (R, R)
kernel_v.output_size = output_size
kernel_v.num_active_inputs = N
print(kernel_v.get_state())
kernel_v.start_accel()
print(kernel_v.get_state())

# Transfer
print('Starting transfer:')
overlay.xus_dma.sendchannel.transfer(xus_buffer)
overlay.v_dma.sendchannel.transfer(v_buffer)
overlay.y_dma.recvchannel.transfer(y_buffer)
# Then wait
print('Wait xus...', end='')
overlay.xus_dma.sendchannel.wait()
print('DONE.\nWait v...', end='')
overlay.v_dma.sendchannel.wait()
print('DONE.\nWait y...', end='')
overlay.y_dma.recvchannel.wait()
print('DONE.\n')

print(f'y_buffer.shape: {y_buffer.shape}')
# print(f'xu_buffer: {xu_buffer}')

0x4
0x1
Starting transfer:
Wait xus...DONE.
Wait v...DONE.
Wait y...DONE.

y_buffer.shape: (2, 1, 4, 4)


In [186]:
def run_kernel(R, x_buffer, u_buffer, xu_buffer):
    kernel_u.num_refinements = R
    kernel_u.start_accel()
    # Transfer
    overlay.x_dma.sendchannel.transfer(x_buffer)
    overlay.u_dma.sendchannel.transfer(u_buffer)
    overlay.xu_dma.recvchannel.transfer(xu_buffer)
    # Then wait
    overlay.x_dma.sendchannel.wait()
    overlay.u_dma.sendchannel.wait()
    overlay.xu_dma.recvchannel.wait()

In [192]:
%timeit run_kernel(R, x_buffer, u_buffer, xu_buffer)

10 loops, best of 3: 80.5 ms per loop


## Checking Correctness

We first find the proper reshape mechanisms:

In [193]:
# =============================================================================
# Reshape: (R, I, G) => (R, I // Tu, G, Tu)
# =============================================================================
u = np.random.randn(R, I, G)
u_tmp = u.copy()
u_tmp = np.transpose(u_tmp.reshape(R, I // Tu, Tu, G), (0, 1, 3, 2))
print(u[0, 0:4, 0], u_tmp[0, 0, 0, 0:4])
print(u[0, 3, 0] - u_tmp[0, 0, 0, 3])

# =============================================================================
# Reshape: (R, I // Tu, G, Tu) => (I, G, R)
# =============================================================================
u = np.random.randn(R, I // Tu, G, Tu)
u_tmp = u.copy()
u_tmp = np.transpose(u_tmp, (1, 3, 2, 0)).reshape(I, G, R)
print(u[0, 0, 0, 0:4], u_tmp[0:4, 0, 0])
print(u[0, 0, 0, 3] - u_tmp[3, 0, 0])

x = np.random.randn(N, I)
u = np.random.randn(I, G, R)
x = (x * 2).astype(np.int16)
u = (u * 2).astype(np.int16)

%timeit xu = np.transpose(np.tensordot(x, u, axes=1), (2, 1, 0))
print(xu.shape)

[ 0.36593539 -1.03844877  0.82985754 -0.82067175] [ 0.36593539 -1.03844877  0.82985754 -0.82067175]
0.0
[-0.07974188  0.01109454 -0.18120697  0.73842526] [-0.07974188  0.01109454 -0.18120697  0.73842526]
0.0
10 loops, best of 3: 24.1 ms per loop
(128, 4, 2)


We now check the Numpy computation against the FPGA result.

In [194]:
u_tmp = np.transpose(u_buffer, (1, 3, 2, 0)).reshape(I, G, R)
%timeit xu_gold = np.transpose(np.tensordot(x_buffer, u_tmp, axes=1), (2, 1, 0))
print('\nAll equal:', np.allclose(xu_buffer, xu_gold))
print('gold[0]: ', xu_gold[0])
print('fpga[0]: ', xu_buffer[0])

10 loops, best of 3: 105 ms per loop

All equal: True
gold[0]:  [[  8822 -32153]
 [-17540   6635]
 [  6489   5700]
 [ 11839  25184]]
fpga[0]:  [[  8822 -32153]
 [-17540   6635]
 [  6489   5700]
 [ 11839  25184]]
