# Testing Kernel-U

This notebook will test an IP written in Vivado HLS.

In [18]:
import time
import timeit

import numpy as np
import pynq.lib.dma
from pynq import DefaultIP
from pynq import Overlay
from pynq import allocate

Program FPGA and inspect Overlay.

In [2]:
# Program the FPGA with the bitstream.
overlay = Overlay("overlay/kernel_u.bit")
# help() prints the overlay documentation into the cell output, so it is saved
# with the notebook; `overlay?` only opens the transient IPython pager.
help(overlay)

Get the kernel register map.

In [3]:
# Fetch the HLS kernel IP from the overlay and display its register map.
kernel = overlay.HlsAxisKernelU_0
kernel.register_map
# NOTE(review): the commented-out lines below reference `adder` and
# `axi_dma_0`, neither of which is defined anywhere in this notebook --
# presumably leftovers from a different design; consider deleting them.
# print("stream size: ", adder.stream_size)
# accel_state = adder.get_state()
# print("accelerator state: ", accel_state)
# dma = overlay.axi_dma_0
# dma.register_map.MM2S_DMASR
# dma.register_map.S2MM_DMACR

RegisterMap {
  CTRL = Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0),
  GIER = Register(Enable=0, RESERVED=0),
  IP_IER = Register(CHAN0_INT_EN=0, CHAN1_INT_EN=0, RESERVED=0),
  IP_ISR = Register(CHAN0_INT_ST=0, CHAN1_INT_ST=0, RESERVED=0),
  num_refinements = Register(num_refinements=0)
}

## Kernel IP

The kernel IP can be automatically bound by first creating our Kernel class. Then, the overlay can be instantiated again.

In [179]:
class KernelDriver(DefaultIP):
    """Driver for the ``HlsAxisKernelU`` HLS IP core.

    The class is associated with the IP through ``bindto``; once it has been
    defined, re-instantiating the overlay exposes the IP as an instance of
    this driver.
    """

    bindto = ['xilinx.com:hls:HlsAxisKernelU:1.0']

    # Register offsets used by this driver: the control register (CTRL) and
    # the ``num_refinements`` argument register.
    _CTRL_OFFSET = 0x0
    _NUM_REFINEMENTS_OFFSET = 0x10
    # AP_START is bit 0 of the control register.
    _AP_START_MASK = 0x1

    def __init__(self, description):
        super().__init__(description=description)

    def start_accel(self, timeout=None):
        """Set AP_START and block until the control register reports it.

        The start bit is written once and then re-written only while a read
        back still shows it cleared, instead of unconditionally writing it
        several times up front.

        Args:
            timeout: Maximum number of seconds to wait for AP_START to read
                back as set. ``None`` (the default) waits indefinitely.

        Raises:
            TimeoutError: If ``timeout`` is given and AP_START is still
                cleared once it has elapsed.
        """
        deadline = None if timeout is None else time.monotonic() + timeout
        self.write(self._CTRL_OFFSET, self._AP_START_MASK)
        while not (self.read(self._CTRL_OFFSET) & self._AP_START_MASK):
            if deadline is not None and time.monotonic() > deadline:
                raise TimeoutError(
                    f'AP_START was not set within {timeout} seconds.')
            # The start bit did not stick: request the start again.
            self.write(self._CTRL_OFFSET, self._AP_START_MASK)

    def set_state(self, state):
        """Write ``state`` to the control register and return the value read back."""
        self.write(self._CTRL_OFFSET, state)
        return self.read(self._CTRL_OFFSET)

    def get_state(self):
        """Return the control register as exposed by the register map."""
        return self.register_map.CTRL

    @property
    def num_refinements(self):
        """Number of refinements, as exposed by the register map."""
        return self.register_map.num_refinements

    @num_refinements.setter
    def num_refinements(self, R):
        # Written through the raw register offset rather than the register map.
        self.write(self._NUM_REFINEMENTS_OFFSET, R)

# Instantiate the overlay again so that the kernel IP is bound to KernelDriver.
overlay = Overlay("overlay/kernel_u.bit")

Let's check again the kernel:

In [180]:
# Fetch the kernel IP from the reloaded overlay and show its control register.
kernel_u = overlay.HlsAxisKernelU_0
kernel_u.get_state()

Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0)

In [181]:
# Raw read of offset 0x10, the register written by the num_refinements setter.
kernel_u.read(0x10)

0

To show the class is working, we set `num_refinements` using the setter method. We then read its corresponding register.

In [182]:
# Write through the property setter, then read the raw register back.
kernel_u.num_refinements = 1
kernel_u.read(0x10)

1

In [183]:
# Show the control register twice. The start call in between is commented
# out, so both reads report the same state.
print(kernel_u.get_state())
# kernel_u.start_accel()
print(kernel_u.get_state())
kernel_u.get_state()

0x4
0x4


Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0)

## Data Allocation and Run

The data structures must be contiguously allocated.

In [184]:
# The following parameters are fixed in hardware and cannot be changed:
# - The number of inputs N
# - The input size I
# - The number of gates G
# - The tile size Tu
I = 512
G = 4
N = 2
Tu = 4
data_t = np.int16
# The number of refinements R can instead be adjusted.
R = 128

# The buffers handed to the DMAs must be contiguously allocated, hence
# pynq.allocate rather than plain NumPy arrays.
x_buffer = pynq.allocate(shape=(N, I,), dtype=data_t)
u_buffer = pynq.allocate(shape=(R, I // Tu, G, Tu), dtype=data_t)
xu_buffer = pynq.allocate(shape=(R, G, N,), dtype=data_t)

# Fill the inputs with random values covering the int16 range. Drawing the
# whole array at once and assigning it in place replaces the element-wise
# Python loops while keeping the same distribution and float->int truncation.
x_buffer[:] = np.random.uniform(
    low=-2**15, high=2**15, size=x_buffer.shape).astype(data_t)
u_buffer[:] = np.random.uniform(
    low=-2**15, high=2**15, size=u_buffer.shape).astype(data_t)

# Clear the output buffer.
xu_buffer[:] = 0

print('Buffers setup completed.')
print(f'x_buffer.shape: {x_buffer.shape} - Bytes: {x_buffer.nbytes}')
print(f'u_buffer.shape: {u_buffer.shape} - Bytes: {u_buffer.nbytes}')
print(f'xu_buffer.shape: {xu_buffer.shape} - Bytes: {xu_buffer.nbytes}')

Buffers setup completed.
x_buffer.shape: (2, 512) - Bytes: 2048
u_buffer.shape: (128, 128, 4, 4) - Bytes: 524288
xu_buffer.shape: (128, 4, 2) - Bytes: 2048


Set up the kernel and then send the data through the DMAs.

In [185]:
# Program the number of refinements and start the kernel; the control
# register is printed before and after the start request.
kernel_u.num_refinements = R
print(kernel_u.get_state())
kernel_u.start_accel()
print(kernel_u.get_state())

# Transfer
# Queue the two input buffers on their send channels and the output buffer
# on the receive channel.
print('Starting transfer:')
overlay.x_dma.sendchannel.transfer(x_buffer)
overlay.u_dma.sendchannel.transfer(u_buffer)
overlay.xu_dma.recvchannel.transfer(xu_buffer)
# Then wait
# Block on each channel in turn, reporting progress on a single line each.
print('Wait x...', end='')
overlay.x_dma.sendchannel.wait()
print('DONE.\nWait u...', end='')
overlay.u_dma.sendchannel.wait()
print('DONE.\nWait xu...', end='')
overlay.xu_dma.recvchannel.wait()
print('DONE.\n')

print(f'xu_buffer.shape: {xu_buffer.shape}')
# print(f'xu_buffer: {xu_buffer}')

0x4
0x1
Starting transfer:
Wait x...DONE.
Wait u...DONE.
Wait xu...DONE.

xu_buffer.shape: (128, 4, 2)


In [186]:
def run_kernel(R, x_buffer, u_buffer, xu_buffer):
    """Run the kernel once over the given buffers and wait for completion.

    Uses the notebook-level ``kernel_u`` driver and ``overlay`` DMAs.

    Args:
        R: Value written to the kernel's ``num_refinements`` register.
        x_buffer: Buffer sent through ``overlay.x_dma``.
        u_buffer: Buffer sent through ``overlay.u_dma``.
        xu_buffer: Buffer received through ``overlay.xu_dma``; overwritten
            with the kernel output.
    """
    kernel_u.num_refinements = R
    kernel_u.start_accel()
    # Transfer
    overlay.x_dma.sendchannel.transfer(x_buffer)
    overlay.u_dma.sendchannel.transfer(u_buffer)
    overlay.xu_dma.recvchannel.transfer(xu_buffer)
    # Then wait
    overlay.x_dma.sendchannel.wait()
    overlay.u_dma.sendchannel.wait()
    overlay.xu_dma.recvchannel.wait()

In [192]:
# Time a full kernel run: register setup, start, DMA transfers and waits.
%timeit run_kernel(R, x_buffer, u_buffer, xu_buffer)

10 loops, best of 3: 80.5 ms per loop


## Checking Correctness

We first find the proper reshape mechanisms:

In [193]:
# =============================================================================
# Reshape: (R, I, G) => (R, I // Tu, G, Tu)
# =============================================================================
u = np.random.randn(R, I, G)
u_tmp = u.copy()
u_tmp = np.transpose(u_tmp.reshape(R, I // Tu, Tu, G), (0, 1, 3, 2))
print(u[0, 0:4, 0], u_tmp[0, 0, 0, 0:4])
print(u[0, 3, 0] - u_tmp[0, 0, 0, 3])

# =============================================================================
# Reshape: (R, I // Tu, G, Tu) => (I, G, R)
# =============================================================================
u = np.random.randn(R, I // Tu, G, Tu)
u_tmp = u.copy()
u_tmp = np.transpose(u_tmp, (1, 3, 2, 0)).reshape(I, G, R)
print(u[0, 0, 0, 0:4], u_tmp[0:4, 0, 0])
print(u[0, 0, 0, 3] - u_tmp[3, 0, 0])

x = np.random.randn(N, I)
u = np.random.randn(I, G, R)
x = (x * 2).astype(np.int16)
u = (u * 2).astype(np.int16)

# Compute the result in a normal statement first: a name assigned inside a
# %timeit statement is not bound in the notebook namespace, so relying on it
# only works when a stale `xu` is left over from an earlier run.
xu = np.transpose(np.tensordot(x, u, axes=1), (2, 1, 0))
%timeit np.transpose(np.tensordot(x, u, axes=1), (2, 1, 0))
print(xu.shape)

[ 0.36593539 -1.03844877  0.82985754 -0.82067175] [ 0.36593539 -1.03844877  0.82985754 -0.82067175]
0.0
[-0.07974188  0.01109454 -0.18120697  0.73842526] [-0.07974188  0.01109454 -0.18120697  0.73842526]
0.0
10 loops, best of 3: 24.1 ms per loop
(128, 4, 2)


We now check the Numpy computation against the FPGA result.

In [194]:
# Bring the weights back to (I, G, R) so they can be contracted with x.
u_tmp = np.transpose(u_buffer, (1, 3, 2, 0)).reshape(I, G, R)
# Compute the reference outside %timeit: a name assigned inside a %timeit
# statement is not bound in the notebook namespace, so the comparison below
# would otherwise depend on a stale `xu_gold` from an earlier run.
# NOTE(review): both operands are int16, so NumPy accumulates in int16 and
# wraps on overflow; the check presumably relies on the hardware wrapping
# the same way -- confirm this is intended.
xu_gold = np.transpose(np.tensordot(x_buffer, u_tmp, axes=1), (2, 1, 0))
%timeit np.transpose(np.tensordot(x_buffer, u_tmp, axes=1), (2, 1, 0))
print('\nAll equal:', np.allclose(xu_buffer, xu_gold))
print('gold[0]: ', xu_gold[0])
print('fpga[0]: ', xu_buffer[0])

10 loops, best of 3: 105 ms per loop

All equal: True
gold[0]:  [[  8822 -32153]
 [-17540   6635]
 [  6489   5700]
 [ 11839  25184]]
fpga[0]:  [[  8822 -32153]
 [-17540   6635]
 [  6489   5700]
 [ 11839  25184]]
