# Testing DenseSvd

This notebook will test an IP written in Vivado HLS.

In [1]:
from pynq import Overlay
import pynq.lib.dma
from pynq import allocate
import numpy as np
from pynq import DefaultIP
import timeit

Program FPGA and inspect Overlay.

In [3]:
# Load the bitstream onto the FPGA; the resulting Overlay object exposes the IPs of the design.
overlay = Overlay("overlay/kernel_svd.bit")
# IPython help syntax: shows the overlay's documentation for inspection.
overlay?

Get the kernel register map.

In [4]:
# Grab the HLS kernel IP from the overlay and display its register map.
kernel = overlay.HlsSvdKernel_0
kernel.register_map
# NOTE(review): the commented-out snippets below refer to `adder` and
# `axi_dma_0`, which are not used anywhere else in this notebook — presumably
# leftovers from another design; confirm before re-enabling or removing them.
# print("stream size: ", adder.stream_size)
# accel_state = adder.get_state()
# print("accelerator state: ", accel_state)
# dma = overlay.axi_dma_0
# dma.register_map.MM2S_DMASR
# dma.register_map.S2MM_DMACR

RegisterMap {
  CTRL = Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0, RESERVED_3=0, RESERVED_4=0),
  GIER = Register(Enable=0, RESERVED=0),
  IP_IER = Register(CHAN0_INT_EN=0, CHAN1_INT_EN=0, RESERVED=0),
  IP_ISR = Register(CHAN0_INT_ST=0, CHAN1_INT_ST=0, RESERVED=0),
  num_active_inputs = Register(num_active_inputs=0),
  input_size = Register(input_size=0),
  output_size = Register(output_size=0),
  num_refinements_0 = Register(num_refinements_0=0),
  num_refinements_1 = Register(num_refinements_1=0)
}

## Kernel IP

PYNQ can bind the kernel IP to a custom driver automatically: we first define our `KernelDriver` class, and then instantiate the overlay again so that the IP is bound to the new driver.

In [32]:
class KernelDriver(DefaultIP):
    """Driver for the HlsSvdKernel HLS IP.

    Wraps the kernel's configuration registers (``num_active_inputs``,
    ``input_size``, ``output_size``, ``num_refinements_0``/``_1``) in Python
    properties and provides helpers to start the kernel and to read or write
    its control register.
    """

    def __init__(self, description):
        """Initialize the underlying DefaultIP from ``description`` and store size limits."""
        super().__init__(description=description)
        # NOTE(review): presumably the maximum G / I / H sizes supported by the
        # HLS kernel — confirm against the HLS source.
        self.max_G = 4
        self.max_I = 1
        self.max_H = 1
    
    # Identifier(s) of the IP type this driver class is bound to.
    bindto = ['xilinx.com:hls:HlsSvdKernel:1.0']

    def start_accel(self):
        """Set AP_START and spin until bit 0 of the register at offset 0x0 reads as 1.

        The start bit is written once through the register map and then
        repeatedly through raw writes to offset 0x0 (presumably the CTRL
        register — confirm against the generated register map).

        NOTE(review): the polling loop has no timeout, so it never returns if
        the bit is never observed as set.
        """
        self.register_map.CTRL.AP_START = 1
        self.write(0x0, 1)
        self.write(0x0, 1)
        while(self.read(0x0) % 2 == 0):
            # Re-issue the start request on every poll iteration.
            self.write(0x0, 1)
            pass # Wait until start, i.e. bit 0, is set.

    def set_state(self, state):
        """Write ``state`` to the register at offset 0x0 and return the value read back."""
        # self.register_map.CTRL = state
        # return self.register_map.CTRL
        self.write(0x0, state)
        return self.read(0x0)

    def get_state(self):
        """Return the CTRL register as exposed by the register map."""
        return self.register_map.CTRL
        # return self.read(0x0)

    @property
    def num_active_inputs(self):
        """The ``num_active_inputs`` register (returned as a register-map entry)."""
        return self.register_map.num_active_inputs

    @num_active_inputs.setter
    def num_active_inputs(self, N):
        # Write N into the `num_active_inputs` register.
        self.register_map.num_active_inputs = N

    @property
    def input_size(self):
        """The ``input_size`` register (returned as a register-map entry)."""
        return self.register_map.input_size

    @input_size.setter
    def input_size(self, I):
        # Write I into the `input_size` register.
        self.register_map.input_size = I

    @property
    def output_size(self):
        """The ``output_size`` register (returned as a register-map entry)."""
        return self.register_map.output_size

    @output_size.setter
    def output_size(self, H):
        # Write H into the `output_size` register.
        self.register_map.output_size = H

    @property
    def num_refinements(self):
        """Pair ``(num_refinements_0, num_refinements_1)`` of register-map entries."""
        return (self.register_map.num_refinements_0, self.register_map.num_refinements_1)

    @num_refinements.setter
    def num_refinements(self, R):
        # R must be indexable with two entries: R[0] and R[1] are written to
        # `num_refinements_0` and `num_refinements_1` respectively.
        self.register_map.num_refinements_0 = R[0]
        self.register_map.num_refinements_1 = R[1]

# Re-instantiate the overlay now that KernelDriver is defined, so that the
# HlsSvdKernel IP is bound to it (see `KernelDriver.bindto`).
overlay = Overlay("overlay/kernel_svd.bit")
# Handle used by all the following cells. It was previously never assigned,
# which made them fail with NameError after Restart & Run All.
kernel_svd = overlay.HlsSvdKernel_0

To show that the class is working, we set `num_refinements` through its setter and then read the corresponding registers back.

In [33]:
# Write both refinement registers through the property setter...
kernel_svd.num_refinements = (1, 1)
# ...and read them back; the getter returns the pair of registers.
kernel_svd.num_refinements

(Register(num_refinements_0=1), Register(num_refinements_1=1))

## Data Allocation and Run

The data structures must be contiguously allocated.

In [34]:
# Element type of every buffer exchanged with the kernel.
data_t = np.int16
G = kernel_svd.max_G  # taken from the driver
N = 2  # number of active inputs
I = 8  # input size
H = 8  # output size
R = 4  # number of refinements

# Buffers handed to the DMAs are allocated through pynq.allocate().
x_buffer = pynq.allocate(shape=(N, I), dtype=data_t)
u_buffer = pynq.allocate(shape=(R, I, G), dtype=data_t)
s_buffer = pynq.allocate(shape=(R, N, G), dtype=data_t)
v_buffer = pynq.allocate(shape=(R, H, G), dtype=data_t)
y_buffer = pynq.allocate(shape=(N, G, H), dtype=data_t)

# Generate reproducible random test data.
# np.random.rand() yields floats in [0, 1), which all truncate to 0 when cast
# to int16; draw small integers directly so that the inputs are non-trivial.
# NOTE(review): the value range is kept small to limit the magnitude of the
# accumulated products — adjust it to the kernel's numeric format if needed.
np.random.seed(0)
VALUE_LOW, VALUE_HIGH = -4, 4  # half-open interval [low, high)
x_np = np.random.randint(VALUE_LOW, VALUE_HIGH, size=(N, I), dtype=data_t)
u_np = np.random.randint(VALUE_LOW, VALUE_HIGH, size=(R, I, G), dtype=data_t)
s_np = np.random.randint(VALUE_LOW, VALUE_HIGH, size=(R, N, G), dtype=data_t)
v_np = np.random.randint(VALUE_LOW, VALUE_HIGH, size=(R, H, G), dtype=data_t)
y_np = np.zeros((N, G, H), dtype=data_t)

# casting='no' makes the copy fail loudly if a dtype ever diverges from data_t.
np.copyto(x_buffer, x_np, casting='no')
np.copyto(u_buffer, u_np, casting='no')
np.copyto(s_buffer, s_np, casting='no')
np.copyto(v_buffer, v_np, casting='no')
np.copyto(y_buffer, y_np, casting='no')

print('Buffers setup completed.')
print(f'x_buffer.shape: {x_buffer.shape} - Bytes: {x_buffer.nbytes}')
print(f'u_buffer.shape: {u_buffer.shape} - Bytes: {u_buffer.nbytes}')

Buffers setup completed.
x_buffer.shape: (2, 8) - Bytes: 32
u_buffer.shape: (4, 8, 4) - Bytes: 256


Set up the kernel and then send the data through the DMAs.

In [35]:
# Program the problem sizes into the kernel's configuration registers.
kernel_svd.num_active_inputs = N
kernel_svd.input_size = I
kernel_svd.output_size = H
# The same refinement count is written to both refinement registers.
kernel_svd.num_refinements = (R, R)
# Print the control register before and after requesting the start.
print(kernel_svd.get_state())
kernel_svd.start_accel()
print(kernel_svd.get_state())
# Display the whole register map to double-check the configuration.
kernel_svd.register_map

0x4
0x1


RegisterMap {
  CTRL = Register(AP_START=1, AP_DONE=0, AP_IDLE=0, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0, RESERVED_3=0, RESERVED_4=0),
  GIER = Register(Enable=0, RESERVED=0),
  IP_IER = Register(CHAN0_INT_EN=0, CHAN1_INT_EN=0, RESERVED=0),
  IP_ISR = Register(CHAN0_INT_ST=0, CHAN1_INT_ST=0, RESERVED=0),
  num_active_inputs = Register(num_active_inputs=2),
  input_size = Register(input_size=8),
  output_size = Register(output_size=8),
  num_refinements_0 = Register(num_refinements_0=4),
  num_refinements_1 = Register(num_refinements_1=4)
}

In [36]:
# Launch every DMA channel first, then block on each of them in the same order.
print('Starting transfer:')
dma_jobs = [
    # (channel, buffer to transfer, message printed once the channel is done)
    (overlay.x_dma.sendchannel, x_buffer, 'x_DMA done.'),
    (overlay.u_dma.sendchannel, u_buffer, 'u_DMA done.'),
    (overlay.s_dma.sendchannel, s_buffer, 's_DMA done.'),
    (overlay.v_dma.sendchannel, v_buffer, 'v_DMA done.'),
    (overlay.y_dma.recvchannel, y_buffer, 'y_DMA done.'),
]
for dma_channel, dma_buffer, done_message in dma_jobs:
    dma_channel.transfer(dma_buffer)

# Wait for completion, reporting each channel as soon as it finishes.
print('Waiting transfer completion.')
for dma_channel, dma_buffer, done_message in dma_jobs:
    dma_channel.wait()
    print(done_message)
print('Done.\n')

print(f'y_buffer.shape: {y_buffer.shape}')

Starting transfer:
Waiting transfer completion.
x_DMA done.
u_DMA done.
s_DMA done.
v_DMA done.


KeyboardInterrupt: 

In [11]:
def run_kernel(R, x_buffer, u_buffer, xu_buffer):
    """Configure and start the kernel, then stream the buffers through the DMAs.

    Args:
        R: value assigned to ``kernel_u.num_refinements``.
        x_buffer: buffer sent through ``x_dma``.
        u_buffer: buffer sent through ``u_dma``.
        xu_buffer: buffer filled through the receive channel of ``xu_dma``.

    NOTE(review): ``kernel_u`` and ``overlay.xu_dma`` are not defined anywhere
    else in this notebook — confirm which overlay this helper targets.
    """
    kernel_u.num_refinements = R
    kernel_u.start_accel()
    dma_jobs = (
        (overlay.x_dma.sendchannel, x_buffer),
        (overlay.u_dma.sendchannel, u_buffer),
        (overlay.xu_dma.recvchannel, xu_buffer),
    )
    # Launch all the transfers first...
    for dma_channel, dma_buffer in dma_jobs:
        dma_channel.transfer(dma_buffer)
    # ...then wait for each of them, in the same order.
    for dma_channel, dma_buffer in dma_jobs:
        dma_channel.wait()

In [12]:
# Time the complete configure/start/transfer round trip.
# NOTE(review): `xu_buffer` is never allocated in this notebook — this cell
# presumably relies on state from an earlier session; confirm before re-running.
%timeit run_kernel(R, x_buffer, u_buffer, xu_buffer)

10 loops, best of 3: 148 ms per loop


## Checking Correctness

We first find the proper reshape mechanisms:

In [15]:
# NOTE(review): `Tu` (the tile size along the I axis) was not defined anywhere
# in this notebook, so this cell raised NameError on a fresh kernel. 4 is the
# smallest value for which the `0:4` slices below are valid — confirm it
# against the tile size used by the HLS kernel.
Tu = 4
assert I % Tu == 0, 'I must be a multiple of the tile size Tu'

# =============================================================================
# Reshape: (R, I, G) => (R, I // Tu, G, Tu)
# =============================================================================
u = np.random.randn(R, I, G)
u_tmp = u.copy()
# Split the I axis into (I // Tu, Tu) tiles, then move the Tu axis last.
u_tmp = np.transpose(u_tmp.reshape(R, I // Tu, Tu, G), (0, 1, 3, 2))
# The first Tu elements along I must land in the first tile, unchanged.
print(u[0, 0:4, 0], u_tmp[0, 0, 0, 0:4])
print(u[0, 3, 0] - u_tmp[0, 0, 0, 3])

# =============================================================================
# Reshape: (R, I // Tu, G, Tu) => (I, G, R)
# =============================================================================
u = np.random.randn(R, I // Tu, G, Tu)
u_tmp = u.copy()
# Bring (tile, Tu) next to each other so that they merge back into the I axis.
u_tmp = np.transpose(u_tmp, (1, 3, 2, 0)).reshape(I, G, R)
print(u[0, 0, 0, 0:4], u_tmp[0:4, 0, 0])
print(u[0, 0, 0, 3] - u_tmp[3, 0, 0])

# Reference x * u product on integer data, laid out as (R, G, N).
x = np.random.randn(N, I)
u = np.random.randn(I, G, R)
x = (x * 2).astype(np.int16)
u = (u * 2).astype(np.int16)

xu = np.transpose(np.tensordot(x, u, axes=1), (2, 1, 0))
print(xu.shape)

[-1.25823639  1.03248304 -0.3389279  -0.26103506] [-1.25823639  1.03248304 -0.3389279  -0.26103506]
0.0
[ 0.38526848 -0.34712276 -0.39317614  0.77762274] [ 0.38526848 -0.34712276 -0.39317614  0.77762274]
0.0
(128, 4, 2)


We now check the Numpy computation against the FPGA result.

In [16]:
# NOTE(review): this cell permutes four axes of `u_buffer`, while the
# allocation cell of this notebook creates it with three dimensions (R, I, G),
# and `xu_buffer` is never allocated here — presumably this check belongs to a
# different kernel's buffers; confirm before relying on its output.
# Rearrange the buffer into (I, G, R) to compute the NumPy reference.
u_tmp = np.transpose(u_buffer, (1, 3, 2, 0)).reshape(I, G, R)
# Golden result: contract x and u over the I axis, then reorder the axes.
xu_gold = np.transpose(np.tensordot(x_buffer, u_tmp, axes=1), (2, 1, 0))
print('\nAll equal:', np.allclose(xu_buffer, xu_gold))
print('gold[0]: ', xu_gold[0])
print('fpga[0]: ', xu_buffer[0])


All equal: True
gold[0]:  [[ -3634 -22667]
 [ 31065  15347]
 [ 22140  -9595]
 [  9106  26136]]
fpga[0]:  [[ -3634 -22667]
 [ 31065  15347]
 [ 22140  -9595]
 [  9106  26136]]
