###  mmult on PYNQ
##### Source : https://github.com/tkat0/pynqmmult

* SDSoC付属の行列積(mmult)サンプルをPYNQで動かしてみた
* ドライバは共有ライブラリとしてビルドし、CFFIを利用してPythonから呼び出す
* CMAでアロケートした連続領域をnumpyのndarrayとして扱えるようにした
* 開発環境は、SDSoC 2015.4

In [3]:
import numpy as np
import cffi
from pynq import Overlay
# Load the Base Overlay bitstream into the FPGA before anything else runs.
Overlay("/home/xilinx/pynq/bitstream/base.bit").download()

# `xlnk` is used further down (as `xlnk.xlnk()`) to allocate CMA buffers.
# NOTE(review): the recorded output of this cell is
# "ImportError: cannot import name 'xlnk'", so this import path did not exist in
# the PYNQ release the notebook was last executed against — confirm the correct
# module path for the installed PYNQ version.
from pynq.drivers import xlnk

ImportError: cannot import name 'xlnk'

In [4]:
# Find the name of the hardware-accelerated function in the shared library built with SDSoC.
# The software version is named mmult_accel; once moved to hardware it becomes _p0_mmult_accel_0.
# On top of that, the compiler's name mangling appears to turn it into _Z17_p0_mmult_accel_0PfS_S_.
# `nm -C` prints demangled names; `nm -D` prints the dynamic symbols (mangled names).
!nm -C ./libpynqmmult.so | grep mmult_accel
!echo ---
!nm -D ./libpynqmmult.so | grep mmult_accel

000121e4 B _p0_mmult_accel_0_num_out_C
00012168 D _p0_swinst_mmult_accel_0
00012124 D _p0_swinst_mmult_accel_0_cmd_mmult_accel_info
00012144 D _p0_swinst_mmult_accel_0_in_A_info
00012150 D _p0_swinst_mmult_accel_0_in_B_info
0001215c D _p0_swinst_mmult_accel_0_out_C_info
00012104 D _sds__p0_mmult_accel_0
00001318 T mmult_accel(float*, float*, float*)
000014c0 T _p0_mmult_accel_0(float*, float*, float*)
---
000121e4 B _p0_mmult_accel_0_num_out_C
00012168 D _p0_swinst_mmult_accel_0
00012124 D _p0_swinst_mmult_accel_0_cmd_mmult_accel_info
00012144 D _p0_swinst_mmult_accel_0_in_A_info
00012150 D _p0_swinst_mmult_accel_0_in_B_info
0001215c D _p0_swinst_mmult_accel_0_out_C_info
00012104 D _sds__p0_mmult_accel_0
00001318 T _Z11mmult_accelPfS_S_
000014c0 T _Z17_p0_mmult_accel_0PfS_S_


### Call Accelerator

In [5]:
class Mmult:
    """Callable wrapper around the SDSoC-generated hardware matrix multiplier.

    Creating an instance declares the accelerator entry point to CFFI, opens
    the shared library and then downloads the accelerator bitstream.

    Parameters
    ----------
    bitfile : str, optional
        Path of the bitstream to download. Defaults to "./pynqmmult.bit".
    libfile : str, optional
        Path of the shared library that exports the accelerator entry point.
        Defaults to "./libpynqmmult.so".
    """

    # Mangled C++ name of `_p0_mmult_accel_0(float*, float*, float*)` as
    # exported by the shared library; kept in one place so the declaration
    # and the call cannot drift apart.
    SYMBOL = "_Z17_p0_mmult_accel_0PfS_S_"

    def __init__(self, bitfile="./pynqmmult.bit", libfile="./libpynqmmult.so"):
        self.bitfile = bitfile
        self.libfile = libfile
        self.ffi = cffi.FFI()
        self.ffi.cdef("void {}(float*, float*, float*);".format(self.SYMBOL))
        # NOTE(review): the recorded notebook output shows this dlopen failing
        # with "undefined symbol: accel_register" — verify how the library was
        # linked before relying on it.
        self.lib = self.ffi.dlopen(self.libfile)
        Overlay(self.bitfile).download()

    def __call__(self, a, b, c):
        """Invoke the accelerator entry point with three pointers.

        `a`, `b` and `c` are CData objects (declared as `float*`); they are
        passed to the library function in that order. Returns None.
        """
        getattr(self.lib, self.SYMBOL)(a, b, c)


In [6]:
mmult = Mmult()

OSError: cannot load library ./libpynqmmult.so: ./libpynqmmult.so: undefined symbol: accel_register

In [5]:
# Memory that the SDSoC-synthesized hardware reads and writes through DMA
# (not scatter-gather) has to be a contiguous region.
# Contiguous regions are allocated through the CMA API
# and are then handled as numpy ndarrays.

memmanager = xlnk.xlnk()
ffi = cffi.FFI()

# C element types accepted by init_contiguous_ndarray, mapped to the numpy
# dtype used to view a buffer of that type.
_CTYPE_TO_NUMPY = {
    "float": np.float32,
    "double": np.float64,
    "int": np.int32,
    "unsigned int": np.uint32,
}


def init_contiguous_ndarray(size=(32,32), dtype="float"):
    """Allocate a CMA buffer and expose it as a numpy array.

    Parameters
    ----------
    size : int or tuple of int, optional
        Shape of the returned array. The number of allocated elements is the
        product of its entries. Defaults to (32, 32).
    dtype : str, optional
        C element type name passed to `cma_alloc` and `ffi.sizeof`. Must be a
        key of `_CTYPE_TO_NUMPY`. Defaults to "float".

    Returns
    -------
    (numpy.ndarray, buf)
        The array is built with `np.frombuffer` over the allocated buffer and
        reshaped to `size`; `buf` is the object returned by `cma_alloc`, to be
        passed to the accelerator.

    Raises
    ------
    ValueError
        If `dtype` has no entry in `_CTYPE_TO_NUMPY`.
    """
    if dtype not in _CTYPE_TO_NUMPY:
        raise ValueError(
            "unsupported dtype {!r}; expected one of {}".format(
                dtype, sorted(_CTYPE_TO_NUMPY)))
    # Derive the element count from `size` instead of assuming 32*32, so the
    # allocation always matches the shape requested by the caller.
    n_elements = int(np.prod(size, dtype=np.int64))
    buf = memmanager.cma_alloc(n_elements, data_type=dtype)
    cbuf = ffi.buffer(buf, n_elements * ffi.sizeof(dtype))
    return np.frombuffer(cbuf, dtype=_CTYPE_TO_NUMPY[dtype]).reshape(size), buf

In [6]:
# Allocate three contiguous buffers: inputs A and B and the output C.
# Each call returns the ndarray view plus the underlying pointer that is
# handed to the accelerator.
a, pa = init_contiguous_ndarray()
b, pb = init_contiguous_ndarray()
c, pc = init_contiguous_ndarray()

# Assign the values explicitly rather than incrementing in place, so the
# contents do not depend on whatever the freshly allocated buffers held.
a[:] = 1
b[:] = 2
c[:] = 0

print("A", a.shape, type(a))
print(a)
print("B", b.shape, type(b))
print(b)
print("C", c.shape, type(c))
print(c)

A (32, 32) <class 'numpy.ndarray'>
[[ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 ..., 
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]]
B (32, 32) <class 'numpy.ndarray'>
[[ 2.  2.  2. ...,  2.  2.  2.]
 [ 2.  2.  2. ...,  2.  2.  2.]
 [ 2.  2.  2. ...,  2.  2.  2.]
 ..., 
 [ 2.  2.  2. ...,  2.  2.  2.]
 [ 2.  2.  2. ...,  2.  2.  2.]
 [ 2.  2.  2. ...,  2.  2.  2.]]
C (32, 32) <class 'numpy.ndarray'>
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [7]:
# Run the matrix multiplication on the accelerator, passing the buffer pointers
# (not the ndarray views) of A, B and C.
mmult(pa, pb, pc)

In [8]:
# Display C through its ndarray view after the accelerator call.
print("C", c.shape, type(c))
print(c)

C (32, 32) <class 'numpy.ndarray'>
[[ 64.  64.  64. ...,  64.  64.  64.]
 [ 64.  64.  64. ...,  64.  64.  64.]
 [ 64.  64.  64. ...,  64.  64.  64.]
 ..., 
 [ 64.  64.  64. ...,  64.  64.  64.]
 [ 64.  64.  64. ...,  64.  64.  64.]
 [ 64.  64.  64. ...,  64.  64.  64.]]


### Test

In [9]:
# For comparison: the same product computed in software with numpy.
np.dot(a,b)

array([[ 64.,  64.,  64., ...,  64.,  64.,  64.],
       [ 64.,  64.,  64., ...,  64.,  64.,  64.],
       [ 64.,  64.,  64., ...,  64.,  64.,  64.],
       ..., 
       [ 64.,  64.,  64., ...,  64.,  64.,  64.],
       [ 64.,  64.,  64., ...,  64.,  64.,  64.],
       [ 64.,  64.,  64., ...,  64.,  64.,  64.]], dtype=float32)

In [10]:
# Check that every element of the hardware result equals the software result.
# np.array_equal is used instead of np.alltrue(c == ...): np.alltrue was
# deprecated in NumPy 1.25 and removed in NumPy 2.0. The comparison is still
# exact; for inputs that are not exactly representable, np.allclose would be
# the appropriate check.
if np.array_equal(c, np.dot(a, b)):
    print("OK")
else:
    print("NG")

OK


### Benchmarks

In [11]:
# HW: time the accelerator call with 100 loops per run; -o returns the timing
# result so it can be kept in t_hw.
t_hw = %timeit -n 100 -o mmult(pa, pb, pc)

100 loops, best of 3: 33.6 µs per loop


In [12]:
# SW: time the numpy software product with the same settings; the result is
# kept in t_sw.
t_sw = %timeit -n 100 -o np.dot(a, b)

100 loops, best of 3: 7.36 ms per loop
