In [1]:
from pynq import Overlay
ol = Overlay('matrix.bit')

In [2]:
dma = ol.axi_dma_0

In [3]:
data_send = dma.sendchannel
data_recv = dma.recvchannel

In [4]:
import numpy as np
import random 

In [5]:
random.seed(2)
data_A = np.random.uniform(low=0, high=10, size=(32, 32)).astype(float)
data_B = np.random.uniform(low=0, high=10, size=(32, 32)).astype(float)

In [6]:
data_A, data_B

(array([[7.46713664, 1.28009441, 4.44601356, ..., 5.6087702 , 1.5769116 ,
         4.41272805],
        [6.92401984, 5.6963646 , 2.41438126, ..., 3.07592663, 8.76372156,
         7.49377515],
        [1.24765405, 9.69368611, 0.93861617, ..., 4.56548003, 8.80471041,
         0.29034192],
        ...,
        [0.36441048, 6.34301879, 0.86991348, ..., 4.34230666, 7.01742382,
         0.43338361],
        [2.29490331, 1.33691716, 2.16477319, ..., 7.50453154, 0.7113644 ,
         5.53221198],
        [8.01725648, 6.12393779, 3.54815734, ..., 1.4399024 , 0.59694786,
         0.43932485]]),
 array([[7.30735456, 0.80547148, 8.34309233, ..., 1.30228234, 1.42521682,
         5.47922807],
        [2.34379101, 4.94618668, 0.58272723, ..., 8.66618838, 6.76139372,
         4.62787536],
        [8.09996709, 2.04918406, 2.00246559, ..., 5.92873438, 7.2034194 ,
         2.88286321],
        ...,
        [3.28799937, 7.47999015, 6.47438118, ..., 3.34455689, 5.46777963,
         3.09009459],
        [9.9

In [7]:
def multiply(matrix_a, matrix_b):
    result_matrix = np.zeros((32,32))
    for i in range(32):
        for j in range(32):
            for k in range(32):
                result_matrix[i][j] += matrix_a[i][k] * matrix_b[k][j]
                
    return result_matrix

In [8]:
import time
start_time = time.time()
multiply(data_A, data_B) #software
end_time = time.time()
print(end_time - start_time)

0.6534547805786133


In [10]:
from pynq import allocate

In [11]:
data_a_f = data_A.flatten()
data_b_f = data_B.flatten()

data_input = np.concatenate([data_a_f, data_b_f])

In [12]:
data_input

array([7.46713664, 1.28009441, 4.44601356, ..., 6.63239433, 7.21664369,
       5.45820054])

In [13]:
input_buffer = allocate(2048, np.float32)
output_buffer = allocate((32,32), np.float32)

In [14]:
np.copyto(input_buffer, data_input)

In [15]:
start_time = time.time()
data_send.transfer(input_buffer)
data_recv.transfer(output_buffer)
data_send.wait()
data_recv.wait()
end_time = time.time()
print(end_time - start_time)

0.0027790069580078125


In [16]:
output_buffer

PynqBuffer([[903.2101 , 696.98065, 960.0653 , ..., 890.6255 , 987.86115,
             741.74835],
            [785.84674, 640.02606, 831.40607, ..., 701.7499 , 814.31586,
             747.61847],
            [787.9109 , 666.77637, 859.2986 , ..., 880.60187, 805.64874,
             729.9307 ],
            ...,
            [731.228  , 621.3732 , 763.2449 , ..., 670.7744 , 847.6279 ,
             685.7108 ],
            [820.6272 , 606.94604, 907.74866, ..., 765.6956 , 818.38324,
             720.1079 ],
            [802.2441 , 624.4669 , 905.74567, ..., 829.118  , 927.43884,
             723.16565]], dtype=float32)

In [17]:
diff = np.dot(data_A, data_B) - output_buffer

In [18]:
rms_error = np.sqrt(np.mean(diff**2))

In [19]:
diff, rms_error

(PynqBuffer([[-1.11889032e-04,  2.68505721e-06,  2.70032759e-05, ...,
              -5.73156627e-05, -9.43749520e-05,  9.98094234e-05],
             [ 7.21167686e-05,  2.72816874e-06,  8.38799817e-05, ...,
               1.83074641e-04,  2.05738652e-04,  4.04082491e-05],
             [ 4.89622682e-05,  1.02489627e-04, -6.41267700e-05, ...,
              -5.26360318e-05,  3.32801250e-05,  4.27365846e-05],
             ...,
             [-1.82989643e-05,  2.08680959e-05,  6.87599783e-05, ...,
               6.69503920e-05,  4.47446303e-06, -4.12651664e-05],
             [ 1.04660478e-04,  4.81303072e-05,  2.35066569e-05, ...,
               1.40895987e-04, -9.64926921e-05,  7.02942774e-05],
             [-2.19423789e-05,  4.34030634e-05,  9.53086956e-05, ...,
              -5.02120357e-05,  2.43553142e-05, -2.85307189e-05]]),
 PynqBuffer(6.75104759e-05))

In [26]:
%%time
np.dot(data_A, data_B)

CPU times: user 658 µs, sys: 56 µs, total: 714 µs
Wall time: 727 µs


array([[903.20997112, 696.98065454, 960.06533462, ..., 890.62543097,
        987.86105064, 741.74845186],
       [785.84681284, 640.02606474, 831.40615077, ..., 701.750061  ,
        814.31606267, 747.61850965],
       [787.91093763, 666.77646968, 859.29851986, ..., 880.60181504,
        805.64877596, 729.93076783],
       ...,
       [731.22800904, 621.37325085, 763.24494181, ..., 670.77448101,
        847.62793416, 685.71077416],
       [820.62730193, 606.94609305, 907.74868073, ..., 765.69575857,
        818.38314325, 720.10798045],
       [802.24405765, 624.46696235, 905.74576181, ..., 829.11793074,
        927.43886713, 723.16562088]])

In [23]:
np.dot(data_A, data_B) - multiply(data_A, data_B)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])