# Thomas Solver Overlay Test

This notebook compares the FPGA implementation of a tridiagonal matrix solver against a NumPy implementation. It measures the execution time of each approach and checks that both produce the same result.

In [33]:
from pynq import Overlay, allocate
import numpy as np
import time

np.set_printoptions(precision=4, suppress=True)


In [34]:
# Load overlay and inspect available IP blocks
overlay = Overlay('custom_tomas_solver_v1.bit')
# Replace 'thomas_solver_0' with the actual IP name from the printed dictionary if different
solver_ip = overlay.thomas_solver_0

In [35]:
N = 64
# Constants describing the tridiagonal matrix
dp = np.complex64(4+0j)
dp1 = np.complex64(3+0j)
dp2 = np.complex64(3+0j)
off = np.complex64(1+0j)

# Allocate buffers accessible to the FPGA
a_b = allocate(shape=(N,), dtype=np.complex64)
a_x = allocate(shape=(N,), dtype=np.complex64)

# Random right-hand side vector
b_np = (np.random.rand(N) + 1j*np.random.rand(N)).astype(np.complex64)
a_b[:] = b_np
a_x.fill(0)
a_b.flush()
a_x.flush()
print('Input vector b', b_np)


Input vector b [0.6907+0.697j  0.7775+0.8045j 0.4146+0.3793j 0.7462+0.681j
 0.8194+0.1159j 0.3962+0.4246j 0.4628+0.4783j 0.8896+0.1898j
 0.5112+0.8052j 0.8238+0.5776j 0.5049+0.1007j 0.8522+0.0221j
 0.8214+0.3274j 0.9018+0.8867j 0.2052+0.322j  0.5452+0.6983j
 0.7015+0.11j   0.8208+0.3336j 0.082 +0.6328j 0.3224+0.3763j
 0.1736+0.4839j 0.7128+0.8203j 0.9912+0.7411j 0.6276+0.7991j
 0.2117+0.0522j 0.4267+0.4251j 0.7297+0.8888j 0.681 +0.0216j
 0.1692+0.8624j 0.401 +0.0815j 0.6651+0.9806j 0.9598+0.9482j
 0.3716+0.4935j 0.6027+0.1938j 0.1805+0.0614j 0.4263+0.9759j
 0.457 +0.0253j 0.521 +0.6958j 0.148 +0.8498j 0.675 +0.7923j
 0.2311+0.408j  0.3715+0.319j  0.9363+0.3622j 0.5922+0.4592j
 0.5565+0.2786j 0.2766+0.7272j 0.0716+0.7149j 0.1895+0.7202j
 0.1285+0.3064j 0.6185+0.2962j 0.1767+0.9275j 0.7504+0.0615j
 0.6549+0.5989j 0.1388+0.2255j 0.2911+0.0297j 0.1827+0.5766j
 0.9335+0.0607j 0.7628+0.6257j 0.1509+0.1544j 0.8239+0.0395j
 0.0967+0.4822j 0.4657+0.9578j 0.9557+0.1788j 0.1846+0.6515j]


In [36]:
def thomas_solver_numpy(dp, dp1, dp2, off, b):
    N = b.shape[0]
    c_prime = np.empty(N, dtype=np.complex64)
    d_prime = np.empty(N, dtype=np.complex64)
    inv = 1.0/np.complex64(dp1)
    c_prime[0] = off * inv
    d_prime[0] = b[0] * inv
    for i in range(1, N-1):
        denom = dp - off * c_prime[i-1]
        inv = 1.0/denom
        c_prime[i] = off * inv
        d_prime[i] = (b[i] - off * d_prime[i-1]) * inv
    denom = dp2 - off * c_prime[N-2]
    d_prime[N-1] = (b[N-1] - off * d_prime[N-2]) / denom
    x = np.empty(N, dtype=np.complex64)
    x[-1] = d_prime[-1]
    for i in range(N-2, -1, -1):
        x[i] = d_prime[i] - c_prime[i] * x[i+1]
    return x

In [37]:
# CPU reference implementation
t0 = time.time()
x_ref = thomas_solver_numpy(dp, dp1, dp2, off, b_np)
cpu_time = time.time() - t0
print(f'CPU time: {cpu_time*1e3:.3f} ms')


CPU time: 2.353 ms


In [38]:
a_rm = solver_ip.register_map
print(a_rm)

RegisterMap {
  CTRL = Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0, INTERRUPT=0, RESERVED_3=0),
  GIER = Register(Enable=0, RESERVED=2),
  IP_IER = Register(CHAN0_INT_EN=0, CHAN1_INT_EN=0, RESERVED_0=1),
  IP_ISR = Register(CHAN0_INT_ST=0, CHAN1_INT_ST=0, RESERVED_0=1),
  dp_1 = Register(dp=write-only),
  dp_2 = Register(dp=write-only),
  dp1_1 = Register(dp1=write-only),
  dp1_2 = Register(dp1=write-only),
  dp2_1 = Register(dp2=write-only),
  dp2_2 = Register(dp2=write-only),
  off_1 = Register(off=write-only),
  off_2 = Register(off=write-only)
}


In [45]:
# Configure solver IP
rm = solver_ip.register_map
rm.dp_r = float(np.real(dp))
rm.dp_i = float(np.imag(dp))
rm.dp1_r = float(np.real(dp1))
rm.dp1_i = float(np.imag(dp1))
rm.dp2_r = float(np.real(dp2))
rm.dp2_i = float(np.imag(dp2))
rm.off_r = float(np.real(off))
rm.off_i = float(np.imag(off))
rm.b = a_b.physical_address
rm.x = a_x.physical_address

# Run hardware solver
t0 = time.time()
rm.CTRL.AP_START = 1
while True:
    rm = solver_ip.register_map
    if rm.CTRL.AP_DONE:
        break
hw_time = time.time() - t0
print(f'Hardware time: {hw_time*1e3:.3f} ms')
if hw_time > 0:
    print(f'Speedup (CPU/HW): {cpu_time/hw_time:.2f}x')


Hardware time: 0.654 ms
Speedup (CPU/HW): 3.60x


In [46]:
# Compare results
a_x.invalidate()
x_hw = np.array(a_x)
print('Results match:', np.allclose(x_ref, x_hw, atol=1e-6))
print('CPU result x_ref', x_ref)
print('Hardware result x_hw', x_hw)


Results match: False
CPU result x_ref [ 0.1836+0.1814j  0.14  +0.1526j  0.0341+0.0125j  0.1383+0.1765j
  0.1588-0.0376j  0.0459+0.0898j  0.0539+0.1031j  0.2014-0.024j
  0.03  +0.1827j  0.1898+0.0985j  0.0345+0.0009j  0.177 -0.0015j
  0.1097+0.0271j  0.2055+0.2203j -0.03  -0.0217j  0.1196+0.1886j
  0.097 -0.0344j  0.194 +0.0588j -0.0523+0.1326j  0.0974+0.0436j
 -0.0147+0.0692j  0.1351+0.1634j  0.187 +0.0975j  0.1082+0.1875j
  0.0078-0.0485j  0.0725+0.0589j  0.129 +0.238j   0.1413-0.1222j
 -0.0132+0.2726j  0.0806-0.1056j  0.0919+0.2312j  0.217 +0.1615j
 -0.0001+0.0711j  0.1551+0.0478j -0.0177-0.0686j  0.0963+0.288j
  0.0589-0.1076j  0.1252+0.1678j -0.0387+0.1324j  0.1776+0.1524j
  0.0033+0.0504j  0.0403+0.0542j  0.2069+0.0519j  0.0685+0.1003j
  0.1112+0.006j   0.043 +0.1542j -0.0066+0.1042j  0.0551+0.1439j
 -0.0243+0.0405j  0.1707+0.0004j -0.04  +0.2542j  0.1659-0.0896j
  0.1267+0.1658j -0.0179+0.0255j  0.0836-0.0421j -0.0254+0.1726j
  0.2009-0.0718j  0.1554+0.1754j -0.0597-0.0042j  0.23

In [32]:
overlay.free()
a_b.free()
a_x.free()

AttributeError: 'PynqBuffer' object has no attribute 'free'