# Thomas Solver Overlay Test

This notebook compares the FPGA implementation of a tridiagonal matrix solver against a NumPy implementation. It measures the execution time of each approach and checks that both produce the same result.

In [15]:
from pynq import Overlay, allocate
import numpy as np
import time

np.set_printoptions(precision=4, suppress=True)


In [22]:
# Load overlay and inspect available IP blocks
overlay = Overlay('custom_tomas_solver_v1.bit')
# Replace 'thomas_solver_0' with the actual IP name from the printed dictionary if different
solver_ip = overlay.thomas_solver_0

In [23]:
N = 64
# Constants describing the tridiagonal matrix
dp = np.complex64(4+0j)
dp1 = np.complex64(3+0j)
dp2 = np.complex64(3+0j)
off = np.complex64(1+0j)

# Allocate buffers accessible to the FPGA
a_b = allocate(shape=(N,), dtype=np.complex64)
a_x = allocate(shape=(N,), dtype=np.complex64)

# Random right-hand side vector
b_np = (np.random.rand(N) + 1j*np.random.rand(N)).astype(np.complex64)
a_b[:] = b_np
print('Input vector b', b_np)


Input vector b [0.9951+0.2919j 0.5935+0.5928j 0.6198+0.505j  0.5183+0.2525j
 0.4651+0.4984j 0.6998+0.8327j 0.1664+0.9751j 0.1527+0.33j
 0.5166+0.7082j 0.0615+0.1517j 0.1662+0.8839j 0.4362+0.7738j
 0.1737+0.8555j 0.0157+0.163j  0.3768+0.3657j 0.2823+0.4069j
 0.7414+0.6674j 0.3859+0.7601j 0.6939+0.2425j 0.8607+0.5738j
 0.6065+0.6454j 0.5069+0.705j  0.9654+0.9096j 0.8694+0.1258j
 0.049 +0.8078j 0.1296+0.1774j 0.5161+0.9913j 0.694 +0.3089j
 0.5499+0.5346j 0.3003+0.6355j 0.2576+0.5043j 0.8421+0.4894j
 0.6225+0.8446j 0.7132+0.0236j 0.648 +0.4634j 0.0585+0.9294j
 0.5342+0.4923j 0.2242+0.2272j 0.9805+0.6915j 0.6467+0.3867j
 0.1612+0.5548j 0.2187+0.4103j 0.509 +0.6446j 0.8942+0.2929j
 0.3422+0.5471j 0.0877+0.144j  0.0508+0.4213j 0.2647+0.267j
 0.0522+0.9759j 0.6657+0.4402j 0.4171+0.6302j 0.3547+0.472j
 0.1001+0.816j  0.0259+0.6463j 0.5059+0.6132j 0.0026+0.9244j
 0.6982+0.3534j 0.009 +0.9845j 0.5562+0.6305j 0.9782+0.1778j
 0.9225+0.2533j 0.0949+0.7248j 0.6995+0.2077j 0.4306+0.3071j]


In [24]:
def thomas_solver_numpy(dp, dp1, dp2, off, b):
    N = b.shape[0]
    c_prime = np.empty(N, dtype=np.complex64)
    d_prime = np.empty(N, dtype=np.complex64)
    inv = 1.0/np.complex64(dp1)
    c_prime[0] = off * inv
    d_prime[0] = b[0] * inv
    for i in range(1, N-1):
        denom = dp - off * c_prime[i-1]
        inv = 1.0/denom
        c_prime[i] = off * inv
        d_prime[i] = (b[i] - off * d_prime[i-1]) * inv
    denom = dp2 - off * c_prime[N-2]
    d_prime[N-1] = (b[N-1] - off * d_prime[N-2]) / denom
    x = np.empty(N, dtype=np.complex64)
    x[-1] = d_prime[-1]
    for i in range(N-2, -1, -1):
        x[i] = d_prime[i] - c_prime[i] * x[i+1]
    return x

In [26]:
# CPU reference implementation
t0 = time.time()
x_ref = thomas_solver_numpy(dp, dp1, dp2, off, b_np)
cpu_time = time.time() - t0
print(f'CPU time: {cpu_time*1e3:.3f} ms')


CPU time: 2.424 ms


In [27]:
a_rm = solver_ip.register_map
print(a_rm)

RegisterMap {
  CTRL = Register(AP_START=0, AP_DONE=0, AP_IDLE=1, AP_READY=0, RESERVED_1=0, AUTO_RESTART=0, RESERVED_2=0, INTERRUPT=0, RESERVED_3=0),
  GIER = Register(Enable=0, RESERVED=2),
  IP_IER = Register(CHAN0_INT_EN=0, CHAN1_INT_EN=0, RESERVED_0=1),
  IP_ISR = Register(CHAN0_INT_ST=0, CHAN1_INT_ST=0, RESERVED_0=1),
  dp_1 = Register(dp=write-only),
  dp_2 = Register(dp=write-only),
  dp1_1 = Register(dp1=write-only),
  dp1_2 = Register(dp1=write-only),
  dp2_1 = Register(dp2=write-only),
  dp2_2 = Register(dp2=write-only),
  off_1 = Register(off=write-only),
  off_2 = Register(off=write-only)
}


In [28]:
# Configure solver IP
a_rm = solver_ip.register_map
a_rm.dp_r = float(np.real(dp))
a_rm.dp_i = float(np.imag(dp))
a_rm.dp1_r = float(np.real(dp1))
a_rm.dp1_i = float(np.imag(dp1))
a_rm.dp2_r = float(np.real(dp2))
a_rm.dp2_i = float(np.imag(dp2))
a_rm.off_r = float(np.real(off))
a_rm.off_i = float(np.imag(off))
a_rm.b = a_b.physical_address
a_rm.x = a_x.physical_address

# Run hardware solver
t0 = time.time()
a_rm.CTRL.AP_START = 1
while a_rm.CTRL.AP_DONE == 0:
    a_rm = solver_ip.register_map
hw_time = time.time() - t0
print(f'Hardware time: {hw_time*1e3:.3f} ms')
if hw_time > 0:
    print(f'Speedup (CPU/HW): {cpu_time/hw_time:.2f}x')


Hardware time: 0.738 ms
Speedup (CPU/HW): 3.29x


In [30]:
# Compare results
x_hw = np.array(a_x)
print('Results match:', np.allclose(x_ref, x_hw, atol=1e-6))
print('CPU result x_ref', x_ref)
print('Hardware result x_hw', x_hw)


Results match: False
CPU result x_ref [ 0.3192+0.0608j  0.0375+0.1094j  0.1244+0.0945j  0.0848+0.0177j
  0.0548+0.087j   0.1612+0.1324j  0.0001+0.216j   0.0047-0.0212j
  0.134 +0.1987j -0.024 -0.0655j  0.0234+0.2149j  0.0966+0.09j
  0.0263+0.199j  -0.0279-0.0305j  0.1011+0.086j   0.0001+0.052j
  0.1805+0.1127j  0.0191+0.1644j  0.1291-0.0102j  0.1584+0.1189j
  0.0978+0.1085j  0.0568+0.0927j  0.1818+0.2258j  0.1814-0.0862j
 -0.0378+0.2448j  0.0187-0.0853j  0.0926+0.2739j  0.1271-0.019j
  0.093 +0.1111j  0.0508+0.1093j  0.0042+0.0872j  0.1901+0.0463j
  0.0776+0.217j   0.1222-0.0697j  0.1469+0.0853j -0.0619+0.192j
  0.1593+0.0761j -0.0412-0.0042j  0.2297+0.168j   0.1029+0.0235j
  0.0056+0.1245j  0.0361+0.0331j  0.0686+0.1532j  0.1985-0.0012j
  0.0317+0.1445j  0.017 -0.0299j -0.0119+0.119j   0.0815-0.0248j
 -0.0493+0.2472j  0.1678+0.0121j  0.0439+0.1446j  0.0737+0.0396j
  0.0159+0.1688j -0.0373+0.101j   0.1593+0.0734j -0.0938+0.2186j
  0.2185-0.0232j -0.0821+0.2278j  0.1188+0.0964j  0.1629+

In [32]:
overlay.free()
a_b.free()
a_x.free()

AttributeError: 'PynqBuffer' object has no attribute 'free'