In [1]:
import numpy as np
from pynq import Overlay
import time

In [2]:
ol = Overlay('svd.bit')

In [3]:
ol?

In [4]:
dma_main = ol.axi_dma_0
dma2 = ol.axi_dma_1
dma3 = ol.axi_dma_2

In [36]:
def rmse(A, B):
    return np.sqrt(np.mean((A - B) ** 2))

In [5]:
dma_main?

In [6]:
A_channel = dma_main.sendchannel
S_channel = dma_main.recvchannel
U_channel = dma2.recvchannel
V_channel = dma3.recvchannel

In [7]:
import random

In [8]:
np.random.seed(1)
data_A = np.random.uniform(low=1, high=2, size=(50, 50)).astype(np.float32)

In [9]:
data_A

array([[1.417022 , 1.7203245, 1.0001143, ..., 1.2936141, 1.2877754,
        1.1300286],
       [1.019367 , 1.6788355, 1.2116281, ..., 1.5736794, 1.0028703,
        1.617145 ],
       [1.3266449, 1.5270581, 1.8859421, ..., 1.712989 , 1.5597169,
        1.012556 ],
       ...,
       [1.1466087, 1.6124611, 1.2793204, ..., 1.0070467, 1.0595286,
        1.4814005],
       [1.9257041, 1.4055574, 1.0750326, ..., 1.1667219, 1.5208484,
        1.7402936],
       [1.4624745, 1.8696141, 1.660412 , ..., 1.5681397, 1.3434201,
        1.3506172]], dtype=float32)

In [10]:
np.shape(data_A)

(50, 50)

In [84]:
np.set_printoptions(suppress=True, precision=4)

def set_lowVal_zero(X):
    low_values_indices = abs(X) < 9e-15   # where values are low
    X[low_values_indices] = 0             # all low values set to 0
    return X

def Householder(x, i):
    alpha = -np.sign(x[i]) * np.linalg.norm(x)
    e = np.zeros(len(x)); e[i] = 1.0
    
    v = (x - alpha * e)
    w = v / np.linalg.norm(v)
    P = np.eye(len(x)) - 2 * np.outer(w, w.T)
    
    return P

def Golub_Kahan(X):
    m, n = X.shape
    J = X.copy()

    U = np.identity(m)
    V = np.identity(n)

    for i in range(n - 2):  # n-2 steps
        # --- Column reflector (left multiply)
        h = np.zeros(m)
        h[i:] = J[i:, i]
        P = Householder(h, i)
        J = set_lowVal_zero(P @ J)
        U = U @ P

        # --- Row reflector (right multiply)
        h = np.zeros(n)
        h[i+1:] = J[i, i+1:]
        Q = Householder(h, i+1)
        J = set_lowVal_zero(J @ Q)
        V = V @ Q

    return U, J, V


def givensrotation_R(a, b):
    if b == 0:
        return 1.0, 0.0
    r = np.hypot(a, b)
    c = a / r
    s = -b / r
    return c, s

def QR_givens(A):
    m, n = A.shape
    R = A.copy()
    Q = np.identity(m,dtype=float)
    for i in range(0, n - 1):
        for j in range(i + 1, m):
            cos, sin = givensrotation_R(R[i, i], R[j, i])
            R[i], R[j] = (R[i]*cos) + (R[j]*(-sin)), (R[i]*sin) + (R[j] * cos)
            Q[i], Q[j] = (Q[i]*cos) + (Q[j]*(-sin)), (Q[i]*sin) + (Q[j] * cos)
    return np.transpose(np.conj(Q)), R

def QR_iterate(A):
    n = A.shape[0]
    Q_total = np.eye(n)
    A_k = A.copy()
    for _ in range(100):
        Q, R = QR_givens(A_k)
        A_k = R @ Q
        Q_total = Q_total @ Q
    
    
        
    return Q_total, A_k

def eigenvalues(A):
    
    Q, R = QR_iterate(A)
    eigenvals = np.diag(R)
    eigenvecs = Q
    return eigenvals, eigenvecs

def fix_sign(v):
    original_shape = v.shape
    v_flat = v.flatten()
    idx = np.argmax(np.abs(v_flat))
    if v_flat[idx] < 0:
        v_flat = -v_flat
    return v_flat.reshape(original_shape)

def compute_svd(A):
    m, n = A.shape
    AAt = A @ A.T
    AtA = A.T @ A

    vals1, vecs1 = eigenvalues(AAt)
    vals2, vecs2 = eigenvalues(AtA)
    #vecs1 = fix_sign(vecs1)
    #vecs2 = fix_sign(vecs2)
    
    sigma = np.sqrt(np.clip(vals1, 0, None))
    V = vecs2

    U = vecs1

    Sigma = np.zeros((m, n))
    np.fill_diagonal(Sigma, sigma)
    
    print(vals1)
    print(vals2)
    
    U[np.abs(U) < 1e-4] = 0
    Sigma[np.abs(Sigma) < 1e-4] = 0
    V[np.abs(V) < 1e-4] = 0
    
    for i in range(min(m,n)):
        Av = A @ V[:, i]
        sig_ui = sigma[i] * U[:, i]
        
        if np.dot(Av, sig_ui) < 0: 
            U[:, i] *= -1
            #V[:, i] *= -1

    return U, Sigma, V.T

def pinv_from_svd(U, Sigma, Vt, tol=1e-10):
    m, n = Sigma.shape
    sigma_vals = np.diag(Sigma)[:min(m, n)]  

    sigma_inv = np.array([1/s if s > tol else 0 for s in sigma_vals])
    Sigma_inv = np.zeros((n, m))  
    for i in range(len(sigma_inv)):
        Sigma_inv[i, i] = sigma_inv[i]

    # A⁺ = V @ Sigma⁺ @ U.T
    return Vt.T @ Sigma_inv @ U.T

In [85]:
def pinv(A):
    u,j,v = Golub_Kahan(A)
    print(np.allclose(A, u @ j @ v.T))
    U, Sigma, V = compute_svd(j)
    print(np.allclose(U@Sigma@V,j))
    U_final = u @ U
    V_final = v.conj().T @ V.conj().T
    print(np.allclose(A, U_final @ Sigma @ V_final.T))
    return pinv_from_svd(U_final, Sigma, V_final.T)

In [89]:
def bidiagonalize_and_svd(A):
    u,j,v = Golub_Kahan(A)
    U, Sigma, V = compute_svd(j)
    U_final = u @ U
    V_final = v.conj().T @ V.conj().T
    return U_final, Sigma, V_final

In [87]:
def pinv(A):
    u,j,v = Golub_Kahan(A)
    print(np.allclose(A, u @ j @ v.T))
    U, Sigma, V = compute_svd(j)
    print(np.allclose(U@Sigma@V,j))
    U_final = u @ U
    V_final = v.conj().T @ V.conj().T
    print(np.allclose(A, U_final @ Sigma @ V_final.T))
    return pinv_from_svd(U_final, Sigma, V_final.T)

In [86]:
start = time.time()
compute_svd(data_A)
fin = time.time()
pstime = fin-start


[5656.0152   14.603    13.6648   12.5412   11.5118   11.3287   10.7519
    9.6317    9.146     8.3449    7.9251    7.1937    6.7256    6.4718
    5.9689    5.7411    4.7031    4.5443    4.2428    4.0484    3.933
    3.601     3.3937    3.1298    3.0056    3.0218    2.7998    2.2186
    1.9852    1.7358    1.6396    1.3852    1.3008    1.0997    0.9889
    0.878     0.6474    0.5618    0.4912    0.4504    0.4148    0.2852
    0.1633    0.111     0.0965    0.0567    0.0463    0.0294    0.0042
    0.0002]
[5656.0116   14.6029   13.6648   12.5412   11.5117   11.3289   10.7518
    9.6313    9.1464    8.3449    7.9251    7.1937    6.7256    6.4718
    5.9688    5.7411    4.703     4.5444    4.2428    3.9352    4.0461
    3.6009    3.3938    3.1324    3.0189    3.0059    2.7998    2.2187
    1.9852    1.7358    1.6396    1.3852    1.3008    1.0997    0.9889
    0.878     0.6474    0.5618    0.4912    0.4505    0.4148    0.2853
    0.1632    0.111     0.0965    0.0568    0.0463    0.0294    0.

In [88]:
print(pstime)

41.07524490356445


In [90]:
start = time.time()
s4,u4,v4 = bidiagonalize_and_svd(data_A)
fin = time.time()
ps2time = fin-start

[5656.0121   14.6029   13.6648   12.5412   11.5279   10.9664   11.0982
    9.6316    9.1461    8.3445    7.9254    7.1937    6.7094    6.4879
    5.9557    5.7542    4.6894    4.558     4.2426    4.0034    3.9781
    3.6009    3.3937    3.132     3.0246    2.9896    2.8109    2.2187
    1.9852    1.6412    1.7342    1.3852    1.3008    1.0997    0.9889
    0.878     0.6474    0.5618    0.4912    0.4503    0.415     0.2853
    0.1633    0.111     0.0965    0.0567    0.0463    0.0294    0.0042
    0.0002]
[5656.0121   14.6029   13.6648   12.5412   11.5279   10.9597   11.1049
    9.6316    9.1461    8.3445    7.9255    7.1937    6.7088    6.4885
    5.9552    5.7546    4.6889    4.5584    4.2426    4.0026    3.9789
    3.6009    3.3937    3.1319    3.0246    2.9889    2.8116    2.2187
    1.9852    1.6411    1.7343    1.3852    1.3008    1.0997    0.9889
    0.878     0.6474    0.5618    0.4912    0.4503    0.415     0.2853
    0.1633    0.111     0.0965    0.0567    0.0463    0.0294    0

In [91]:
print(ps2time)

40.867342948913574


In [12]:
start = time.time()
u,s,v = np.linalg.svd(data_A)
fin = time.time()
nptime = fin-start
print(nptime)

0.016895771026611328


In [13]:
s

array([7.5206467e+01, 3.8213797e+00, 3.6965921e+00, 3.5413575e+00,
       3.3952742e+00, 3.3634415e+00, 3.2789977e+00, 3.1034992e+00,
       3.0242343e+00, 2.8887510e+00, 2.8151503e+00, 2.6821060e+00,
       2.5933847e+00, 2.5439532e+00, 2.4431224e+00, 2.3960464e+00,
       2.1686695e+00, 2.1317306e+00, 2.0598218e+00, 2.0121279e+00,
       1.9830902e+00, 1.8976101e+00, 1.8422046e+00, 1.7700166e+00,
       1.7396508e+00, 1.7314295e+00, 1.6732581e+00, 1.4895160e+00,
       1.4089705e+00, 1.3174994e+00, 1.2804837e+00, 1.1769483e+00,
       1.1405319e+00, 1.0486873e+00, 9.9442625e-01, 9.3701369e-01,
       8.0463731e-01, 7.4955142e-01, 7.0088172e-01, 6.7114925e-01,
       6.4408922e-01, 5.3409749e-01, 4.0408063e-01, 3.3322820e-01,
       3.1071559e-01, 2.3818696e-01, 2.1508199e-01, 1.7145586e-01,
       6.4537309e-02, 1.2543841e-02], dtype=float32)

In [14]:
np.shape(s)

(50,)

In [15]:
np.shape(v)

(50, 50)

In [16]:
from pynq import allocate

In [17]:
input_buffer = allocate(2500, np.float32)
out1_buff = allocate(2500, np.float32)
out2_buff = allocate(2500,np.float32)
out3_buff = allocate(2500,np.float32)

In [18]:
np.copyto(input_buffer, data_A.flatten())

In [19]:
V_channel

<pynq.lib.dma._SDMAChannel at 0xffff688bdd60>

In [20]:
U_channel

<pynq.lib.dma._SDMAChannel at 0xffff68925b50>

In [21]:
start_time = time.time()
V_channel.transfer(out3_buff)
U_channel.transfer(out2_buff)
A_channel.transfer(input_buffer)
S_channel.transfer(out1_buff)
A_channel.wait()
S_channel.wait()
V_channel.wait()
U_channel.wait()
end_time = time.time()
pltime = end_time - start_time
print(pltime)

0.1335747241973877


In [22]:
out1_buff

PynqBuffer([1.2544073e-02, 0.0000000e+00, 0.0000000e+00, ...,
            0.0000000e+00, 0.0000000e+00, 7.5206345e+01], dtype=float32)

In [23]:
out2_buff

PynqBuffer([ 0.17645997, -0.1605335 ,  0.05815537, ..., -0.01146385,
            -0.10338434,  0.13784662], dtype=float32)

In [24]:
out3_buff

PynqBuffer([ 0.04684398, -0.03934851, -0.02325301, ..., -0.00524139,
             0.02409708,  0.14248185], dtype=float32)

In [25]:
u,s,v = np.linalg.svd(data_A)

In [26]:
s

array([7.5206467e+01, 3.8213797e+00, 3.6965921e+00, 3.5413575e+00,
       3.3952742e+00, 3.3634415e+00, 3.2789977e+00, 3.1034992e+00,
       3.0242343e+00, 2.8887510e+00, 2.8151503e+00, 2.6821060e+00,
       2.5933847e+00, 2.5439532e+00, 2.4431224e+00, 2.3960464e+00,
       2.1686695e+00, 2.1317306e+00, 2.0598218e+00, 2.0121279e+00,
       1.9830902e+00, 1.8976101e+00, 1.8422046e+00, 1.7700166e+00,
       1.7396508e+00, 1.7314295e+00, 1.6732581e+00, 1.4895160e+00,
       1.4089705e+00, 1.3174994e+00, 1.2804837e+00, 1.1769483e+00,
       1.1405319e+00, 1.0486873e+00, 9.9442625e-01, 9.3701369e-01,
       8.0463731e-01, 7.4955142e-01, 7.0088172e-01, 6.7114925e-01,
       6.4408922e-01, 5.3409749e-01, 4.0408063e-01, 3.3322820e-01,
       3.1071559e-01, 2.3818696e-01, 2.1508199e-01, 1.7145586e-01,
       6.4537309e-02, 1.2543841e-02], dtype=float32)

In [27]:
out1_buff

PynqBuffer([1.2544073e-02, 0.0000000e+00, 0.0000000e+00, ...,
            0.0000000e+00, 0.0000000e+00, 7.5206345e+01], dtype=float32)

In [34]:
s2 = np.sort(s)

In [35]:
sigma = np.diag(s2)

In [38]:
rmse(out1_buff,sigma.flatten())

PynqBuffer(2.5748286e-06, dtype=float32)

In [41]:
rmse(out2_buff, u.flatten())

PynqBuffer(0.20000033, dtype=float32)

In [59]:
u2_pl = np.sort(np.abs(out2_buff))

In [58]:
u2_np = np.sort(np.abs(u.flatten()))

In [60]:
rmse(u2_pl,u2_np)

PynqBuffer(4.3206005e-06, dtype=float32)

In [53]:
out2_buff

PynqBuffer([ 0.17645997, -0.1605335 ,  0.05815537, ..., -0.01146385,
            -0.10338434,  0.13784662], dtype=float32)

In [54]:
u.flatten()

array([-0.13811974,  0.01740287,  0.20241071, ...,  0.04568309,
       -0.1772769 , -0.385451  ], dtype=float32)

In [56]:
v.flatten()

array([-0.1361321 , -0.1515666 , -0.14263709, ...,  0.06443339,
       -0.08378325,  0.04950593], dtype=float32)

In [57]:
out3_buff

PynqBuffer([ 0.04684398, -0.03934851, -0.02325301, ..., -0.00524139,
             0.02409708,  0.14248185], dtype=float32)

In [61]:
v2_pl = np.sort(np.abs(out3_buff))

In [62]:
v2_np = np.sort(np.abs(u.flatten()))

In [63]:
rmse(v2_pl,v2_np)

PynqBuffer(0.0024974, dtype=float32)

In [64]:
v.flatten()

array([-0.1361321 , -0.1515666 , -0.14263709, ...,  0.06443339,
       -0.08378325,  0.04950593], dtype=float32)

In [65]:
out3_buff

PynqBuffer([ 0.04684398, -0.03934851, -0.02325301, ..., -0.00524139,
             0.02409708,  0.14248185], dtype=float32)

In [66]:
u.flatten()

array([-0.13811974,  0.01740287,  0.20241071, ...,  0.04568309,
       -0.1772769 , -0.385451  ], dtype=float32)

In [67]:
out2_buff

PynqBuffer([ 0.17645997, -0.1605335 ,  0.05815537, ..., -0.01146385,
            -0.10338434,  0.13784662], dtype=float32)

In [70]:
sigma_pl = np.reshape(out1_buff, (50,50))
u_pl = np.reshape(out2_buff, (50,50))
v_pl = np.reshape(out3_buff, (50,50))

In [76]:
u_pl @ sigma_pl @v_pl.T

PynqBuffer([[1.4170175, 1.7203207, 1.0001138, ..., 1.2936112, 1.2877737,
             1.130027 ],
            [1.0193673, 1.6788338, 1.2116269, ..., 1.5736783, 1.002873 ,
             1.617142 ],
            [1.3266394, 1.5270553, 1.8859369, ..., 1.7129837, 1.5597138,
             1.0125535],
            ...,
            [1.1466067, 1.6124578, 1.2793186, ..., 1.0070466, 1.0595291,
             1.481396 ],
            [1.925697 , 1.4055539, 1.07503  , ..., 1.1667213, 1.5208457,
             1.7402854],
            [1.46247  , 1.8696091, 1.6604072, ..., 1.5681379, 1.3434174,
             1.3506132]], dtype=float32)

In [77]:
data_A

array([[1.417022 , 1.7203245, 1.0001143, ..., 1.2936141, 1.2877754,
        1.1300286],
       [1.019367 , 1.6788355, 1.2116281, ..., 1.5736794, 1.0028703,
        1.617145 ],
       [1.3266449, 1.5270581, 1.8859421, ..., 1.712989 , 1.5597169,
        1.012556 ],
       ...,
       [1.1466087, 1.6124611, 1.2793204, ..., 1.0070467, 1.0595286,
        1.4814005],
       [1.9257041, 1.4055574, 1.0750326, ..., 1.1667219, 1.5208484,
        1.7402936],
       [1.4624745, 1.8696141, 1.660412 , ..., 1.5681397, 1.3434201,
        1.3506172]], dtype=float32)

In [78]:
rmse(u_pl @ sigma_pl @v_pl.T, data_A)

PynqBuffer(4.163147e-06, dtype=float32)

In [81]:
v.T

array([[-0.1361321 ,  0.1864823 , -0.06351586, ...,  0.02325287,
         0.03934882,  0.04684435],
       [-0.1515666 ,  0.1225536 , -0.13381521, ...,  0.03810025,
         0.12809914,  0.15667093],
       [-0.14263709,  0.04732468, -0.01970438, ..., -0.08503009,
         0.23170668,  0.1224163 ],
       ...,
       [-0.14495319,  0.01055746,  0.11370312, ..., -0.187727  ,
        -0.2131916 ,  0.06443339],
       [-0.14391916, -0.126213  ,  0.00557159, ...,  0.19514473,
        -0.00090396, -0.08378325],
       [-0.14248198, -0.02410052,  0.00523833, ..., -0.13740343,
        -0.06927516,  0.04950593]], dtype=float32)

In [83]:
v_pl

PynqBuffer([[ 0.04684398, -0.03934851, -0.02325301, ...,  0.06351703,
             -0.1864809 ,  0.136132  ],
            [ 0.15667138, -0.12809853, -0.03810072, ...,  0.13381523,
             -0.12255251,  0.1515666 ],
            [ 0.12241694, -0.2317065 ,  0.08503027, ...,  0.01970365,
             -0.04732518,  0.14263701],
            ...,
            [ 0.06443322,  0.21319123,  0.18772684, ..., -0.11370131,
             -0.01055695,  0.1449532 ],
            [-0.0837843 ,  0.00090429, -0.1951447 , ..., -0.00557078,
              0.12621458,  0.14391923],
            [ 0.0495059 ,  0.06927635,  0.13740216, ..., -0.00524139,
              0.02409708,  0.14248185]], dtype=float32)

In [103]:
np.max(u2_pl)

PynqBuffer(0.5604, dtype=float32)

In [104]:
np.min(u2_np)

5.1366544e-05

In [95]:
np.max(v2_pl)

PynqBuffer(0.502, dtype=float32)

In [102]:

np.min(v2_np)

5.1366544e-05

In [107]:
min = 100
for _ in v2_pl:
    if(_ != 0):
        if(_ < min):
            min = _

min

2.1711445e-05

In [96]:
np.max(sigma)

75.20647

In [99]:
np.min(s)

0.012543841

In [108]:
out1_buff

PynqBuffer([ 0.0125,  0.    ,  0.    , ...,  0.    ,  0.    , 75.2063],
           dtype=float32)

In [109]:
s

array([75.2065,  3.8214,  3.6966,  3.5414,  3.3953,  3.3634,  3.279 ,
        3.1035,  3.0242,  2.8888,  2.8152,  2.6821,  2.5934,  2.544 ,
        2.4431,  2.396 ,  2.1687,  2.1317,  2.0598,  2.0121,  1.9831,
        1.8976,  1.8422,  1.77  ,  1.7397,  1.7314,  1.6733,  1.4895,
        1.409 ,  1.3175,  1.2805,  1.1769,  1.1405,  1.0487,  0.9944,
        0.937 ,  0.8046,  0.7496,  0.7009,  0.6711,  0.6441,  0.5341,
        0.4041,  0.3332,  0.3107,  0.2382,  0.2151,  0.1715,  0.0645,
        0.0125], dtype=float32)

In [112]:
c = np.reshape(out2_buff, (50,50))

In [113]:
u

array([[-0.1381,  0.0174,  0.2024, ..., -0.0582,  0.1605,  0.1765],
       [-0.1415, -0.0105,  0.1522, ..., -0.1542, -0.126 ,  0.0806],
       [-0.1384,  0.3491, -0.0919, ..., -0.0639,  0.0047,  0.0868],
       ...,
       [-0.1315,  0.128 , -0.1518, ...,  0.3815, -0.0033,  0.0683],
       [-0.1363, -0.0881, -0.2158, ..., -0.025 , -0.1825,  0.0397],
       [-0.1378,  0.1034,  0.0115, ...,  0.0457, -0.1773, -0.3855]],
      dtype=float32)

In [114]:
c

PynqBuffer([[ 0.1765, -0.1605,  0.0582, ..., -0.2024, -0.0174,  0.1381],
            [ 0.0806,  0.126 ,  0.1542, ..., -0.1522,  0.0105,  0.1415],
            [ 0.0868, -0.0047,  0.0639, ...,  0.0919, -0.3491,  0.1384],
            ...,
            [ 0.0683,  0.0033, -0.3815, ...,  0.1518, -0.128 ,  0.1315],
            [ 0.0397,  0.1825,  0.025 , ...,  0.2158,  0.0881,  0.1363],
            [-0.3855,  0.1773, -0.0457, ..., -0.0115, -0.1034,  0.1378]],
           dtype=float32)

In [119]:
d = np.reshape(out3_buff, (50,50))

In [120]:
v.T

array([[-0.1361,  0.1865, -0.0635, ...,  0.0233,  0.0393,  0.0468],
       [-0.1516,  0.1226, -0.1338, ...,  0.0381,  0.1281,  0.1567],
       [-0.1426,  0.0473, -0.0197, ..., -0.085 ,  0.2317,  0.1224],
       ...,
       [-0.145 ,  0.0106,  0.1137, ..., -0.1877, -0.2132,  0.0644],
       [-0.1439, -0.1262,  0.0056, ...,  0.1951, -0.0009, -0.0838],
       [-0.1425, -0.0241,  0.0052, ..., -0.1374, -0.0693,  0.0495]],
      dtype=float32)

In [121]:
d

PynqBuffer([[ 0.0468, -0.0393, -0.0233, ...,  0.0635, -0.1865,  0.1361],
            [ 0.1567, -0.1281, -0.0381, ...,  0.1338, -0.1226,  0.1516],
            [ 0.1224, -0.2317,  0.085 , ...,  0.0197, -0.0473,  0.1426],
            ...,
            [ 0.0644,  0.2132,  0.1877, ..., -0.1137, -0.0106,  0.145 ],
            [-0.0838,  0.0009, -0.1951, ..., -0.0056,  0.1262,  0.1439],
            [ 0.0495,  0.0693,  0.1374, ..., -0.0052,  0.0241,  0.1425]],
           dtype=float32)