In [119]:
import math
import numpy

def binary32_to_vector(binary_str, signed=False):
    """
    Split a 32-character binary string into sixteen 2-bit lanes.

    The string is read left to right, two characters at a time, and each
    pair is parsed as a base-2 integer.

    Parameters:
        binary_str (str): binary string of exactly 32 characters
        signed (bool): when True, lanes whose high bit is set are mapped to
            their 2's-complement value (10 -> -2, 11 -> -1); when False the
            lanes stay in the unsigned range 0..3

    Returns:
        List[int]: the 16 decoded lane values, leftmost lane first

    Raises:
        ValueError: if binary_str is not exactly 32 characters long
    """
    if len(binary_str) != 32:
        raise ValueError("Input binary string must be exactly 32 bits")

    def decode_lane(offset):
        # Parse the two characters starting at `offset` as one lane.
        lane = int(binary_str[offset:offset + 2], 2)
        if signed and lane & 0b10:
            # High bit set -> negative in 2-bit 2's complement (subtract 2^2).
            return lane - 4
        return lane

    return [decode_lane(offset) for offset in range(0, 32, 2)]


In [120]:
# Nine 32-bit words, decoded below as sixteen unsigned 2-bit lanes each.
# (Despite the "w" in the name, a later cell uses this list as the activations.)
binary_w_list = [
    "00000000000000000000000000000000",
    "00000000000000000000000000000001",
    "00000000000000000000000000000000",
    "00010000001000000000000000000000",
    "00000000000100000000000000000000",
    "00010000000000000000000000000000",
    "00010000000000000000000000000000",
    "00100001000100000000000000010000",
    "00100000001000010000000000000000",
]
dec_a_list = [binary32_to_vector(binary_w_list[word_idx]) for word_idx in range(0, 9)]

In [121]:
dec_a_list

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 [0, 2, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]]

In [122]:
# Nine 32-bit words, decoded below as sixteen signed 2-bit (2's complement) lanes each.
signed_binary_list = [
    "10011001011101111001011110010111",
    "10011001100101110111011110010111",
    "01110111100101110111011110011001",
    "10010111100101110111100110010111",
    "01111001011101110111011101110111",
    "01111001100101110111011110011001",
    "10010111100101111001100110011001",
    "10010111100110011001100101110111",
    "01111001011110010111100101110111",
]
dec_w_list = [
    binary32_to_vector(signed_binary_list[word_idx], True) for word_idx in range(0, 9)
]

In [123]:
dec_w_list

[[-2, 1, -2, 1, 1, -1, 1, -1, -2, 1, 1, -1, -2, 1, 1, -1],
 [-2, 1, -2, 1, -2, 1, 1, -1, 1, -1, 1, -1, -2, 1, 1, -1],
 [1, -1, 1, -1, -2, 1, 1, -1, 1, -1, 1, -1, -2, 1, -2, 1],
 [-2, 1, 1, -1, -2, 1, 1, -1, 1, -1, -2, 1, -2, 1, 1, -1],
 [1, -1, -2, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1],
 [1, -1, -2, 1, -2, 1, 1, -1, 1, -1, 1, -1, -2, 1, -2, 1],
 [-2, 1, 1, -1, -2, 1, 1, -1, -2, 1, -2, 1, -2, 1, -2, 1],
 [-2, 1, 1, -1, -2, 1, -2, 1, -2, 1, -2, 1, 1, -1, 1, -1],
 [1, -1, -2, 1, 1, -1, -2, 1, 1, -1, -2, 1, 1, -1, 1, -1]]

In [124]:
# Element-wise product of each decoded dec_a_list row with the matching dec_w_list row.
sum1 = [
    numpy.multiply(dec_a_list[word_idx], dec_w_list[word_idx])
    for word_idx in range(0, 9)
]

In [125]:
# Stack the per-word product vectors into one 2-D array (one row per word).
sum_np = numpy.array(sum1)
sum_np

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  2,  0, -1,  0,  1,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0],
       [ 0, -2,  0,  0,  0, -2,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0]])

In [126]:
# Sum along axis 1: one total per row of sum_np, i.e. the dot product for each word.
f_sum = numpy.sum(sum_np,axis=1)
f_sum

array([ 0, -1,  0,  3, -1, -1,  1,  1, -3])

In [127]:
def to_hex_array_2bit(int_array):
    """
    Convert integers to 4-digit uppercase hex strings (16-bit 2's complement).

    Every element is masked with 0xFFFF before formatting, so negative
    values appear in their 16-bit 2's-complement form (-1 -> 'FFFF') and
    values outside the 16-bit range wrap around.

    Parameters:
        int_array (np.ndarray or array-like): integers (signed or unsigned)

    Returns:
        np.ndarray: array of 4-character hex strings with the same shape as
        the input. An empty input yields an empty string array (the previous
        numpy.vectorize-based implementation raised ValueError on size-0
        input because no output type could be inferred).
    """
    values = numpy.asarray(int_array)
    # tolist() hands back plain Python scalars, so the mask below can never
    # overflow a narrow numpy integer dtype.
    hex_strings = [format(v & 0xFFFF, '04X') for v in values.ravel().tolist()]
    # Explicit dtype keeps the result a string array even when it is empty.
    return numpy.array(hex_strings, dtype='U4').reshape(values.shape)

In [128]:
to_hex_array_2bit(f_sum)

array(['0000', 'FFFF', '0000', '0003', 'FFFF', 'FFFF', '0001', '0001',
       'FFFD'], dtype='<U4')

In [129]:
f_sum.sum()

np.int64(-1)

In [130]:
def recursive_partial_sums_hex(arr, index=0, current_sum=0, result=None):
    """
    Compute the running (partial) sums of an array as 16-bit hex strings.

    Each running total is masked with 0xFFFF and formatted as four uppercase
    hex digits, so negative totals appear in 16-bit 2's-complement form.

    The name and signature are kept for backward compatibility, but the work
    is done with a loop: the earlier recursive form used one stack frame per
    element and hit Python's recursion limit on inputs of roughly a thousand
    elements.

    Parameters:
        arr (list or np.ndarray): Array of integers
        index (int): Position to start from (default 0)
        current_sum (int): Sum accumulated before `index` (default 0)
        result (list): Existing list of hex strings to extend in place;
            a new list is created when None

    Returns:
        np.ndarray: Array of 4-digit hexadecimal strings, one per partial sum
        (an empty string array when there is nothing to report)
    """
    if result is None:
        result = []

    for position in range(index, len(arr)):
        current_sum += arr[position]
        result.append(format(current_sum & 0xFFFF, '04X'))

    if not result:
        # numpy.array([]) would default to float64; keep a string dtype so the
        # return type does not depend on whether the input was empty.
        return numpy.array([], dtype='U4')
    return numpy.array(result)


In [131]:
def to_hex_array_1(int_array):
    """
    Convert integers to hex strings holding only their lowest 4 bits.

    Every element is masked with 0xF and then zero-padded to four uppercase
    hex digits, so the result is always of the form '000X' (e.g. -1 -> '000F',
    17 -> '0001').

    NOTE(review): an earlier docstring described this as a 16-bit
    representation, which the 0xF mask does not produce. Confirm whether the
    mask was meant to be 0xFFFF or the width was meant to be one digit; the
    existing behaviour is kept unchanged here.

    Parameters:
        int_array (np.ndarray or array-like): integers (signed or unsigned)

    Returns:
        np.ndarray: array of 4-character hex strings with the same shape as
        the input. An empty input yields an empty string array (the previous
        numpy.vectorize-based implementation raised ValueError on size-0
        input).
    """
    values = numpy.asarray(int_array)
    # tolist() yields plain Python scalars for the bitwise mask.
    hex_strings = [format(v & 0xF, '04X') for v in values.ravel().tolist()]
    # Explicit dtype keeps the result a string array even when it is empty.
    return numpy.array(hex_strings, dtype='U4').reshape(values.shape)


In [132]:
# Running totals of the per-word sums in f_sum, as 16-bit hex strings.
hex_res = recursive_partial_sums_hex(f_sum)

In [133]:
hex_res

array(['0000', 'FFFF', 'FFFF', '0002', '0001', '0000', '0001', '0002',
       'FFFF'], dtype='<U4')

In [134]:
recursive_partial_sums_hex([0])

array(['0000'], dtype='<U4')

In [138]:
'''
code used to check Sankalpa's calculation with the formulaic calculation
'''

def hex_signed16_to_int(h):
    """
    Parse a hex string and interpret it as a signed 16-bit integer.

    Values with bit 15 set (0x8000) are treated as negative and have
    0x10000 subtracted; all other values are returned as parsed.
    """
    raw = int(h, 16)
    return raw - 0x10000 if raw & 0x8000 else raw

# Decode the hex running sums in hex_res back to signed integers so they can be
# compared by eye with the values printed by the loop further down.
hex_to_int_array = [hex_signed16_to_int(h) for h in hex_res]
print("Sankalpa calculated value: ", hex_to_int_array)


def two_bit_signed(x):
    """
    Reinterpret a 2-bit unsigned value (0..3) as 2-bit 2's complement (-2..1).

    Values with bit 1 set have 4 subtracted (2 -> -2, 3 -> -1); 0 and 1 are
    returned unchanged.
    """
    if x & 0b10:
        return x - 4
    return x


def binary32_to_vector_2bit(bits, signed=False):
    """
    Decode a 32-character binary string into sixteen 2-bit lane values.

    Characters are consumed two at a time from the left. With signed=True
    each lane is passed through two_bit_signed(); otherwise lanes stay
    unsigned (0..3).
    """
    lanes = [int(bits[start:start + 2], 2) for start in range(0, 32, 2)]
    if signed:
        lanes = [two_bit_signed(lane) for lane in lanes]
    return lanes


# Cross-check: recompute the running partial sums word by word with plain
# Python arithmetic and a 16-bit wrapping accumulator, printing every
# intermediate value along the way.
act_words_manual = binary_w_list          # 9 strings, decoded below as unsigned lanes
wgt_words_manual = signed_binary_list     # 9 strings, decoded below as signed lanes

partials = []      # per-word dot products (not wrapped)
combined = []      # running sum after each word, as a signed 16-bit value
running = 0        # accumulator, kept in the range 0..0xFFFF

for k in range(9):
    a = binary32_to_vector_2bit(act_words_manual[k], signed=False)
    w = binary32_to_vector_2bit(wgt_words_manual[k], signed=True)

    print("printing a")
    print(a)
    print("printing w")
    print(w)

    # Dot product of the 16 lanes for this word, in full Python int precision.
    p = sum(ai * wi for ai, wi in zip(a, w))
    print("printing p")
    print(p)
    partials.append(p)

    # Add to the accumulator and wrap to 16 bits.
    running = (running + p) & 0xFFFF

    # Reinterpret the wrapped accumulator as signed 16-bit (>= 0x8000 is negative).
    signed_running = running if running < 0x8000 else running - 0x10000

    print("printing running")
    print(signed_running)

    combined.append(signed_running)
    print("printing combined")
    print(combined)

print("printing combined")
print(combined)

# ReLU applied to the last running sum only.
final_val = max(combined[-1], 0)   # ReLU

print("formula running combined psums:", combined)
print("formula final output (after ReLU):", final_val)


Sankalpa calculated value:  [0, -1, -1, 2, 1, 0, 1, 2, -1]
printing a
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
printing w
[-2, 1, -2, 1, 1, -1, 1, -1, -2, 1, 1, -1, -2, 1, 1, -1]
printing p
0
printing running
0
printing combined
[0]
printing a
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
printing w
[-2, 1, -2, 1, -2, 1, 1, -1, 1, -1, 1, -1, -2, 1, 1, -1]
printing p
-1
printing running
-1
printing combined
[0, -1]
printing a
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
printing w
[1, -1, 1, -1, -2, 1, 1, -1, 1, -1, 1, -1, -2, 1, -2, 1]
printing p
0
printing running
-1
printing combined
[0, -1, -1]
printing a
[0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
printing w
[-2, 1, 1, -1, -2, 1, 1, -1, 1, -1, -2, 1, -2, 1, 1, -1]
printing p
3
printing running
2
printing combined
[0, -1, -1, 2]
printing a
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
printing w
[1, -1, -2, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1]
printing p
-1
printing running
1
printing combined
[0, -1, 

In [146]:
'''
UPDATED GENERAL CODE WITH 16-BIT ACCUMULATION (MATCHES FIRST SECTION)
'''

import numpy as np

nij_count = 36

# ---------- helper to unpack one 32-bit word into sixteen 2-bit values ----------
def binary32_to_vector(bits, signed=False):
    """
    Decode a 32-character binary string into sixteen 2-bit lane values.

    Parameters:
        bits (str): binary string of exactly 32 characters
        signed (bool): when True, lanes are interpreted as 2-bit 2's
            complement (10 -> -2, 11 -> -1); otherwise as unsigned 0..3

    Returns:
        List[int]: the 16 lane values, leftmost lane first

    Raises:
        ValueError: if bits is not exactly 32 characters long. This matches
            the check in the earlier definition of this function; without it
            an over-long line would be silently truncated to its first 32
            characters.
    """
    if len(bits) != 32:
        raise ValueError("Input binary string must be exactly 32 bits")

    vals = []
    for i in range(0, 32, 2):  # 2 bits per lane
        x = int(bits[i:i+2], 2)
        # Sign conversion is done inline so this cell does not rely on a
        # helper defined in a different cell.
        if signed and x & 0b10:
            x -= 4
        vals.append(x)
    return vals


# ---------- load activations ----------
# One stripped line per entry; each line is decoded below as a 32-character binary word.
with open("activation.txt") as f:
    activation_lines = [l.strip() for l in f]

# ---------- print activation.txt ----------
# Uses the nij_count value set before this point; it is only replaced by the
# real line count further down, so the file must contain at least that many lines.
print("======= ACTIVATION VALUES (decoded per NIJ) =======")
for nij in range(nij_count):
    a = binary32_to_vector(activation_lines[nij], signed=False)
    print(f"NIJ {nij:2d}: {a}")
print("===================================================\n")

# ---------- load weights from weight_kij/ ----------
# weight_lines[k] holds the stripped lines of weight_kij/weight_k{k}.txt for k = 0..8.
weight_lines = []
for k in range(9):
    fname = f"weight_kij/weight_k{k}.txt"
    with open(fname) as f:
        weight_lines.append([l.strip() for l in f])

# Sizes now come from the files themselves.
nij_count  = len(activation_lines)
cout_count = len(weight_lines[0])

# psums[nij, k, cout] = 16-bit signed combined psum
psums = np.zeros((nij_count, 9, cout_count), dtype=int)
golden_outputs = np.zeros((nij_count, cout_count), dtype=int)

for nij in range(nij_count):

    act_word = activation_lines[nij]
    a = np.array(binary32_to_vector(act_word, signed=False), dtype=int)  # 16 unsigned 2-bit lanes
    print("Printing a")
    print(a)

    for k in range(9):
        print("Printing k")
        print(k)
        print("Printint a")
        print(a)
        # The accumulator is reset once per (nij, k) pair, NOT once per cout.
        running = 0               # 16-bit accumulator
        psum = np.zeros(cout_count, dtype=int)

        for cout in range(cout_count):

            # decode weights (16 signed 2-bit lanes)
            w = np.array(
                binary32_to_vector(weight_lines[k][cout], signed=True),
                dtype=int
            )
            print("Printing w")
            print(w)
            # Full-precision dot product of activations and this weight word.
            p = int(np.dot(a, w))
            print("Printing p")
            print(p)
            # Add to the accumulator and wrap to 16 bits.
            # NOTE(review): because `running` is not reset inside this loop,
            # psum[cout] is the cumulative total of the dot products for
            # weight lines 0..cout rather than the dot product for `cout`
            # alone — confirm this is the intended accumulation order.
            running = (running + p) & 0xFFFF
            print("Printing running")
            print(running)
            # Reinterpret the wrapped accumulator as signed 16-bit.
            signed_running = running if running < 0x8000 else running - 0x10000
            print("Printing signed_running")
            print(signed_running)
            psum[cout] = signed_running

        psums[nij][k] = psum
        # These prints dump the full arrays on every (nij, k) iteration.
        print("Printing psums")
        print(psums)
        print("Printing golden_outputs")
        print(golden_outputs)
        # NOTE(review): assigned on every k (after the print above), so each
        # row ends up holding the ReLU of the k=8 psum only; golden_outputs is
        # not read again in this cell.
        golden_outputs[nij] = np.maximum(psum, 0)


print("psums shape (NIJ x 9 x COUT):", psums.shape)

for k in range(9):
    for nij in range(nij_count):
        row = " ".join(f"{v:6d}" for v in psums[nij][k])
        print(f"NIJ {nij:2d}, k={k}:   {row}")


# ---------- 3×3 kernel combination ----------
# nij_kernel lists the starting NIJ index for each of the 16 outputs.
# presumably the activations form a 6-wide row-major grid (2 + 4 = 6) — TODO confirm
nij_kernel = [0,1,2,3,6,7,8,9,12,13,14,15,18,19,20,21]
out = np.zeros((16, cout_count), int)

for kernel in range(16):

    nij = nij_kernel[kernel]

    for k in range(9):
        out[kernel] += psums[nij][k]

        # Step to the next NIJ: +1 normally, +4 after taps 2 and 5.
        if k in (2, 5):
            nij += 4
        else:
            nij += 1

    # after summing 9 taps → ReLU
    out[kernel] = np.maximum(out[kernel], 0)


print("\nOutputs (after ReLU):")
for kernel in range(16):
    row = " ".join(f"{v:6d}" for v in out[kernel])
    print(f"kernel {kernel:2d} out:   {row}")


# Each value is masked to 16 bits before formatting.
print("\nOutputs for each kernel in hex:")
for kernel in range(16):
    hex_row = " ".join(f"{v & 0xFFFF:04x}" for v in out[kernel])
    print(f"kernel {kernel:2d} hex:   {hex_row}")


print("\nOutputs in binary:")
for kernel in range(16):
    bin_row = " ".join(f"{v & 0xFFFF:016b}" for v in out[kernel])
    print(f"kernel {kernel:2d} bin:   {bin_row}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 1 2 1 0 0 0 0 0 0 1 3 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Printing k
6
Printint a
[0 0 0 2 0 1 0 0 0 0 2 0 0 0 0 0]
Printing w
[-1 -1  0  1  0  1 -1 -1 -1 -1 -1 -1 -1 -1  0  1]
Printing p
1
Printing running
1
Printing signed_running
1
Printing w
[ 0  1  0  1  0  1  0  1  0  1 -1 -1 -1 -1 -1 -1]
Printing p
1
Printing running
2
Printing signed_running
2
Printing

In [155]:
import numpy as np

nij_count = 36

# ---------- helper to unpack one 32-bit word into lanes ----------
def binary32_to_vector(bits, signed=False):
    """
    Decode a 32-character binary string into sixteen 2-bit lane values.

    Parameters:
        bits (str): binary string of exactly 32 characters
        signed (bool): when True, lanes are interpreted as 2-bit 2's
            complement (10 -> -2, 11 -> -1); otherwise as unsigned 0..3

    Returns:
        List[int]: the 16 lane values, leftmost lane first

    Raises:
        ValueError: if bits is not exactly 32 characters long. Without this
            check a line longer than 32 characters would be silently
            truncated to its first 32 characters.
    """
    if len(bits) != 32:
        raise ValueError("Input binary string must be exactly 32 bits")

    vals = []
    for i in range(0, 32, 2):  # 2 bits per lane
        x = int(bits[i:i+2], 2)
        if signed:
            x = x - 4 if x & 0b10 else x  # 2-bit signed
        vals.append(x)
    return vals


# ---------- load activations ----------
# One stripped line per entry; each line is decoded below as a 32-character binary word.
with open("activation.txt") as f:
    activation_lines = [l.strip() for l in f]

# ---------- print activation.txt ----------
# Uses the nij_count value set before this point; the real line count only
# replaces it further down, so the file must contain at least that many lines.
print("======= ACTIVATION VALUES (decoded per NIJ) =======")
for nij in range(nij_count):
    a = binary32_to_vector(activation_lines[nij], signed=False)
    print(f"NIJ {nij:2d}: {a}")
print("===================================================\n")


# ---------- load weights ----------
# weight_lines[k] holds the stripped lines of weight_kij/weight_k{k}.txt for k = 0..8.
weight_lines = []
for k in range(9):
    fname = f"weight_kij/weight_k{k}.txt"
    with open(fname) as f:
        weight_lines.append([l.strip() for l in f])

# Sizes now come from the files themselves.
nij_count = len(activation_lines)
cout_count = len(weight_lines[0])

# ---------- compute psums ----------
# psums[nij, k, cout] holds the signed 16-bit accumulator value recorded at each step.
psums = np.zeros((nij_count, 9, cout_count), dtype=int)
golden_outputs = np.zeros((nij_count, cout_count), dtype=int)

for nij in range(nij_count):
    a = np.array(binary32_to_vector(activation_lines[nij], signed=False), dtype=int)

    for k in range(9):
        # The accumulator is reset once per (nij, k) pair, NOT once per cout.
        running = 0
        psum = np.zeros(cout_count, dtype=int)
        for cout in range(cout_count):
            w = np.array(binary32_to_vector(weight_lines[k][cout], signed=True), dtype=int)
            # Full-precision dot product, then add and wrap to 16 bits.
            p = int(np.dot(a, w))
            # NOTE(review): `running` carries over between cout iterations, so
            # psum[cout] is the cumulative total for weight lines 0..cout
            # rather than the dot product for `cout` alone — confirm intended.
            running = (running + p) & 0xFFFF
            # Reinterpret the wrapped accumulator as signed 16-bit.
            signed_running = running if running < 0x8000 else running - 0x10000
            psum[cout] = signed_running

        psums[nij][k] = psum
        # NOTE(review): overwritten on every k, so each row ends up holding the
        # ReLU of the k=8 psum only; golden_outputs is not read again in this cell.
        golden_outputs[nij] = np.maximum(psum, 0)


# ---------- print psums ----------
print("psums shape (NIJ x 9 x COUT):", psums.shape)
for k in range(9):
    for nij in range(nij_count):
        row = " ".join(f"{v:6d}" for v in psums[nij][k])
        print(f"NIJ {nij:2d}, k={k}:   {row}")


# ---------- 3x3 kernel combination ----------
# nij_kernel lists the starting NIJ index for each of the 16 outputs.
# presumably the activations form a 6-wide row-major grid (2 + 4 = 6) — TODO confirm
nij_kernel = [0,1,2,3,6,7,8,9,12,13,14,15,18,19,20,21]
out = np.zeros((16, cout_count), int)

for kernel in range(16):
    nij = nij_kernel[kernel]
    for k in range(9):
        out[kernel] += psums[nij][k]
        # Step to the next NIJ: +1 normally, +4 after taps 2 and 5.
        if k in (2, 5):
            nij += 4
        else:
            nij += 1
    # ReLU after all 9 taps have been summed.
    out[kernel] = np.maximum(out[kernel], 0)


# ---------- print outputs ----------
# Here k indexes the 16 rows of `out`, even though the printed label says "NIJ".
print("\nOutputs (after ReLU):")
for k in range(16):
    row = " ".join(f"{v:6d}" for v in out[k])
    print(f"NIJ {k:2d} out:     {row}")

# Each value is masked to 16 bits before formatting.
print("\nOutputs for each NIJ in hex (after ReLU):")
for k in range(16):
    hex_row = " ".join(f"{v & 0xFFFF:04x}" for v in out[k])
    print(f"NIJ {k:2d} hex:     {hex_row}")

print("\nOutputs in binary:")
for k in range(16):
    bin_row = " ".join(f"{v & 0xFFFF:016b}" for v in out[k])
    print(f"NIJ {k:2d} bin:     {bin_row}")



NIJ  0: [0, 0, 0, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]
NIJ  1: [1, 0, 0, 2, 0, 2, 0, 1, 0, 0, 2, 0, 0, 1, 0, 1]
NIJ  2: [1, 0, 2, 2, 0, 0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0]
NIJ  3: [1, 0, 2, 2, 0, 0, 1, 0, 1, 0, 2, 2, 0, 0, 0, 1]
NIJ  4: [0, 0, 0, 2, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]
NIJ  5: [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]
NIJ  6: [0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1]
NIJ  7: [0, 0, 1, 2, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1]
NIJ  8: [0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1]
NIJ  9: [0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1]
NIJ 10: [0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, 1]
NIJ 11: [0, 0, 0, 2, 0, 1, 1, 0, 1, 0, 2, 0, 0, 2, 0, 1]
NIJ 12: [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1]
NIJ 13: [0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]
NIJ 14: [0, 0, 0, 2, 0, 2, 0, 0, 1, 0, 2, 1, 0, 0, 0, 0]
NIJ 15: [0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0]
NIJ 16: [0, 0, 0, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]
NIJ 17: [1, 0, 0, 2, 0, 2, 0, 1

In [150]:
# Write the first 16 rows of `out` to output.txt: one row per line, each value
# masked to 16 bits and rendered as a 16-digit binary string, space-separated.
with open("output.txt", "w") as out_file:
    out_file.writelines(
        " ".join(format(value & 0xFFFF, "016b") for value in out[row_idx]) + "\n"
        for row_idx in range(16)
    )