In [63]:
import args as cim_args
import torch
import math
import numpy as np

# Resistance values of the two memory-cell states (high- and low-resistance state).
# NOTE(review): units are presumably ohms — confirm against the device model.
# NOTE(review): the inline remark below refers to an on/off ratio ("onoff") that
# is not defined in this notebook; HRS is deliberately not an exact multiple of it.
HRS = 148000  # if HRS = LRS*onoff then some issues occur, set this slightly higher
LRS = 1000

# define arguments for simulation
# mem_values is ordered [HRS, LRS]: a later cell indexes this tensor with the
# weight digits, so digit 0 selects HRS and digit 1 selects LRS.
# open_rows is the number of weight rows processed per partition, and
# adc_precision sets the number of reference voltages (2**adc_precision).
args = cim_args.CIMArgs(
    inference=True,
    batch_size=1,
    mem_values=torch.tensor([HRS, LRS], device="cuda"),
    sub_array=[147, 520],
    open_rows=21,
    adc_precision=5,
    weight_precision=8,
    input_precision=8,
)

# load inputs
# NOTE(review): torch.load deserialises pickled data — only load trusted files.
# The tensors are displayed below as int32 on cuda:0.
input2d = torch.load("../bin/quant_input.bin")
weight2d = torch.load("../bin/quant_weight.bin")

# store original weight shape
weight2d_shape = weight2d.shape

In [64]:
args.adc_precision

5

In [65]:
input2d

tensor([[ 81,  81,  81,  ..., 133, 121, 124],
        [ 81,  81,  81,  ..., 124, 134, 128],
        [ 81,  81,  81,  ..., 128, 134, 112],
        ...,
        [101, 114, 113,  ...,  81,  81,  81],
        [113,  94,  46,  ...,  81,  81,  81],
        [ 81,  81,  81,  ...,  81,  81,  81]], device='cuda:0',
       dtype=torch.int32)

In [66]:
np.savetxt("../csv/input2d.csv", input2d.cpu().numpy(), delimiter=",", fmt="%d")

In [67]:
weight2d

tensor([[126, 123, 129,  ..., 123, 127, 127],
        [127, 130, 125,  ..., 127, 127, 127],
        [130, 129, 126,  ..., 138, 126, 127],
        ...,
        [105, 133, 116,  ..., 144, 120, 127],
        [143, 124, 131,  ..., 153, 134, 127],
        [124, 119, 132,  ..., 121, 134, 127]], device='cuda:0',
       dtype=torch.int32)

Breakdown of simulate_array()

In [68]:
## convert weights to eNVM cell values

# Every weight is expanded into `num_digits` base-`base` digits, most significant
# digit first, where `base` is the number of distinct memory-cell states.
base = len(args.mem_values)
bits = args.weight_precision
if base < 2:
    raise ValueError("args.mem_values must contain at least two cell states")

rows, cols = weight2d.shape
# one weight per row, so it broadcasts against the row of place values below
dec_matrix = weight2d.reshape(-1, 1).int()

# Number of base-`base` digits needed for values in [0, 2**bits).
# Counted with integer arithmetic: math.ceil(math.log(max_val, base)) is
# evaluated in floating point and can land just above an exact integer
# (e.g. math.log(2**29, 2)), which would allocate one digit too many.
max_val = 2**bits
num_digits = 0
capacity = 1
while capacity < max_val:
    capacity *= base
    num_digits += 1

# place values base**(num_digits - 1), ..., base**1, base**0, created on the
# same device as the weights instead of a hard-coded "cuda"
place_values = base ** torch.arange(num_digits, device=dec_matrix.device).flip(0)

# digit k of each weight: (weight // base**k) % base
binary_weights = dec_matrix // place_values % base

# lay the digits of each weight out side by side: (rows, cols * num_digits)
binary_weights = binary_weights.reshape(rows, num_digits * cols)

In [69]:
binary_weights

tensor([[0, 1, 1,  ..., 1, 1, 1],
        [0, 1, 1,  ..., 1, 1, 1],
        [1, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 1, 1,  ..., 1, 1, 1],
        [1, 0, 0,  ..., 1, 1, 1],
        [0, 1, 1,  ..., 1, 1, 1]], device='cuda:0')

In [70]:
# convert binary weights to resistances
# Index lookup: each digit d is replaced by args.mem_values[d], so the result
# has the same shape as binary_weights.
weights = args.mem_values[binary_weights]

In [71]:
weights

tensor([[148000,   1000,   1000,  ...,   1000,   1000,   1000],
        [148000,   1000,   1000,  ...,   1000,   1000,   1000],
        [  1000, 148000, 148000,  ...,   1000,   1000,   1000],
        ...,
        [148000,   1000,   1000,  ...,   1000,   1000,   1000],
        [  1000, 148000, 148000,  ...,   1000,   1000,   1000],
        [148000,   1000,   1000,  ...,   1000,   1000,   1000]],
       device='cuda:0')

In [72]:
# convert resistances to conductances
# The resistance matrix is rebuilt from binary_weights rather than read from
# `weights`, so this cell can be re-run safely: the previous `weights = 1 / weights`
# turned conductances back into resistances on every second execution and
# wrote conductances (formatted as integers) to the CSV.
resistances = args.mem_values[binary_weights]
np.savetxt("../csv/weight2d.csv", resistances.cpu().numpy(), delimiter=",", fmt="%d")
weights = 1 / resistances

In [73]:
# for calculating the optimal value of resistive divider
def R_opt(R1, R2):
    """Return the divider resistance for the resistance pair (R1, R2).

    With s = sqrt(R1 / R2), the original expression
    (R1 - R2 * s) / (s - 1) reduces algebraically to R2 * s, i.e. the
    geometric mean sqrt(R1 * R2). The reduced form is used here because the
    unreduced one evaluates 0 / 0 when R1 == R2.

    Args:
        R1: first resistance; a Python number or a 0-dim tensor.
        R2: second resistance, same kinds as R1; must be non-zero and have
            the same sign as R1 so that the square root is defined.

    Returns:
        R2 * sqrt(R1 / R2). The ratio is reduced to a Python float by
        math.sqrt, so the result has the type of ``R2 * float``.

    Raises:
        ZeroDivisionError: if R2 is a Python number equal to zero.
        ValueError: if R1 / R2 is negative.
    """
    return R2 * math.sqrt(R1 / R2)


# calculate reference voltages
# one reference voltage per ADC output code
num_refs = 2**args.adc_precision
# x = 1, 2, ..., num_refs
x = torch.arange(num_refs, device="cuda") + 1

vdd = args.vdd
# NOTE: these rebind the notebook-level names HRS / LRS (plain ints in the
# first cell) to elements of args.mem_values (0-dim tensors).
LRS = args.mem_values[-1]
HRS = args.mem_values[0]

# r_max: parallel resistance of x LRS cells, with no HRS contribution.
r_max = 1 / (x / LRS)
# r_min: parallel resistance of (x - 1) LRS cells plus (open_rows - (x - 1)) HRS cells.
# Despite the names, r_min is the larger of the two in the displayed values
# (r_min[0] ~ 7047.6 vs r_max[0] ~ 1000), so v_min ends up above v_max.
# NOTE(review): when x - 1 exceeds args.open_rows the HRS count is negative;
# with open_rows=21 and 32 references this affects the last entries — confirm
# those references are never reached.
r_min = 1 / ((args.open_rows - (x - 1)) / HRS + (x - 1) / LRS)

# calculate optimal value of resistive divider for largest sense margins
# Average of R_opt evaluated at the first level (index 0) and at level
# index open_rows - 1; requires open_rows <= num_refs for the index to exist.
res_divider = (
    R_opt(r_min[0], r_max[0])
    + R_opt(r_min[args.open_rows - 1], r_max[args.open_rows - 1])
) / 2

# voltage-divider outputs for the two bounding resistances of each level
v_max = vdd * (r_max / (res_divider + r_max))
v_min = vdd * (r_min / (res_divider + r_min))

# reference = midpoint between the two bounding voltages of each level
v_ref = (v_min + v_max) / 2

# update args
# ADC_output reads both attributes from args.
args.res_divider = res_divider
args.v_ref = v_ref

In [74]:
res_divider

tensor(1351.7611, device='cuda:0')

In [75]:
np.savetxt("../csv/v_ref.csv", v_ref.cpu().numpy(), delimiter=",")

In [76]:
x

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],
       device='cuda:0')

In [77]:
# v_ref.cpu().numpy().tofile('../bin/v_ref_32_c.bin')

In [78]:
r_max[0]

tensor(999.9999, device='cuda:0')

In [79]:
r_max[args.open_rows - 1]

tensor(47.6190, device='cuda:0')

In [80]:
r_min[0]

tensor(7047.6187, device='cuda:0')

In [81]:
r_min[args.open_rows - 1]

tensor(49.9831, device='cuda:0')

In [82]:
res_divider

tensor(1351.7611, device='cuda:0')

In [83]:
v_max

tensor([0.4252, 0.2700, 0.1978, 0.1561, 0.1289, 0.1098, 0.0956, 0.0846, 0.0760,
        0.0689, 0.0630, 0.0581, 0.0538, 0.0502, 0.0470, 0.0442, 0.0417, 0.0395,
        0.0375, 0.0357, 0.0340, 0.0325, 0.0312, 0.0299, 0.0287, 0.0277, 0.0267,
        0.0257, 0.0249, 0.0241, 0.0233, 0.0226], device='cuda:0')

In [84]:
v_min

tensor([0.8391, 0.3946, 0.2579, 0.1916, 0.1524, 0.1265, 0.1081, 0.0944, 0.0838,
        0.0753, 0.0684, 0.0627, 0.0578, 0.0536, 0.0500, 0.0469, 0.0441, 0.0416,
        0.0394, 0.0375, 0.0357, 0.0340, 0.0325, 0.0312, 0.0299, 0.0288, 0.0277,
        0.0267, 0.0258, 0.0249, 0.0241, 0.0234], device='cuda:0')

In [85]:
v_ref

tensor([0.6321, 0.3323, 0.2279, 0.1738, 0.1406, 0.1181, 0.1019, 0.0895, 0.0799,
        0.0721, 0.0657, 0.0604, 0.0558, 0.0519, 0.0485, 0.0455, 0.0429, 0.0406,
        0.0385, 0.0366, 0.0348, 0.0333, 0.0319, 0.0305, 0.0293, 0.0282, 0.0272,
        0.0262, 0.0253, 0.0245, 0.0237, 0.0230], device='cuda:0')

In [86]:
# define ADC_output function
def ADC_output(args, inputs, weights):
    """Digitise the column voltages produced by one group of rows.

    Args:
        args: object providing ``vdd``, ``res_divider``, ``adc_precision`` and
            ``v_ref`` (the reference voltages; flipped before use, so they are
            expected in descending order).
        inputs: input bits; cast to float32 and matrix-multiplied with
            ``weights``.
        weights: cell conductances; cast to float32.

    Returns:
        int32 tensor of ADC codes, ``2**adc_precision`` minus the bucket index
        of each column voltage, so a higher voltage yields a lower code.
    """
    activations = inputs.float()
    conductances = weights.float()

    # total conductance switched onto each column
    column_cond = activations @ conductances

    # column voltage: vdd / (1 + G * R_divider)
    bl_voltage = args.vdd / (1 + column_cond * args.res_divider)

    # bucketize needs ascending boundaries, hence the flip
    thresholds = args.v_ref.flip(0)
    bucket = torch.bucketize(bl_voltage, thresholds, out_int32=True, right=True)

    return 2**args.adc_precision - bucket

In [87]:
# Accumulators: ADC_out collects the significance-weighted ADC codes over all
# input bits; Psum holds the codes summed over partitions for one input bit.
# Allocated on the same device as the weights instead of a hard-coded "cuda".
ADC_out = torch.zeros((input2d.shape[0], weights.shape[1]), device=weights.device)
Psum = torch.zeros_like(ADC_out)

# divide the weight matrix into partitions
num_partitions = math.ceil(weights.shape[0] / args.open_rows)
print(num_partitions)

# calculate ADC outputs for each bit of the input
for i in range(args.input_precision):
    mask = 2**i
    # bit i of every input value (0 or 1); named input_bits so the builtin
    # input() is not shadowed
    input_bits = (input2d & mask) >> i

    # calculate partial sum for each partition
    Psum[:, :] = 0

    for part in range(num_partitions):
        start_row = part * args.open_rows
        # slicing clamps end_row, so the last partition may be shorter
        end_row = start_row + args.open_rows

        # get digital outputs from the ADCs
        out = ADC_output(
            args, input_bits[:, start_row:end_row], weights[start_row:end_row, :]
        )

        # add partition output to total output of the sub array
        Psum += out

    # scale partial sum for input bit significance
    Psum *= mask

    # accumulate the scaled partial sum of this input bit into the total
    ADC_out += Psum

7
tensor(59., device='cuda:0')
tensor(15., device='cuda:0')
tensor(17., device='cuda:0')
tensor(18., device='cuda:0')
tensor(12, device='cuda:0', dtype=torch.int32)
tensor(16, device='cuda:0', dtype=torch.int32)
tensor(5, device='cuda:0', dtype=torch.int32)
tensor(9, device='cuda:0', dtype=torch.int32)
tensor(8, device='cuda:0', dtype=torch.int32)
tensor(5, device='cuda:0', dtype=torch.int32)
tensor(5, device='cuda:0', dtype=torch.int32)
tensor(60., device='cuda:0')
tensor(19., device='cuda:0')
tensor(53., device='cuda:0')
tensor(2., device='cuda:0')


In [88]:
# create a mask to account for the weight precision
max_val = 2**args.weight_precision
base = len(args.mem_values)
if base < 2:
    raise ValueError("args.mem_values must contain at least two cell states")

# Number of columns (digits) per weight, counted with integer arithmetic:
# math.ceil(math.log(max_val, base)) is evaluated in floating point and can
# come out one too high near exact powers of the base.
cols_per_weight = 0
capacity = 1
while capacity < max_val:
    capacity *= base
    cols_per_weight += 1

# place value of each digit column, most significant first
weights_mask = base ** torch.arange(cols_per_weight, device=ADC_out.device).flip(0)

# split output into groups of each dot product
# ADC_out itself is left untouched (no in-place reshape/scale through a view),
# so this cell gives the same result when it is re-run.
grouped = ADC_out.reshape(ADC_out.shape[0], -1, cols_per_weight)

# multiply each digit column by its place value, then add the columns of each
# group together to obtain the total output per dot product
output = (grouped * weights_mask).sum(dim=-1)

In [89]:
# compare with correct output
correct_output = torch.load("../bin/correct_output.bin")

output[0][24]

tensor(1572897., device='cuda:0')

In [90]:
correct_output[1][:]

tensor([1566390., 1518860., 1567446., 1592368., 1574229., 1565001., 1567719.,
        1560396., 1484889., 1558856., 1550843., 1561893., 1472448., 1633787.,
        1560208., 1552162., 1564499., 1555341., 1561253., 1737761., 1537762.,
        1555929., 1561823., 1550852., 1614412., 1547837., 1555974., 1594043.,
        1562478., 1257210., 1557829., 1549665., 1578656., 1571097., 1549261.,
        1591087., 1548641., 1552569., 1556776., 1652367., 1567072., 1537333.,
        1564913., 1552418., 1565795., 1556537., 1560376., 1568563., 1557226.,
        1557383., 1560979., 1557701., 1497948., 1562933., 1528514., 1558376.,
        1523191., 1562043., 1561805., 1588803., 1561691., 1554277., 1566757.,
        1564184., 1560957.], device='cuda:0')

In [91]:
# correct_output = correct_output.cpu().numpy()
np.savetxt(
    "../csv/correct_output.csv", correct_output.cpu().numpy(), delimiter=",", fmt="%d"
)

In [92]:
torch.allclose(output, correct_output)

True