# Matrix Multiplication Verilog Design Generation

- Matrix -> Index array -> Generate optimized matrix multiplication Verilog design
- Matrix -> Generate baseline matrix multiplication Verilog design (hardcoded version)
- Matrix -> Generate baseline matrix multiplication Verilog design (shift and add version)
- Matrix -> Generate baseline matrix multiplication Verilog design (single-bit version)

In [6]:
import numpy as np
np.set_printoptions(linewidth=400)
from fxpmath import Fxp

## Optimized matrix multiplication

In [7]:
def check_one_hot(matrix):
    """Return whether every row of ``matrix`` sums to exactly 1.

    The test is on the row sums only, so it is the caller's job to pass a
    0/1 matrix if a true one-hot check is wanted.
    """
    row_totals = np.sum(matrix, axis=1)
    every_row_is_one = row_totals == 1
    return np.all(every_row_is_one)

In [8]:
# Process the input matrix into an array that stores the required positions.
# Row format of the array is: [left#, right#, target#, 'operation'].
# When target# equals -1, the row is an output assignment ('=') built from
# left# and right#.
def preprocess_matrix_to_array(matrix):
    """Build the operation list (index array) for a signed integer matrix.

    The matrix is expanded into a binary matrix (``bit`` two's-complement
    digits per element, most significant first).  One shift operation is
    recorded per binary column, then pairs of columns that share ones in the
    same rows are greedily merged into new columns, each merge being recorded
    as a ``'+'`` operation.  Finally one ``'='`` row is recorded per matrix
    row.  Progress and the final state are printed along the way.

    Args:
        matrix: 2-D numpy integer array (``.shape``, ``.size`` and ``.ndim``
            are read, and elements are passed to ``np.binary_repr``).

    Returns:
        list: ``index_array``.  Row 0 is ``[rows, cols, bit, 'parameters']``;
        shift rows are ``[mem_index, input_index, shift, '<<' or '-<<']``;
        merge rows are ``[left_col, right_col, new_col, '+']``; output rows
        are ``[row, col, -1, '=']``.
    """
    # print matrix
    print("Size：",matrix.size)      
    print("Shape：",matrix.shape)        
    print("Dimension",matrix.ndim)      
    print(matrix,"\n")
    
    # define parameters
    rows1, cols1 = matrix.shape
    # bits for the largest magnitude plus one extra bit (sign)
    bit = int(np.ceil(np.log2(np.max(np.abs(matrix))+1)))+1
    # total number of binary columns: `bit` columns per matrix column
    bin_weight = cols1*bit
    
    # the first row of index_array is to store required parameters, [rows, cols, bit, 'parameters']
    index_array = []
    index_array.append([rows1, cols1, bit,'parameters'])
    
    
    # preprocess input matrix into binary matrix
    bin_matrix = np.zeros(shape=(rows1, bin_weight),dtype=np.int32)
    for i in range(0, rows1):
        for j in range(0, cols1):
            # `bit`-character two's-complement string of the element
            a = np.binary_repr(matrix[i][j],bit)
            for k in range(0,bit):
                    # a[k] is a '0'/'1' character stored into the int32 matrix
                    bin_matrix [i][k+bit*j]=a[k]
     
    # One shift operation per binary column; `a` is the 1-based index of the
    # matrix column that binary column i belongs to.
    a = 1
    for i in range(bin_weight):
        if((i%bit)==0):
            # first column of each group (sign bit of the element): negated term
            print("MEM[%d] = -(input_vector[%d] << %d);" %(i,a-1,a*bit-i-1))
            index_array.append([i,a-1,a*bit-i-1,'-<<'])
        else:
            print("MEM[%d] = input_vector[%d] << %d;" %(i,a-1,a*bit-i-1))
            index_array.append([i,a-1,a*bit-i-1,'<<'])
        if((i+1)%bit==0):
            a+=1
    
    rows2, cols2 = bin_matrix.shape
    # find the number of max matching ones of input binary matrix
    # (largest count of rows in which two distinct columns are both 1)
    max_matching = 0
    for i in range(cols2 - 1):
        for j in range(i + 1, cols2):
            matching = np.sum((bin_matrix[:, i] == 1) & (bin_matrix[:, j] == 1))
            if matching > max_matching:
                max_matching = matching
                
    # process binary matrix
    cnt = 0 #count number of matches
    add_count = [0]*rows2 #count number of additions 
    one_count = max_matching #count number of one in column
    loop_range = cols2
    # NOTE(review): once one_count reaches 1 it stays at 1 and the loop only
    # exits through the one-hot check below; a matrix that can never become
    # one-hot (e.g. an all-zero row) looks like it would loop forever - confirm.
    while(one_count):
        print("\n Max number of 1: %d" %(one_count))
        # The outer range is built once per pass; the inner range is rebuilt
        # for every i and therefore sees columns appended earlier in the pass.
        for i in range(0,loop_range):
            for j in range(i+1,loop_range):
                if((np.sum(bin_matrix[:,i]==1))>=one_count):
                    if((np.count_nonzero(bin_matrix[:,[i]]&bin_matrix[:,[j]]))==one_count): 
                        loop_range+=1
                        print("MEM[%d] = MEM[%d] + MEM[%d];"%(bin_weight+cnt,i,j))
                        index_array.append([i,j,bin_weight+cnt,'+'])
                        # append a new column holding the shared ones of i and j ...
                        bin_matrix = np.hstack((bin_matrix, np.zeros((rows1, 1), dtype=np.int32)))
                        bin_matrix[:,[bin_weight+cnt]] = bin_matrix[:,[i]]&bin_matrix[:,[j]]
                        # ... and clear those shared ones from both source columns
                        bin_matrix[:,[i]] = bin_matrix[:,[i]]&(~bin_matrix[:,[bin_weight+cnt]])
                        bin_matrix[:,[j]] = bin_matrix[:,[j]]&(~bin_matrix[:,[bin_weight+cnt]])
                        cnt+=1
                        #new_cols_added = True
                        add_count[rows2-one_count] += 1
        print("Counts of Additions for %d one match:  %d" %(one_count,add_count[rows2-one_count]))
        if(one_count>1):
            one_count=one_count-1
        else:
            one_count = 1
        # done as soon as every row sums to exactly 1
        if check_one_hot(bin_matrix):
            break
    print("Total Counts of Additions:%d \n" %(cnt))
    
    # find output index
    output_index = []
    # Iterate over the rows
    for row_index, row in enumerate(bin_matrix):
        # Iterate over the columns
        for col_index, value in enumerate(row):
            if value == 1:
                # only the first column containing a 1 is kept for each row
                output_index.append((row_index,col_index))
                break  
    # NOTE(review): a row without any 1 adds no entry to output_index, so the
    # loop below would raise IndexError; the printed row index comes from
    # output_index while the stored one is the loop counter h - they agree
    # only when every row contributed an entry.
    h = 0
    for i in range(rows2):
        print("output_vector[%d] = MEM[%d]; " %(output_index[h][0], output_index[h][1]))
        index_array.append([h,output_index[h][1],-1,'='])
        h += 1
    
    # print binary matrix
    print("\nSize：",bin_matrix .size)      
    print("Shape：",bin_matrix .shape)         
    print("Dimension",bin_matrix .ndim)      
    print(bin_matrix, "\n")
    
    # print index array 
    for row in index_array:
        print(row)
        
    return index_array

In [9]:
def minimize_operation_matrix(matrix, matrix_binarized = False, bit_size = 8):
    """Build the operation list (index array) with a vectorised column search.

    Each matrix element is expanded to ``bit_size`` binary digits (most
    significant first), one shift operation is recorded per binary column,
    and columns sharing ones in the same rows are greedily merged into new
    columns, each merge being recorded as a ``'+'`` operation.

    Args:
        matrix: given m x n matrix (2-D numpy integer array)
        bit_size: bit width of each element in the matrix
        matrix_binarized: flag to show whether matrix is already binarized or not.
            When True, ``matrix`` is used directly as the working matrix
            without a copy, so column writes below modify it in place.
            NOTE(review): the shift rows and the first free column are still
            derived from ``matrix.shape[1] * bit_size`` in that case - confirm
            this is what callers expect.
        
    Output:
        index_array: format of array is: [left#, right#, target#, 'operation'] 
            and first row is: [rows, cols, bit, 'parameters'].  The trailing
            rows are ``[row, col, -1, '=']``, one per non-zero entry left in
            the working matrix.
        bin_matrix_ext: the working binary matrix after all merges.
    """
    # bit_size = bit_size+1
    rows1, cols1 = matrix.shape 
    # working width: three times the number of binary columns, leaving room
    # for columns created by merges
    bin_max_length = cols1*bit_size*3
    # number of original binary columns; merged columns are written from here on
    bin_weight = cols1*bit_size
    
    if not matrix_binarized:
        bin_matrix_ext = np.zeros( (rows1, bin_max_length), dtype=np.uint8)
        # Element wise conversion of each element to binary number for 2^bit notation
        bin_matrix = ((matrix.reshape(-1, 1) & (2**np.arange(bit_size)[::-1])) != 0).astype(int)
        bin_matrix = bin_matrix.reshape(rows1, -1)
        # Copying elements of bin matrix to extended binarized matrix
        bin_matrix_ext[:,:bit_size*cols1] = bin_matrix
    else:
        bin_matrix_ext = matrix
    
    # find the number of max ones of input binary matrix (largest column sum)
    max_matching = bin_matrix_ext.sum(axis=0).max()
  
    # process binary matrix
    rows2, cols2 = bin_matrix_ext.shape
    cnt = 0 #count number of matches
    add_count = [0]*rows2 #count number of additions (tallied but not returned)
    one_count = int(max_matching) #count number of one in column

    # the first row of index_array is to store required parameters, [rows, cols, bit_size, 'parameters']
    index_array = []
    index_array.append([rows1, cols1, bit_size,'parameters'])

    # One shift operation per original binary column; `a` is the 1-based
    # index of the matrix column that binary column i belongs to.
    a = 1
    for i in range(bin_weight):
        if((i%bit_size)==0):
            # first column of each group: recorded as a negated shift
            #print("MEM[%d] = -(input_vector[%d] << %d);" %(i,a-1,a*bit_size-i-1))
            index_array.append([i,a-1,a*bit_size-i-1,'-<<'])
        else:
            #print("MEM[%d] = input_vector[%d] << %d;" %(i,a-1,a*bit_size-i-1))
            index_array.append([i,a-1,a*bit_size-i-1,'<<'])
        if((i+1)%bit_size==0):
            a+=1
    
    
    # Try overlap sizes from the largest column sum down to 1.
    while(one_count >= 1):
        print("\n Max number of 1: %d" % one_count)
        i = 0
        # while(i < cols2 and bin_max_length > bin_weight+cnt):
        while(i < bin_matrix_ext.shape[1]): # and bin_max_length > bin_weight+cnt):
            # doubling the matrix size whenever it runs out of memory  
            if bin_max_length <= bin_weight+cnt+2: 
                print('Increasing matrix size, current bin_max_length:', bin_max_length, 'bin_weight:', bin_weight, 'cnt:', cnt)
                bin_max_length += bin_matrix_ext.shape[1]
                bin_matrix_ext = np.hstack((bin_matrix_ext, np.zeros((rows1, bin_matrix_ext.shape[1]), 
                                                            dtype=np.int32)))
                print('Increasing matrix size, updated bin_max_length:', bin_max_length)
            
            # print('i:', i, 'searching for col:', bin_matrix_ext[:,i])
            # print('one_count:', one_count, 'i:', i)
            if((np.sum(bin_matrix_ext[:,i]==1))>=one_count):
                # columns whose product with column i equals one_count; column i
                # itself is among them when it holds exactly one_count ones
                matching_idx = ((bin_matrix_ext[:,i] @ bin_matrix_ext) == one_count).nonzero()[0]
                # print('matching_idx:', matching_idx)
                if len(matching_idx) > 1:
                    # first matching column, skipping i when i is listed first
                    j = matching_idx[0] if i != matching_idx[0] else matching_idx[1]
                    # print('j:', j, 'found col:', bin_matrix_ext[:,j])
                    #print("MEM[%d] = MEM[%d] + MEM[%d];"%(bin_weight+cnt,i,j))
                    index_array.append([i,j,bin_weight+cnt,'+'])
                    # new column = shared ones of i and j; then clear them from both
                    bin_matrix_ext[:,bin_weight+cnt] = bin_matrix_ext[:,i]&bin_matrix_ext[:,j]
                    bin_matrix_ext[:,i] = bin_matrix_ext[:,i]&(~bin_matrix_ext[:,bin_weight+cnt])
                    bin_matrix_ext[:,j] = bin_matrix_ext[:,j]&(~bin_matrix_ext[:,bin_weight+cnt])

                    cnt += 1
                    add_count[rows2-one_count] += 1
                    # i is deliberately not advanced: the same column is
                    # examined again after a merge
                else:
                    i += 1
            else:
                i += 1
        
        # stop as soon as every row sums to exactly 1
        if check_one_hot(bin_matrix_ext):
            break
        #print("Counts of Additions for %d one match:  %d" %(one_count,add_count[rows2-one_count]))
        one_count=one_count-1
    
    # Adding non zero operations at the end: one '=' row per remaining non-zero
    # entry (a row with several remaining ones yields several '=' rows)
    nonzero_row_cols = bin_matrix_ext.nonzero() 
    nonzero_rows = nonzero_row_cols[0]
    nonzero_cols = nonzero_row_cols[1]
    index_array = index_array + [[row, col, -1, '='] for row, col in zip(nonzero_rows, nonzero_cols)]
    
    return index_array, bin_matrix_ext

In [10]:
# generate verilog
def generate_verilog(index_array):
    """Write a SystemVerilog ``matrix_multiplier`` module from an index array.

    Args:
        index_array: list of 4-element rows.  Row 0 is
            ``[rows, cols, bit, 'parameters']``.  Every later row
            ``[a, b, c, op]`` is translated as follows:

            * ``c == -1`` (any op): ``assign output_vector[a] = MEM[b];``
            * ``'<<'`` or ``'>>'``: ``assign MEM[a] = input_vector[b] <op> c;``
            * ``'-<<'``: ``assign MEM[a] = -(input_vector[b] << c);``
            * ``'+'``: ``assign MEM[c] = MEM[a] + MEM[b];``

            Rows with any other operation emit nothing.

    Side effects:
        Creates the ``output`` directory if it is missing, writes
        ``output/matrix_multiplier_{mem_size}.sv`` and prints a confirmation
        message.  Nothing is returned.
    """
    # Local import so the block does not depend on the file's import cell.
    import os

    # define parameters
    rows1, cols1 = index_array[0][0], index_array[0][1]
    bit = index_array[0][2]
    # Largest third-column value over all rows (parameter row included);
    # used as the upper bound of the MEM array and in the file name.
    mem_size = max(row[2] for row in index_array)

    assigns = []
    for row in index_array[1:]:
        if row[2] == -1:
            # target of -1 marks an output assignment
            assigns.append(f"assign output_vector[{row[0]}] = MEM[{row[1]}];\n")
        elif row[3] in ('<<', '>>'):
            assigns.append(f"assign MEM[{row[0]}] = input_vector[{row[1]}] {row[3]} {row[2]};\n")
        elif row[3] == '-<<':
            assigns.append(f"assign MEM[{row[0]}] = -(input_vector[{row[1]}] << {row[2]});\n")
        elif row[3] == '+':
            assigns.append(f"assign MEM[{row[2]}] = MEM[{row[0]}] {row[3]} MEM[{row[1]}];\n")
    added_statements = "".join(assigns)

    statements = [
        "// Verilog module\n",
        "module matrix_multiplier#(\n",
        f"    parameter ROWS = {rows1},\n",
        f"    parameter COLS = {cols1},\n",
        f"    parameter BITS = {bit},\n",
        f"    parameter MEM_SIZE = {mem_size},\n",
        "    parameter input_bit_width = 4,\n",
        f"    parameter output_bit_width = {7+bit}\n",
        ")(\n",
        "    input wire [input_bit_width-1:0] input_vector [0: COLS-1],\n",
        "    output wire[output_bit_width-1:0] output_vector [0: ROWS-1]\n",
        ");\n\n",
        "wire [output_bit_width-1:0] MEM [0:MEM_SIZE];\n\n",
        added_statements,
        "\n",
        "endmodule\n"
    ]

    # open() does not create missing directories.
    os.makedirs("output", exist_ok=True)
    # The context manager closes the file even if a write fails.
    with open(f"output/matrix_multiplier_{mem_size}.sv", "w") as verilog_file:
        verilog_file.writelines(statements)
    print("Successfully generate verilog file!")

## Baseline matrix multiplication (hardcoded)

In [11]:
def to_twos_complement(val, bit_width):
    """Map a signed integer to its unsigned ``bit_width``-bit encoding.

    Non-negative values are returned unchanged; negative values are offset
    by ``2**bit_width``.  No range check is performed.
    """
    return val if val >= 0 else (1 << bit_width) + val

In [12]:
def generate_verilog_baseline_hardcoded(matrix):
    """Write a baseline Verilog module that multiplies by hard-coded constants.

    Every matrix element becomes a constant wire ``A{i}{j}`` and every output
    ``C{i}`` is assigned the sum of ``A{i}{j}*B{j}`` over the columns.

    Args:
        matrix: rectangular 2-D sequence (nested lists or a numpy array) of
            integers.

    Raises:
        AssertionError: if the rows do not all have the same length.

    Side effects:
        Creates the ``output`` directory if it is missing and writes
        ``output/matrix_mult_{rows}x{cols}_{bit_width}.v``.  Nothing is
        returned.
    """
    # Local import so the block does not depend on the file's import cell.
    import os

    # Matrix dimensions
    rows = len(matrix)
    cols = len(matrix[0])

    # Check that the matrix is rectangular
    for row in matrix:
        assert len(row) == cols

    # Find the maximum absolute value in the matrix
    max_val = max(max(abs(val) for val in row) for row in matrix)

    # Magnitude bits plus one sign bit.  For max_val >= 1 this equals
    # floor(log2(max_val)) + 2, but it is an exact Python int (so the file
    # name reads "_16.v", not "_16.0.v") and an all-zero matrix yields a
    # width of 1 instead of failing on log2(0).
    bit_width = int(max_val).bit_length() + 1

    # Template for the Verilog module
    template = """
    module matrix_mult
        #(
        parameter input_bit_width = 4,
        parameter output_bit_width = 16
        )
        (
        {inputs}
        {outputs}
        );

        // Matrix A
        {matrix_a}

        // Matrix multiplication
        {mult_assign}

    endmodule
    """

    # Create the input, output, and matrix declarations
    inputs = [f"input wire [input_bit_width-1:0] B{j}," for j in range(cols)]
    # the last port must not be followed by a comma
    outputs = [
        "output wire [output_bit_width-1:0] C{0}{1}".format(i, "," if i < rows-1 else "")
        for i in range(rows)
    ]
    matrix_a = [
        f"wire [{bit_width - 1}:0] A{i}{j} = "
        f"{bit_width}'d{to_twos_complement(matrix[i][j], bit_width)};"
        for i in range(rows)
        for j in range(cols)
    ]

    # Create the multiplication assignments
    mult_assign = []
    for i in range(rows):
        terms = [f"A{i}{j}*B{j}" for j in range(cols)]
        mult_assign.append("assign C{0} = {1};".format(i, " + ".join(terms)))

    # Substitute into the template
    verilog_code = template.format(
        inputs="\n        ".join(inputs),
        outputs="\n        ".join(outputs),
        matrix_a="\n        ".join(matrix_a),
        mult_assign="\n        ".join(mult_assign)
    )

    # open() does not create missing directories.
    os.makedirs("output", exist_ok=True)
    with open(f'output/matrix_mult_{rows}x{cols}_{bit_width}.v', 'w') as f:
        f.write(verilog_code)
    #return verilog_code, bit_width

# Example usage:
#matrix = [[-2, 3], [4, -5]]
#verilog_code, bit_width = generate_verilog(matrix)
#print(f"Bit width: {bit_width}")
#print(verilog_code)


## Baseline matrix multiplication (shift and add)

In [13]:
def generate_verilog_baseline_shift_add(matrix):
    """Write a baseline Verilog module built from ``shift_and_add`` instances.

    Every matrix element becomes a constant wire ``A{i}{j}``; one
    ``shift_and_add`` instance per element produces ``P{i}{j}`` and every
    output ``C{i}`` is assigned the sum of ``P{i}{j}`` over the columns.

    Args:
        matrix: rectangular 2-D sequence (nested lists or a numpy array) of
            integers.

    Raises:
        AssertionError: if the rows do not all have the same length.

    Side effects:
        Creates the ``output`` directory if it is missing and writes
        ``output/matrix_mult_baseline_{rows}x{cols}.v``.  Nothing is
        returned.
    """
    # Local import so the block does not depend on the file's import cell.
    import os

    # Matrix dimensions
    rows = len(matrix)
    cols = len(matrix[0])

    # Check that the matrix is rectangular
    for row in matrix:
        assert len(row) == cols

    # Find the maximum absolute value in the matrix
    max_val = max(max(abs(val) for val in row) for row in matrix)

    # Magnitude bits plus one sign bit.  For max_val >= 1 this equals
    # floor(log2(max_val)) + 2, but it is an exact Python int and an
    # all-zero matrix yields a width of 1 instead of failing on log2(0).
    bit_width = int(max_val).bit_length() + 1

    # Template for the Verilog module
    template = """
    module matrix_mult
        #(
        parameter input_bit_width = 2,
        parameter output_bit_width = 16
        )
        (
        {inputs}
        {outputs}
        );

        // Matrix A
        {matrix_a}

        // Shift and add modules
        {shift_and_add}

    endmodule
    """

    # Create the input, output, and matrix declarations
    inputs = [f"input [input_bit_width-1:0] B{j}," for j in range(cols)]
    # the last port must not be followed by a comma
    outputs = [
        "output [output_bit_width-1:0] C{0}{1}".format(i, "," if i < rows-1 else "")
        for i in range(rows)
    ]
    # The wire range is [bit_width-1:0] so that its width matches the
    # bit_width-bit literal (the range used to be one bit too wide, which
    # zero-extended the two's-complement constant).
    matrix_a = [
        f"wire [{bit_width - 1}:0] A{i}{j} = "
        f"{bit_width}'d{to_twos_complement(matrix[i][j], bit_width)};"
        for i in range(rows)
        for j in range(cols)
    ]

    # Create the multiplication assignments
    # NOTE(review): the emitted text references WIDTH and the P{i}{j} nets,
    # neither of which is declared in the template above - confirm how the
    # external shift_and_add module and these nets are meant to be provided.
    shift_and_add = []
    for i in range(rows):
        for j in range(cols):
            shift_and_add.append("shift_and_add #(WIDTH) mult{0}{1} (.A(A{0}{1}), .B(B{1}), .P(P{0}{1}));".format(i, j))
        products = " + ".join(f"P{i}{j}" for j in range(cols))
        shift_and_add.append(f"assign C{i} = {products};")

    # Substitute into the template
    verilog_code = template.format(
        inputs="\n        ".join(inputs),
        outputs="\n        ".join(outputs),
        matrix_a="\n        ".join(matrix_a),
        shift_and_add="\n        ".join(shift_and_add)
    )

    # open() does not create missing directories.
    os.makedirs("output", exist_ok=True)
    with open(f'output/matrix_mult_baseline_{rows}x{cols}.v', 'w') as f:
        f.write(verilog_code)
    


## Baseline matrix multiplication (single bit)

In [14]:
def generate_verilog_baseline_single_bit(matrix):
    """Write a baseline Verilog module for single-bit inputs.

    For every matrix element a 32-bit wire ``A{i}{j}`` is emitted that
    selects the element's 32-bit two's-complement pattern when ``input{j}``
    is ``1'b1`` and the pattern of its negation otherwise.  Every
    ``output{i}`` is assigned the sum of ``A{i}{j}`` over the columns.

    Args:
        matrix: rectangular 2-D numpy integer array or nested list of
            integers.

    Raises:
        AssertionError: if the rows do not all have the same length.

    Side effects:
        Creates the ``output`` directory if it is missing and writes
        ``output/matrix_mult_baseline_{rows}x{cols}_single_bit.v``.  Nothing
        is returned.
    """
    # Local import so the block does not depend on the file's import cell.
    import os

    # Matrix dimensions
    rows = len(matrix)
    cols = len(matrix[0])

    # Check that the matrix is rectangular
    for row in matrix:
        assert len(row) == cols

    # The element access below uses matrix[i, j], which nested lists do not
    # support; converting first lets this generator accept the same inputs
    # as the other baseline generators.
    matrix = np.asarray(matrix)

    # Find the maximum absolute value in the matrix
    #max_val = max(max(abs(val) for val in row) for row in matrix)

    # Find the number of bits needed to represent this value
    #bit_width = np.floor(np.log2(max_val)) + 2  # +1 for sign bit, +1 because log2 is zero-based

    # Template for the Verilog module
    template = """
    module matrix_mult_single_bit
        #(
        parameter input_bit_width = 1,
        parameter output_bit_width = 32
        )
        (
        {inputs}
        {outputs}
        );
        // Matrix A
        {matrix_a}
        
        // Output_list
        {output_list}

    endmodule
    """

    # Create the input, output declarations
    inputs = [f"input input{j}," for j in range(cols)]
    # the last port must not be followed by a comma
    outputs = [
        "output [output_bit_width-1:0] output{0}{1}".format(i, "," if i < rows-1 else "")
        for i in range(rows)
    ]
    matrix_a = []
    for i in range(rows):
        for j in range(cols):
            when_one = np.binary_repr(matrix[i, j], width=32)
            when_zero = np.binary_repr(-matrix[i, j], width=32)
            matrix_a.append(f"wire [31:0] A{i}{j};")
            matrix_a.append(
                f"assign A{i}{j} = (input{j}==1'b1) ? 32'b{when_one}:32'b{when_zero};"
            )

    # Create the output sums
    output_list = []
    for i in range(rows):
        terms = [f"A{i}{j}" for j in range(cols)]
        output_list.append("assign output{0} = {1};".format(i, " + ".join(terms)))

    # Substitute into the template
    verilog_code = template.format(
        inputs="\n        ".join(inputs),
        outputs="\n        ".join(outputs),
        matrix_a="\n        ".join(matrix_a),
        output_list="\n        ".join(output_list),
    )

    # open() does not create missing directories.
    os.makedirs("output", exist_ok=True)
    with open(f'output/matrix_mult_baseline_{rows}x{cols}_single_bit.v', 'w') as f:
        f.write(verilog_code)
    

## Test

In [15]:
# Test configuration: value bound and dimensions of the random matrix.
bit = 5
INT_range = 2**bit - 1  # 31; used as the -low/+high bound of the sampler
height = 3
weight = 3  # second entry of the (height, weight) size tuple

In [16]:
# Fixed seed so the sampled matrix is the same on every run.
np.random.seed(100)
#matrix = np.random.randint(low=-INT_range ,high=INT_range,size=(height, weight))
# Real-valued samples drawn uniformly between -INT_range and INT_range.
matrix = np.random.uniform(low=-INT_range, high=INT_range,size=(height, weight))

In [17]:
# Convert to signed fixed point: 16-bit words with 10 fractional bits.
mb = Fxp(matrix, signed=True, n_word=16, n_frac=10)

In [18]:
# Scale the fixed-point values by 2**10 (matching n_frac=10) and cast to
# int64 - presumably the raw integer words; verify against fxpmath.
fmb = np.int64(np.float64(mb)*pow(2,10))

In [19]:
# Write the hard-coded-constant baseline module for the integer matrix.
generate_verilog_baseline_hardcoded(fmb)

In [20]:
# Write the shift-and-add baseline module for the integer matrix.
generate_verilog_baseline_shift_add(fmb)

In [21]:
# Write the single-bit-input baseline module for the integer matrix.
generate_verilog_baseline_single_bit(fmb)

In [22]:
# Build the operation list for the optimized design (prints its progress).
index_array = preprocess_matrix_to_array(fmb)
# Alternative, currently disabled: the vectorised builder.
# index_array, bin_matrix = minimize_operation_matrix(matrix, bit_size = 16)

Size： 9
Shape： (3, 3)
Dimension 2
[[  2755 -14070  -4792]
 [ 21889 -31444 -24025]
 [ 10840  20687 -23064]] 

MEM[0] = -(input_vector[0] << 15);
MEM[1] = input_vector[0] << 14;
MEM[2] = input_vector[0] << 13;
MEM[3] = input_vector[0] << 12;
MEM[4] = input_vector[0] << 11;
MEM[5] = input_vector[0] << 10;
MEM[6] = input_vector[0] << 9;
MEM[7] = input_vector[0] << 8;
MEM[8] = input_vector[0] << 7;
MEM[9] = input_vector[0] << 6;
MEM[10] = input_vector[0] << 5;
MEM[11] = input_vector[0] << 4;
MEM[12] = input_vector[0] << 3;
MEM[13] = input_vector[0] << 2;
MEM[14] = input_vector[0] << 1;
MEM[15] = input_vector[0] << 0;
MEM[16] = -(input_vector[1] << 15);
MEM[17] = input_vector[1] << 14;
MEM[18] = input_vector[1] << 13;
MEM[19] = input_vector[1] << 12;
MEM[20] = input_vector[1] << 11;
MEM[21] = input_vector[1] << 10;
MEM[22] = input_vector[1] << 9;
MEM[23] = input_vector[1] << 8;
MEM[24] = input_vector[1] << 7;
MEM[25] = input_vector[1] << 6;
MEM[26] = input_vector[1] << 5;
MEM[27] = input_vec

In [23]:
# Emit the optimized SystemVerilog module from the operation list.
generate_verilog(index_array)

Successfully generate verilog file!
