In [256]:
from queue import Queue
import numpy as np
import math

class SystolicArrayCell:
    """One processing element (PE) of the systolic array.

    A cell models two independent data paths:

    * the matrix-multiply path: an activation flows in from the left and a
      partial sum flows in from above; the cell multiplies the activation by
      its stored weight, adds the partial sum and forwards both values;
    * the "ring" path: a value (``receive_reg``) and a ``process_id`` are
      read from the cell one row below (the last row reads from row 0), and
      edges taken from a per-column edge bank decide whether the received
      value is accumulated into this cell's result bank.

    Registers are modelled with a ``<field>`` / ``<field>_out`` pair:
    ``read()`` samples the neighbours' ``*_out`` values into ``<field>`` and
    ``compute()`` drives this cell's own ``*_out`` values.
    """

    def __init__(self):
        # Position inside the array; filled in by connect().
        self.pos_x = 0
        self.pos_y = 0
        self.array_size = 0

        # partial sum: the running sum of the products, transmitted vertically
        self.partial_sum = 0
        self.partial_sum_out = 0
        # activation: the input activation value, transmitted horizontally
        self.activation = 0
        self.activation_out = 0

        # weight: the second operand of the multiplication
        self.weight = 0

        # Connections to the cell or FIFO on the left and the cell above.
        self.input_activation = None
        self.input_partial_sum = None

        # Ring registers: the neighbouring cell we receive from, the value
        # sampled from it, and the value this cell drives onwards.
        # (A separate "send" register is not modelled.)
        self.receive_cell = None
        self.receive_reg = 0
        self.receive_out = 0

        # On-chip buffers: this cell's result bank and its column's edge
        # bank. Both are attached by connect().
        self.result_bank_input = None
        self.edge_bank = None

        # Edge-update state.
        self.process_id = 0
        self.process_id_out = 0
        self.src = 0
        self.dst = 0
        self.rb_depth = 0
        self.rb_value = 0
        self.edge_compute = True
        self.edge_number = 0

    # In the hardware implementation, we would use a control flow signal and
    # weight inputs via the partial sum lines (note that a weight is only half
    # the bits of that field, allowing control flow to be transmitted
    # alongside). For simplification here, we'll just say it's hacked in by
    # magic.
    def set_weight(self, weight):
        """Store the weight this cell multiplies activations by."""
        self.weight = weight

    def connect(self, pos_x, pos_y, array):
        """Wire this cell to its neighbours and buffers inside ``array``.

        ``array`` must expose ``array_size``, ``cells`` (indexed
        ``[row][column]``, i.e. ``[y][x]``), ``input``, ``result_bank`` and
        ``edge_bank``.
        """
        self.pos_x = pos_x
        self.pos_y = pos_y
        self.array_size = array.array_size
        self.edge_number = pos_y

        # Integer positions are compared with ``==``. Identity (``is``) only
        # happens to work for small ints in CPython and breaks the ring
        # wrap-around below for large arrays.

        # Column 0 is fed by a FIFO queue; other columns by the cell on the
        # left.
        if self.pos_x == 0:
            self.input_activation = array.input[self.pos_y]
        else:
            self.input_activation = array.cells[self.pos_y][self.pos_x - 1]

        # Row 0 has nothing above it, so its incoming partial sum is always 0.
        if self.pos_y == 0:
            self.input_partial_sum = None
        else:
            self.input_partial_sum = array.cells[self.pos_y - 1][self.pos_x]

        # Ring dataflow: receive from the cell one row below in the same
        # column; the last row wraps around to row 0.
        if self.pos_y == array.array_size - 1:
            self.receive_cell = array.cells[0][self.pos_x]
        else:
            self.receive_cell = array.cells[self.pos_y + 1][self.pos_x]

        # Every cell has its own result bank; the edge bank is shared by all
        # cells in the same column.
        self.result_bank_input = array.result_bank[self.pos_y][self.pos_x]
        self.edge_bank = array.edge_bank[self.pos_x]

    def set_process_id(self, idx):
        """Set the process id currently held by this cell."""
        self.process_id = idx

    def read(self, edge_update):
        """Sample all inputs, as registers would on the rising clock edge.

        When ``edge_update`` is true the cell pops a new edge whenever one
        is available and loads ``receive_reg`` with its own ``edge_number``.
        Otherwise it only pops a new edge if the current ``src`` equals its
        ``process_id``, and it takes ``receive_reg`` / ``process_id`` from
        the ring neighbour.
        """
        # Left neighbour: a FIFO yields its next value (0 once drained), a
        # cell yields the activation it drove during the previous compute().
        if isinstance(self.input_activation, Queue):
            if self.input_activation.empty():
                self.activation = 0
            else:
                self.activation = self.input_activation.get()
        else:
            self.activation = self.input_activation.activation_out

        # Upper neighbour: unconnected means a constant 0 partial sum.
        if self.input_partial_sum is None:
            self.partial_sum = 0
        else:
            self.partial_sum = self.input_partial_sum.partial_sum_out

        # Ring dataflow: decide whether a new edge is consumed this cycle.
        # The src/process_id comparison deliberately uses the process_id
        # from before it is refreshed below.
        if self.edge_bank.empty():
            self.edge_compute = False
        elif edge_update or self.src == self.process_id:
            self.edge_compute = True
            self.src, self.dst = self.edge_bank.get()
        else:
            self.edge_compute = False

        if edge_update:
            self.receive_reg = self.edge_number
        else:
            self.receive_reg = self.receive_cell.receive_out
            self.process_id = self.receive_cell.process_id_out

        # Slot inside the result bank addressed by the destination id.
        self.rb_depth = int(self.dst / self.array_size)
        print("cell({:d},{:d}) src {:d}, dst {:d}, process_id {:d}".format(
            self.pos_x, self.pos_y, self.src, self.dst, self.process_id))
        self.rb_value = self.result_bank_input[self.rb_depth]

    def compute(self):
        """Run the combinational logic between two clock edges."""
        # Multiply-accumulate: the new partial sum goes downwards and the
        # activation is passed on to the right unchanged.
        product = self.weight * self.activation
        self.partial_sum_out = self.partial_sum + product
        self.activation_out = self.activation

        # Ring dataflow: accumulate the received value into the result bank
        # only when an edge was consumed in read(), then forward the ring
        # registers.
        if self.edge_compute:
            self.result_bank_input[self.rb_depth] = (
                self.rb_value + self.receive_reg
            )
        self.receive_out = self.receive_reg
        self.process_id_out = self.process_id

    def cell_state(self):
        """Print the ring state of this cell (debug aid)."""
        print("cell({:d},{:d}),rec_reg={:d}, proc_id={:d}, rb_value={:d}".format(
            self.pos_x, self.pos_y, self.receive_reg, self.process_id,
            self.rb_value))

In [257]:
# This represents our entire array: cells, inputs, and outputs
class SystolicArray:
    """A square systolic array: its cells, input/output FIFOs and buffers.

    ``cells`` is indexed ``[row][column]`` (``[y][x]``). ``input`` and
    ``output`` hold one FIFO per row / per column respectively,
    ``edge_bank`` holds one FIFO of ``(src, dst)`` edges per column, and
    ``result_bank[row][column]`` is the list used as that cell's result
    bank.
    """

    def __init__(self, array_size):
        """Build an ``array_size`` x ``array_size`` array and wire it up."""
        self.array_size = array_size

        # "cells" holds the square grid of processing elements.
        self.cells = [
            [SystolicArrayCell() for _ in range(self.array_size)]
            for _ in range(self.array_size)
        ]

        # The inputs and outputs are both FIFO queues.
        self.input = [Queue() for _ in range(self.array_size)]
        self.output = [Queue() for _ in range(self.array_size)]

        self.edge_bank = [Queue() for _ in range(self.array_size)]
        self.result_bank = [
            [[] for _ in range(self.array_size)]
            for _ in range(self.array_size)
        ]

        # Once all cells and buffers exist they can be connected (in
        # hardware this would be wiring). Each row is one pos_y and each
        # column one pos_x.
        for row_num, row in enumerate(self.cells):
            for col_num, cell in enumerate(row):
                cell.connect(col_num, row_num, self)

    # The hardware way to fill weights is interesting but outside the scope
    # of this demo, so they are simply "hacked" in.
    def fill_weights(self, weights):
        """Load a 2-D list of weights, ``weights[row][column]``."""
        for row_num, row in enumerate(weights):
            for col_num, weight in enumerate(row):
                self.cells[row_num][col_num].set_weight(weight)

    def fill_activations(self, activations):
        """Queue a 2-D list of activations on the input FIFOs.

        Input row ``r`` first receives ``r`` zeroes (the staggering triangle
        the systolic schedule needs) and then column ``r`` of
        ``activations``, i.e. the matrix is fed in transposed.
        """
        for row_num in range(self.array_size):
            fifo = self.input[row_num]
            for _ in range(row_num):
                fifo.put(0)
            for x in range(self.array_size):
                fifo.put(activations[x][row_num])

    def fill_result(self, result):
        """Append ``0 .. array_size-1`` to every row of ``result_bank``.

        NOTE(review): ``result`` is ignored, and the integers are appended
        to the row list itself (after the per-column banks), not to the
        banks the cells are connected to. This looks like a leftover that
        ``fill_result_banks`` superseded - confirm the intent before
        relying on it. Behaviour is kept unchanged.
        """
        for row_num in range(self.array_size):
            self.result_bank[row_num].extend(range(self.array_size))

    # ring dataflow
    def fill_edges(self, edges):
        """Queue ``edges[i]`` (a sequence of edges) on edge bank ``i``.

        Every edge is printed as it is queued.
        """
        for bank_num in range(self.array_size):
            for edge in edges[bank_num]:
                print(edge)
                self.edge_bank[bank_num].put(edge)

    def fill_result_banks(self, num_nodes):
        """Append ``ceil(num_nodes / array_size)`` zeroes to every bank."""
        depth = math.ceil(num_nodes / self.array_size)
        for bank_row in self.result_bank[:self.array_size]:
            for bank in bank_row[:self.array_size]:
                bank.extend([0] * depth)

    def fill_idx(self, idx):
        """Give every cell in row ``r`` the process id ``idx[r]``."""
        for row_num in range(self.array_size):
            for col_num in range(self.array_size):
                self.cells[row_num][col_num].set_process_id(idx[row_num])

    # For this demo, all cells read() the values of their neighbours first...
    def read(self, edge_update):
        """Call ``read(edge_update)`` on every cell, row by row."""
        for row in self.cells:
            for cell in row:
                cell.read(edge_update)

    # ...and only after all cells have read() do they compute() the next step.
    def compute(self):
        """Call ``compute()`` on every cell, then record the outputs."""
        for row in self.cells:
            for cell in row:
                cell.compute()

        # After each compute step the bottom row's partial sums are put on
        # the output queues.
        for col_num in range(self.array_size):
            self.output[col_num].put(self.cells[-1][col_num].partial_sum_out)

    def show_status(self):
        """Print the ring state of every cell."""
        for row in self.cells:
            for cell in row:
                cell.cell_state()

    def show_staus(self):
        """Misspelled alias of ``show_status``, kept for existing callers."""
        self.show_status()

    def cycle(self, edge_update):
        """Run one clock cycle: read(), compute(), then print the state."""
        # read() models register sampling on the positive edge of the clock
        self.read(edge_update)
        # compute() models the combinational logic between clock edges
        self.compute()
        self.show_staus()

    def run(self, num_nodes):
        """Run ``3 * array_size - 2`` cycles and return the matrix product.

        ``edge_update`` is asserted on the first cycle only. After every
        cycle the edge results for ``num_nodes`` ids are printed.
        """
        # It takes 3n-2 cycles to compute the full matrix of results.
        for cycle_num in range(3 * self.array_size - 2):
            print("-----Cycle----{:d}----------".format(cycle_num))
            self.cycle(cycle_num == 0)
            self.get_edge_output(num_nodes)
        return self.get_outputs()

    def get_outputs(self):
        """Drain the output FIFOs and return the de-staggered result rows.

        Raises:
            RuntimeError: if an output FIFO holds fewer values than needed.
                The check happens before anything is consumed, so a failed
                call leaves the queues untouched instead of blocking
                forever on an empty queue.
        """
        size = self.array_size

        # Column c needs (c + size - 1) leading values to discard plus
        # ``size`` real results.
        for col_num in range(size):
            needed = col_num + 2 * size - 1
            available = self.output[col_num].qsize()
            if available < needed:
                raise RuntimeError(
                    "output column {:d} holds {:d} values but {:d} are "
                    "required; run the array before collecting "
                    "outputs".format(col_num, available, needed)
                )

        # Remove the staggering by throwing away the leading values.
        for col_num in range(size):
            for _ in range(col_num + size - 1):
                self.output[col_num].get_nowait()

        # The outputs come out transposed: take one value from every column
        # to form each result row.
        return [
            [output_col.get_nowait() for output_col in self.output]
            for _ in range(size)
        ]

    def get_edge_output(self, num_nodes):
        """Print one result-bank value for each id in ``range(num_nodes)``.

        NOTE(review): only column 0, slot 0 of row ``id % array_size`` is
        read, so ids that differ by a multiple of ``array_size`` print the
        same value. The cells address their bank with
        ``dst / array_size``, which suggests the slot was meant to depend
        on the id - confirm before changing.
        """
        for id_x in range(num_nodes):
            print("id={:d}-|-{:d}".format(
                id_x, self.result_bank[id_x % self.array_size][0][0]))

In [258]:
# Demo: multiply two small 3x3 matrices on the systolic array and, alongside
# that, push a small edge list through the ring dataflow.
array_size = 3
# Build the array from array_size so the banks created below always match it.
myArray = SystolicArray(array_size)

activations = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
myArray.fill_activations(activations)

weights = [
    [10, 20, 30],
    [40, 50, 60],
    [70, 80, 90],
]
myArray.fill_weights(weights)
myArray.fill_result(weights)

# Edge list as parallel source / destination node ids; every edge bank gets
# its own copy of the same (src, dst) pairs.
src = [0, 1, 2, 1, 2, 0, 2, 0, 1]
dst = [0, 1, 2, 0, 1, 2, 0, 1, 2]
edge_banks = [list(zip(src, dst)) for _ in range(array_size)]
myArray.fill_edges(edge_banks)

# Initial process id for each row of cells.
idx = [0, 1, 2]
myArray.fill_idx(idx)
myArray.fill_result_banks(9)

# run() returns the matrix product, which must agree with numpy.
res = myArray.run(3)
assert (res == np.matmul(activations, weights)).all()
print('Systolic array matches numpy matmul')

(0, 0)
(1, 1)
(2, 2)
(1, 0)
(2, 1)
(0, 2)
(2, 0)
(0, 1)
(1, 2)
(0, 0)
(1, 1)
(2, 2)
(1, 0)
(2, 1)
(0, 2)
(2, 0)
(0, 1)
(1, 2)
(0, 0)
(1, 1)
(2, 2)
(1, 0)
(2, 1)
(0, 2)
(2, 0)
(0, 1)
(1, 2)
-----Cycle----0----------
cell(0,0) src 0, dst 0, process_id 0
cell(1,0) src 0, dst 0, process_id 0
cell(2,0) src 0, dst 0, process_id 0
cell(0,1) src 1, dst 1, process_id 1
cell(1,1) src 1, dst 1, process_id 1
cell(2,1) src 1, dst 1, process_id 1
cell(0,2) src 2, dst 2, process_id 2
cell(1,2) src 2, dst 2, process_id 2
cell(2,2) src 2, dst 2, process_id 2
cell(0,0),rec_reg=0, proc_id=0, rb_value=0
cell(1,0),rec_reg=0, proc_id=0, rb_value=0
cell(2,0),rec_reg=0, proc_id=0, rb_value=0
cell(0,1),rec_reg=1, proc_id=1, rb_value=0
cell(1,1),rec_reg=1, proc_id=1, rb_value=0
cell(2,1),rec_reg=1, proc_id=1, rb_value=0
cell(0,2),rec_reg=2, proc_id=2, rb_value=0
cell(1,2),rec_reg=2, proc_id=2, rb_value=0
cell(2,2),rec_reg=2, proc_id=2, rb_value=0
id=0-|-0
id=1-|-1
id=2-|-2
-----Cycle----1----------
cell(0,0) sr

In [173]:
# Scratch data for trying out zip(): two parallel lists of ids, plus an empty
# list B that a later scratch cell appends to.
src = [1,2,3]
dst = [1,2,3]
B = []

In [151]:
# Pair each src entry with the dst entry at the same position, giving a list
# of (src, dst) tuples.
A = list(zip(src,dst))

In [154]:
# Add the list A to B as a single element. Every run of this cell appends
# again; the B displayed below holds two copies, presumably from two runs.
B.append(A)

In [155]:
B

[[(1, 1), (2, 2), (3, 3)], [(1, 1), (2, 2), (3, 3)]]

In [144]:
src

[1, 2, 3]

In [96]:
# Rebind dst to a single id for the modulo experiment in the next cells.
dst = 2

In [102]:
# Row for a destination id: dst modulo 3 (presumably the demo's array size).
# int() changes nothing when dst is already an int.
row = int(dst%3)

In [103]:
row

2

In [101]:
int(dst%3)

2