In [399]:
from queue import Queue
import numpy as np
import math

class SystolicArrayCell:
    def __init__(self, row_n, col_n):
        self.pos_x = 0
        self.pos_y = 0
        self.row_n = row_n
        self.col_n = col_n
        
        #ring register
        self.receive_cell = None  #接收数据寄存器
        self.receive_reg = 1
        self.receive_out = 0
        #self.send    =   #发送数据寄存器
        
        #On chip buffer
        self.result_bank_input = None
        
        #edge update
        self.process_id = 0
        self.process_id_out = 0
        self.next_src = -1
        self.next_dst = -1
        self.src = -1
        self.dst = -1
        self.rb_depth = 0
        self.rb_value = 0
        self.edge_empty   = False
        self.edge_compute = True
        self.hold         = False
        self.edge_number = 0

    # Connects this cell to its neighbors above and to the left
    def connect(self, pos_x, pos_y, array):
        self.pos_x = pos_x
        self.pos_y = pos_y
        self.edge_number = pos_y
        #ring dataflow
        if self.pos_y is array.row_n-1:
            self.receive_cell = array.cells[0][self.pos_x] #cell 第一个代表行数，也就是Y， 第二个代表列数，也就是X
        # Otherwise, it's another cell
        else:
            self.receive_cell = array.cells[self.pos_y+1][self.pos_x]
        #each PE on the same row connect to the same result bank
        self.result_bank_input = array.result_bank[self.pos_y][self.pos_x]
        self.edge_bank         = array.edge_bank[self.pos_y]

    def set_process_id(self, idx):
        self.process_id = idx
        
    # We'll model the transfer of signals through registers with a read() and a
    # compute() method. 
    # read() represents the registers sampling data at the positive edge of the
    # clock
    def read(self, edge_update):
        #ring dataflow 
        #print("Enter | cell({:d},{:d}) next_src {:d}, next_dst {:d}, src {:d}, dst {:d}, process_id {:d}". format(self.pos_x, self.pos_y, self.next_src, self.next_dst, self.src, self.dst, self.process_id))
        if self.edge_bank.empty():
            self.edge_empty = True
        elif self.edge_compute or edge_update:
            self.src, self.dst = self.edge_bank.get()
            self.hold         = True
            self.edge_compute = False
        else:
            self.edge_compute = False
        
        if edge_update:
            self.process_id = self.process_id
        else:
            self.receive_reg = self.receive_cell.receive_out
            self.process_id  = self.receive_cell.process_id_out
        
        #print("Medium | cell({:d},{:d}) next_src {:d}, next_dst {:d}, src {:d}, dst {:d}, process_id {:d}". format(self.pos_x, self.pos_y, self.next_src, self.next_dst, self.src, self.dst, self.process_id))
        if self.src == self.process_id and self.hold:
            self.edge_compute = True
            self.hold         = False
        else:
            self.edge_compute = False
        '''
        if edge_update:
            if self.edge_bank.empty():
                self.edge_empty = True
                self.hold       = False
            else:
                self.next_src, self.next_dst = self.edge_bank.get()
                self.hold                    = True
        
        if self.next_src == self.process_id and self.hold:
            self.edge_compute = True
            self.src  = self.next_src
            self.dst  = self.next_dst
            self.hold = False
            if self.edge_bank.empty():
                self.edge_empty = True
            else:
                self.next_src, self.next_dst = self.edge_bank.get()
                self.hold = True
        else:
            self.edge_compute = False
        '''
        
        self.rb_depth = int(self.dst/self.row_n)
        print("Out | cell({:d},{:d}) next_src {:d}, next_dst {:d}, src {:d}, dst {:d}, process_id {:d}". format(self.pos_x, self.pos_y, self.next_src, self.next_dst, self.src, self.dst, self.process_id))
        self.rb_value = self.result_bank_input[self.rb_depth]
        

    # compute() represents combinational logic that takes place between 
    # positive edges of the clock (multiplication and addition)
    def compute(self):
        #ring dataflow
        if self.edge_compute:
            print("compute cell({:d},{:d}) src {:d}, dst {:d}". format(self.pos_x, self.pos_y, self.src, self.dst))
            self.result_bank_input[self.rb_depth] = self.rb_value + self.receive_reg
        self.receive_out = self.receive_reg
        self.process_id_out = self.process_id
        #print("cell({:d},{:d}), hold {:d}, edge_empty {:d}". format(self.pos_x, self.pos_y, self.hold, self.edge_empty))
        #print(self.edge_number)
        
        
    def cell_state(self):
        #print("cell({:d},{:d}),rec_reg={:d}, rec_out={:d}, proc_id={:d}, proc_out={:d}". format(self.pos_x, self.pos_y, self.receive_reg, self.receive_out, self.process_id, self.process_id_out))
        print("cell({:d},{:d}),rec_reg={:d}, proc_id={:d}, rb_value={:d}". format(self.pos_x, self.pos_y, self.receive_reg, self.process_id, self.rb_value))

In [403]:
# This represents our entire array: cells, inputs, and outputs
class SystolicArray:
    # We'll take a parameter for the size of the square arrays to be multiplied
    def __init__(self, row_n, col_n):
        self.row_n = row_n
        self.col_n = col_n

        # "cells" will hold the array of processing elements
        self.cells = []
        # This array is a square with dimensions "array_size"
        for _ in range(self.row_n):
            row = []
            for _ in range(self.col_n):
                cell = SystolicArrayCell(row_n, col_n)
                row.append(cell)
            self.cells.append(row)
        
        
        self.edge_bank   = [Queue() for _ in range(self.row_n)]
        self.result_bank = [[list() for _ in range(self.col_n)] for _ in range(self.row_n)]

        # When all cells and inputs are created, then they can be connected 
        # (again, this would be accomplished with wiring)
        for row_num, row in enumerate(self.cells):
            for col_num, cell in enumerate(row):
                cell.connect(col_num, row_num, self) #每一行对应一个pos_y, 每一列对应一个pos_x
    
    #ring dataflow
    def edge_preprocess(num_node, edge_src, edge_dst):
        src, dst = zip(*(sorted(zip(edge_src, edge_dst), key=itemgetter(1))))
        result = [list() for _ in range(num_node)]
        for idx in range(len(dst)):
            result[dst[idx]].append((src[idx],dst[idx]))
        for idx in range(len(result)):
            #print(idx)
            #print(len(result[idx]))
            if len(result[idx]) is 0:
                result[idx] = []
            else:
                src, dst = zip(*result[idx])
                result_A = []
                result_B = []
                for idx_ in range(len(src)):
                    if(src[idx_] >= (dst[idx_]%3)):
                        result_A.append((src[idx_], dst[idx_]))
                    else:
                        result_B.append((src[idx_], dst[idx_]))
                result_A.extend(result_B)
                result[idx] = result_A
        return result

    def fill_edges(self, num_node, edge_src, edge_dst):
        edge_ = edge_preprocess(num_node, edge_src, edge_dst)
        for i, val in enumerate(edge_):
            for e in val:
                self.edge_bank[i%3].put(e)
                
    def fill_result_banks(self, num_nodes):
        for row_num in range(self.row_n):
            for idx_ in range(self.col_n):  
                for _ in range(math.ceil(num_nodes/self.row_n)):
                    self.result_bank[row_num][idx_].append(0)
                    
    def fill_idx(self,idx):
        for row_num in range(self.row_n):
            for col_num in range(self.col_n):
                self.cells[row_num][col_num].set_process_id(idx[row_num])
    
    # For this demo, all cells will read() the values of their neighbors first
    def read(self,edge_update):
        for row in self.cells:
            for cell in row:
                cell.read(edge_update)

    # And then after all cells have read(), they will compute() the next step
    def compute(self):
        for row in self.cells:
            for cell in row:
                cell.compute()
                
    def terminal_signal(self):
        for row in self.cells:
            for cell in row:
                #print(cell.hold)
                #print(cell.edge_bank.empty())
                if cell.hold or not cell.edge_empty:
                    return False
        return True
        #for id_x in self.edge_bank:
        #    if id_x.empty() is False:
        #        return False
        #return True
            
            
    def show_staus(self):
        for row in self.cells:
            for cell in row:
                cell.cell_state()
                
    # Each cycle involves a read() and a compute()
    def cycle(self, edge_update):
        # read() models register sampling on the positive edge of the clock
        self.read(edge_update)
        # compute() models the combinational logic between clock edges
        self.compute()
        self.show_staus()

    # run() will execute the array's computation, assuming it's been filled
    def run(self, num_nodes):
        # It takes 3n-2 cycles to compute the full matrix of results
        edge_update = True
        for cycle in range(3*self.row_n):
            print("-----Cycle----{:d}----------". format(cycle))
            self.cycle(edge_update)
            edge_update = False
            self.get_edge_output(num_nodes)
            if(self.terminal_signal()):
                break
        return self.get_outputs()

    # The outputs are also staggered and transposed, so we'll format them 
    # before returning the results
    def get_outputs(self):
        ret = []

        return ret

    def get_edge_output(self, num_nodes):
        for id_x in range(num_nodes):
            print("id={:d}-|-{:d}". format(id_x, self.result_bank[int(id_x%self.row_n)][0][int(id_x/self.row_n)]))

In [404]:
# Here we'll use a small 3x3 test multiplication to see the systolic array
# in action
row_n = 3
col_n = 1
myArray = SystolicArray(row_n, col_n)

#src = [0,1,2,1,2,0,2,0,1]
#dst = [0,1,2,0,1,2,0,1,2]

#src = [0,0,0,0,1,1,2,2,2]
#dst = [0,1,3,5,2,4,0,3,5]

src = [0,1,1,2,0,0,0,2,2]
dst = [0,2,4,0,1,3,5,3,5]

#src = [0,0,1,2]
#dst = [0,1,2,0]

myArray.fill_edges(6, src, dst)

idx = [0,1,2]
myArray.fill_idx(idx)
myArray.fill_result_banks(6)       

res = myArray.run(6)
#assert (res == np.matmul(activations, weights)).all()
#print('Systolic array matches numpy matmul')

-----Cycle----0----------
Out | cell(0,0) next_src -1, next_dst -1, src 0, dst 0, process_id 0
Out | cell(0,1) next_src -1, next_dst -1, src 0, dst 1, process_id 1
Out | cell(0,2) next_src -1, next_dst -1, src 1, dst 2, process_id 2
compute cell(0,0) src 0, dst 0
cell(0,0),rec_reg=1, proc_id=0, rb_value=0
cell(0,1),rec_reg=1, proc_id=1, rb_value=0
cell(0,2),rec_reg=1, proc_id=2, rb_value=0
id=0-|-1
id=1-|-0
id=2-|-0
id=3-|-0
id=4-|-0
id=5-|-0
-----Cycle----1----------
Out | cell(0,0) next_src -1, next_dst -1, src 2, dst 0, process_id 1
Out | cell(0,1) next_src -1, next_dst -1, src 0, dst 1, process_id 2
Out | cell(0,2) next_src -1, next_dst -1, src 1, dst 2, process_id 0
cell(0,0),rec_reg=1, proc_id=1, rb_value=1
cell(0,1),rec_reg=1, proc_id=2, rb_value=0
cell(0,2),rec_reg=1, proc_id=0, rb_value=0
id=0-|-1
id=1-|-0
id=2-|-0
id=3-|-0
id=4-|-0
id=5-|-0
-----Cycle----2----------
Out | cell(0,0) next_src -1, next_dst -1, src 2, dst 0, process_id 2
Out | cell(0,1) next_src -1, next_dst -1, 

In [390]:
#src = [0,0,0,0,1,1,2,2,2]
#dst = [0,1,3,5,2,4,0,3,5]
src = [0,1,2,1,2,0,2,0,1]
dst = [0,1,2,0,1,2,0,1,2]

In [270]:
    edge_bank   = [Queue() for _ in range(3)]
    def edge_preprocess(num_node, edge_src, edge_dst):
        src, dst = zip(*(sorted(zip(edge_src, edge_dst), key=itemgetter(1))))
        result = [list() for _ in range(num_node)]
        for idx in range(len(dst)):
            result[dst[idx]].append((src[idx],dst[idx]))
        for idx in range(len(result)):
            #print(idx)
            #print(len(result[idx]))
            if len(result[idx]) is 0:
                result[idx] = []
            else:
                src, dst = zip(*result[idx])
                result_A = []
                result_B = []
                for idx_ in range(len(src)):
                    if(src[idx_] >= (dst[idx_]%3)):
                        result_A.append((src[idx_], dst[idx_]))
                    else:
                        result_B.append((src[idx_], dst[idx_]))
                result_A.extend(result_B)
                result[idx] = result_A
        return result
    def fill_edges(num_node, edge_src, edge_dst):
        edge_ = edge_preprocess(num_node, edge_src, edge_dst)
        for i, val in enumerate(A):
            for edge in val:
                edge_bank[i%3].put(edge)

In [271]:
A = edge_preprocess(3, src, dst)

In [272]:
for i, val in enumerate(A):
    for edge in val:
        edge_bank[i%3].put(edge)

In [273]:
for val in edge_bank:
    print(val.qsize())

3
3
3


In [102]:
src

(0, 1, 2)

In [103]:
dst

(1, 1, 1)

In [104]:
result_A = []
result_B = []
result_  = []
for idx in range(len(src)):
    if(src[idx] >= dst[idx]):
        result_A.append((src[idx], dst[idx]))
    else:
        result_B.append((src[idx], dst[idx]))
result_.append(result_A.extend(result_B))

In [105]:
result_A

[(1, 1), (2, 1), (0, 1)]

In [106]:
result_B

[(0, 1)]

In [107]:
result_

[None]

In [243]:
edge_bank   = [Queue() for _ in range(3)]

In [245]:
for id_x in edge_bank:
    print(1)

1
1
1
