In [1]:
from queue import Queue
import numpy as np
import math
from operator import itemgetter
from cacheout import Cache

# Module-wide tally of processed edges; SystolicArrayCell.compute() increments
# it once for every edge it accumulates.
edge_count = 0

class SystolicArrayCell:
    """One processing element of the ring-connected array.

    A cell pops (src, dst) edges from the edge bank of its row, samples a
    value and a process id from its ring neighbour, and adds the sampled value
    into its result bank once the source of the pending edge is present in the
    row's cache bank.
    """

    def __init__(self, row_n, col_n):
        """Create an unconnected cell belonging to a row_n x col_n array.

        connect() has to be called before read()/compute(): it wires up the
        ring neighbour and the banks this cell works on.
        """
        self.pos_x = 0
        self.pos_y = 0
        self.row_n = row_n
        self.col_n = col_n

        # Ring registers
        self.receive_cell = None  # cell this one receives data from
        self.receive_reg = 1      # value sampled from receive_cell in read()
        self.receive_out = 0      # value published by compute() for the reader of this cell

        # On-chip buffers, all assigned in connect()
        self.result_bank_input = None
        self.edge_bank = None
        self.cache_bank = None

        # Edge update state
        self.process_id = 0
        self.process_id_out = 0
        self.next_src = -1
        self.next_dst = -1
        self.src = -1
        self.dst = -1
        self.rb_depth = 0
        self.rb_value = 0
        self.edge_empty = False    # set once read() finds the edge bank empty
        self.edge_compute = True   # True when compute() should accumulate this cycle
        self.hold = False          # True while a fetched edge still waits for its source
        self.edge_number = 0

    def connect(self, pos_x, pos_y, array):
        """Place the cell at column pos_x / row pos_y and wire it into ``array``.

        The cell receives from the cell one row further down in the same
        column; the last row wraps around to row 0, which closes the ring.
        ``array`` must expose ``row_n``, ``cells``, ``result_bank``,
        ``edge_bank`` and ``cache_bank``.
        """
        self.pos_x = pos_x
        self.pos_y = pos_y
        self.edge_number = pos_y
        # Ring dataflow. The first index of array.cells is the row (Y), the
        # second one the column (X). Compare by value: an identity test on
        # ints only happens to work for small numbers.
        if self.pos_y == array.row_n - 1:
            self.receive_cell = array.cells[0][self.pos_x]
        else:
            self.receive_cell = array.cells[self.pos_y + 1][self.pos_x]
        # The edge bank and the cache bank are shared by all cells of a row;
        # the result bank is selected per row and column.
        self.result_bank_input = array.result_bank[self.pos_y][self.pos_x]
        self.edge_bank = array.edge_bank[self.pos_y]
        self.cache_bank = array.cache_bank[self.pos_y]

    def set_process_id(self, idx):
        """Assign the initial process id and record it in the row's cache bank."""
        self.process_id = idx
        self.cache_bank.set(idx, 'none')

    # The transfer of signals through registers is modelled with a read() and
    # a compute() method. read() represents the registers sampling data at the
    # positive edge of the clock.
    def read(self, edge_update):
        """Sample the inputs of this cell for one cycle.

        edge_update: when true, an edge is fetched even if the previous cycle
            did not compute, and the ring registers are left untouched.
        """
        # Fetch the next edge if the previous one was consumed (or on an
        # explicit edge update); remember when the bank has run dry.
        if self.edge_bank.empty():
            self.edge_empty = True
        elif self.edge_compute or edge_update:
            self.src, self.dst = self.edge_bank.get()
            self.hold = True

        # Advance the ring: take over value and process id of the neighbour
        # and note the new process id in the row's cache bank.
        if not edge_update:
            self.receive_reg = self.receive_cell.receive_out
            self.process_id = self.receive_cell.process_id_out
            self.cache_bank.set(self.process_id, 'none')

        # The pending edge can be computed once its source is in the cache.
        if self.cache_bank.has(self.src) and self.hold:
            self.edge_compute = True
            self.hold = False
        else:
            self.edge_compute = False

        # Slot of dst inside the result bank. int() truncates towards zero, so
        # the initial dst of -1 still maps to slot 0.
        self.rb_depth = int(self.dst / self.row_n)
        self.rb_value = self.result_bank_input[self.rb_depth]

    # compute() represents combinational logic that takes place between
    # positive edges of the clock.
    def compute(self):
        """Accumulate the pending edge (if ready) and publish the ring outputs."""
        if self.edge_compute:
            print("compute cell({:d},{:d}) src {:d}, dst {:d}". format(self.pos_x, self.pos_y, self.src, self.dst))
            # Module-level counter of processed edges.
            global edge_count
            edge_count = edge_count + 1
            self.result_bank_input[self.rb_depth] = self.rb_value + self.receive_reg
        self.receive_out = self.receive_reg
        self.process_id_out = self.process_id

    def cell_state(self):
        """Print position, received value, process id and sampled result value."""
        print("cell({:d},{:d}),rec_reg={:d}, proc_id={:d}, rb_value={:d}". format(self.pos_x, self.pos_y, self.receive_reg, self.process_id, self.rb_value))

In [11]:
# This represents our entire array: cells, inputs, and outputs
class SystolicArray:
    """The whole array: the grid of cells plus the banks they work on.

    Per row there is one edge bank (a Queue of (src, dst) tuples) and one
    cache bank; per row and column there is one result bank (a list of
    accumulators).
    """

    def __init__(self, row_n, col_n):
        """Build a row_n x col_n grid of cells and connect every cell."""
        self.row_n = row_n
        self.col_n = col_n

        # "cells" holds the grid of processing elements, indexed [row][column].
        self.cells = [
            [SystolicArrayCell(row_n, col_n) for _ in range(self.col_n)]
            for _ in range(self.row_n)
        ]

        self.cache_bank = [Cache(maxsize=self.row_n) for _ in range(self.row_n)]
        self.edge_bank = [Queue() for _ in range(self.row_n)]
        self.result_bank = [[list() for _ in range(self.col_n)] for _ in range(self.row_n)]

        # Once all cells and banks exist they can be connected (in hardware
        # this would be the wiring). Each row corresponds to a pos_y, each
        # column to a pos_x.
        for row_num, row in enumerate(self.cells):
            for col_num, cell in enumerate(row):
                cell.connect(col_num, row_num, self)

    # ring dataflow
    def edge_dual_ring_boardcase(self, row_n, src, dst):
        """Queue every edge in bank ``dst % row_n``, each bank sorted by (src, dst).

        src, dst: parallel sequences of source / destination node ids.
        """
        edge_lists = [[] for _ in range(row_n)]
        for edge in zip(src, dst):
            edge_lists[edge[1] % row_n].append(edge)
        # The per-bank sort fully determines the queue order, so the edges do
        # not need to be pre-sorted before bucketing.
        for bank_idx, edges in enumerate(edge_lists):
            for edge in sorted(edges):
                self.edge_bank[bank_idx].put(edge)

    def edge_bucket_empty(self, e_b):
        """Return True while at least one queue in ``e_b`` still holds an edge.

        NOTE: despite its name this reports "not empty yet"; the name is kept
        because edge_load_balance() relies on exactly this meaning.
        """
        return any(not bucket.empty() for row in e_b for bucket in row)

    def edge_load_balance(self, row_n, src, dst):
        """Queue the edges bank by bank in a rotating, interleaved order.

        Edges are bucketed by (src % row_n, dst % row_n). In every pass and
        for every shift 0..row_n-1, bank ``b`` takes at most one edge from
        bucket ``((b + shift) % row_n, b)``. Passes repeat until all buckets
        are drained.
        """
        edge_bucket = [[Queue() for _ in range(row_n)] for _ in range(row_n)]
        for edge_src, edge_dst in sorted(zip(src, dst)):
            edge_bucket[edge_src % row_n][edge_dst % row_n].put((edge_src, edge_dst))
        while self.edge_bucket_empty(edge_bucket):
            for shift in range(row_n):
                for bank_idx in range(row_n):
                    bucket = edge_bucket[(bank_idx + shift) % row_n][bank_idx]
                    if not bucket.empty():
                        self.edge_bank[bank_idx].put(bucket.get())

    def edge_preprocess(self, num_node, edge_src, edge_dst):
        """Group the edges by destination node.

        Returns a list with ``num_node`` entries; entry ``d`` holds the
        (src, dst) tuples whose dst is ``d``. Inside a group the edges with
        ``src >= dst % self.row_n`` come first, followed by the remaining
        ones; both parts keep their input order. An empty edge list yields
        ``num_node`` empty groups.
        """
        grouped = [[] for _ in range(num_node)]
        for edge in zip(edge_src, edge_dst):
            grouped[edge[1]].append(edge)

        result = []
        for edges in grouped:
            leading = []
            trailing = []
            for edge in edges:
                if edge[0] >= (edge[1] % self.row_n):
                    leading.append(edge)
                else:
                    trailing.append(edge)
            result.append(leading + trailing)
        return result

    def fill_edges(self, num_node, edge_src, edge_dst):
        """Queue the groups of edge_preprocess(); group ``d`` goes to bank ``d % row_n``."""
        for node, edges in enumerate(self.edge_preprocess(num_node, edge_src, edge_dst)):
            for edge in edges:
                self.edge_bank[node % self.row_n].put(edge)

    def fill_result_banks(self, num_nodes):
        """Append ceil(num_nodes / row_n) zeroed accumulators to every result bank."""
        depth = math.ceil(num_nodes / self.row_n)
        for row_num in range(self.row_n):
            for col_num in range(self.col_n):
                # Extend in place: the cells hold references to these lists.
                self.result_bank[row_num][col_num].extend([0] * depth)

    def fill_idx(self, idx):
        """Give every cell of row ``r`` the initial process id ``idx[r]``."""
        for row_num in range(self.row_n):
            for col_num in range(self.col_n):
                self.cells[row_num][col_num].set_process_id(idx[row_num])

    # For this demo, all cells read() the values of their neighbours first
    def read(self, edge_update):
        """Call read(edge_update) on every cell."""
        for row in self.cells:
            for cell in row:
                cell.read(edge_update)

    # And then after all cells have read(), they compute() the next step
    def compute(self):
        """Call compute() on every cell."""
        for row in self.cells:
            for cell in row:
                cell.compute()

    def terminal_signal(self):
        """Return True once no cell holds a pending edge and every cell has seen its bank empty."""
        for row in self.cells:
            for cell in row:
                if cell.hold or not cell.edge_empty:
                    return False
        return True

    def show_staus(self):
        """Print the state of every cell."""
        for row in self.cells:
            for cell in row:
                cell.cell_state()

    # Each cycle involves a read() and a compute()
    def cycle(self, edge_update):
        """Run one clock cycle: all cells read(), then all cells compute()."""
        # read() models register sampling on the positive edge of the clock
        self.read(edge_update)
        # compute() models the combinational logic between clock edges
        self.compute()

    def run(self, num_nodes):
        """Cycle until terminal_signal() is true and return the index of the last cycle.

        Only the first cycle is run with edge_update=True. ``num_nodes`` is
        not used at the moment. The loop only ends once no cell holds an
        edge, so an edge whose source never shows up in its row's cache bank
        keeps it running forever.
        """
        edge_update = True
        cycle = 0
        while True:
            print("-----Cycle----{:d}----------". format(cycle))
            self.cycle(edge_update)
            edge_update = False
            if self.terminal_signal():
                break
            cycle = cycle + 1
        return cycle

    def get_outputs(self):
        """Placeholder: currently always returns an empty list."""
        ret = []

        return ret

    def get_edge_output(self, num_nodes):
        """Print, for every node id, its accumulator from column 0 of its row's result bank."""
        for id_x in range(num_nodes):
            print("id={:d}-|-{:d}". format(id_x, self.result_bank[id_x % self.row_n][0][id_x // self.row_n]))

In [21]:
# Small ring-dataflow demo: a 3 x 1 array processing a graph with 6 nodes and
# 9 edges.
row_n = 3
col_n = 1
num_nodes = 6
myArray = SystolicArray(row_n, col_n)

# Edges as parallel lists of source / destination node ids.
src = [0,1,1,2,0,0,0,2,2]
dst = [0,2,4,0,1,3,5,3,5]

# Other edge lists that can be swapped in for this demo:
#src = [0,1,2,1,2,0,2,0,1]
#dst = [0,1,2,0,1,2,0,1,2]
#
#src = [0,0,0,0,1,1,2,2,2]
#dst = [0,1,3,5,2,4,0,3,5]
#
#src = [0,1,1,2,0,1,0,2,2]
#dst = [0,2,4,0,1,3,5,4,5]
#
#src = [0,0,1,2]
#dst = [0,1,2,0]

# Alternative ways of filling the edge banks:
#   myArray.fill_edges(num_nodes, src, dst)
#   myArray.edge_load_balance(row_n, src, dst)
myArray.edge_dual_ring_boardcase(row_n, src, dst)

# Row r starts with process id r.
idx = list(range(row_n))
myArray.fill_idx(idx)
myArray.fill_result_banks(num_nodes)

# run() prints the trace and returns the index of the last executed cycle.
res = myArray.run(num_nodes)

-----Cycle----0----------
compute cell(0,0) src 0, dst 0
-----Cycle----1----------
compute cell(0,0) src 0, dst 3
compute cell(0,2) src 0, dst 5
-----Cycle----2----------
compute cell(0,0) src 2, dst 0
compute cell(0,1) src 0, dst 1
compute cell(0,2) src 1, dst 2
-----Cycle----3----------
compute cell(0,0) src 2, dst 3
compute cell(0,1) src 1, dst 4
compute cell(0,2) src 2, dst 5
-----Cycle----4----------


In [3]:
import argparse, time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl import graph_index
from dgl.graph_index import disjoint_partition
from dgl.data import register_data_args, load_data
import math

In [4]:
    # Command-line style options for the GCN example. They are parsed from an
    # empty argument list below, so every option takes its default value.
    parser = argparse.ArgumentParser(description='GCN')
    parser.add_argument("--dataset", type=str, default="cora",
            help="name of the dataset to load")
    parser.add_argument("--dropout", type=float, default=0.5,
            help="dropout probability")
    parser.add_argument("--gpu", type=int, default=-1,
            help="gpu")
    parser.add_argument("--lr", type=float, default=1e-2,
            help="learning rate")
    parser.add_argument("--n-epochs", type=int, default=200,
            help="number of training epochs")
    parser.add_argument("--n-hidden", type=int, default=16,
            help="number of hidden gcn units")
    parser.add_argument("--n-layers", type=int, default=1,
            help="number of hidden gcn layers")
    parser.add_argument("--weight-decay", type=float, default=5e-4,
            help="Weight for L2 loss")
    parser.add_argument("--self-loop", action='store_true',
            help="graph self-loop (default=False)")
    parser.set_defaults(self_loop=False)
    # args=[] keeps argparse from reading the notebook kernel's own sys.argv.
    args = parser.parse_args(args=[])

In [5]:
    # Load the dataset selected by `args` and turn its arrays into torch tensors.
    data = load_data(args)
    features = torch.FloatTensor(data.features)
    labels = torch.LongTensor(data.labels)
    # Split masks, stored as byte tensors.
    train_mask = torch.ByteTensor(data.train_mask)
    val_mask = torch.ByteTensor(data.val_mask)
    test_mask = torch.ByteTensor(data.test_mask)
    # Size of the second dimension of `features`
    # (presumably the number of input features per node -- TODO confirm).
    in_feats = features.shape[1]
    n_classes = data.num_labels
    n_edges = data.graph.number_of_edges()

In [6]:
# Wrap the dataset's graph in a DGLGraph; later cells query it through
# number_of_nodes() and out_edges().
g = DGLGraph(data.graph)

In [7]:
# Split the node ids into consecutive partitions of `partition_size` nodes
# (the last one may be shorter) and collect the outgoing edges of each one.
partition_size = 32
node_total = g.number_of_nodes()
partition_number = math.ceil(node_total / partition_size)
print("the graph split to {:d} part". format(partition_number))

Node_index, Edge, Edge_number = [], [], []
for node_id in range(partition_number):
    # Only the last partition can be cut short by the end of the node range.
    first_node = partition_size * node_id
    index = list(range(first_node, min(first_node + partition_size, node_total)))
    Node_index.append(index)
    # Outgoing edges of the partition as (src, dst) pairs of plain ints.
    src, dst = g.out_edges(index)
    Edge.append(list(zip(src.tolist(), dst.tolist())))
    Edge_number.append(src.shape[0])

the graph split to 85 part


In [8]:
# Split the (src, dst) pairs of partition 1 into two parallel tuples.
src, dst = zip(*Edge[1])

In [35]:
# Number of outgoing edges collected for partition 1.
len(Edge[1])

129

In [36]:
# Node ids of partition 1. This is the same list object as Node_index[1], not a copy.
idx = Node_index[1]

In [12]:
# Simulate every partition on a fresh row_n x 1 array and record the value
# returned by run() (the index of the last executed cycle) for each of them.
edge_count = 0
row_n = 32
col_n = 1
# Size the result banks from the graph itself instead of a hard-coded count.
num_nodes = g.number_of_nodes()
cycle_list = []
for i, val in enumerate(Edge):
    # zip(*[]) cannot be unpacked into two names, so a partition without
    # edges is mapped to two empty tuples explicitly.
    src, dst = zip(*val) if val else ((), ())
    # One process id per row is required. Pad a copy with 0 so that the
    # partition lists in Node_index stay untouched.
    idx = Node_index[i] + [0] * (row_n - len(Node_index[i]))
    myArray = SystolicArray(row_n, col_n)
    myArray.edge_load_balance(row_n, src, dst)
    myArray.fill_idx(idx)
    myArray.fill_result_banks(num_nodes)
    res = myArray.run(num_nodes)
    cycle_list.append(res)

-----Cycle----0----------
compute cell(0,0) src 0, dst 544
compute cell(0,15) src 15, dst 399
compute cell(0,23) src 23, dst 759
-----Cycle----1----------
compute cell(0,29) src 30, dst 285
-----Cycle----2----------
compute cell(0,12) src 14, dst 268
compute cell(0,28) src 30, dst 1148
-----Cycle----3----------
compute cell(0,7) src 10, dst 519
compute cell(0,28) src 31, dst 1116
-----Cycle----4----------
compute cell(0,5) src 9, dst 453
compute cell(0,10) src 14, dst 746
-----Cycle----5----------
compute cell(0,2) src 7, dst 258
compute cell(0,9) src 14, dst 393
compute cell(0,10) src 15, dst 234
compute cell(0,17) src 22, dst 2257
compute cell(0,20) src 25, dst 20
compute cell(0,26) src 31, dst 250
-----Cycle----6----------
compute cell(0,2) src 8, dst 258
compute cell(0,4) src 10, dst 420
compute cell(0,8) src 14, dst 8
compute cell(0,12) src 18, dst 1932
compute cell(0,14) src 20, dst 334
compute cell(0,20) src 26, dst 2612
compute cell(0,22) src 28, dst 1718
-----Cycle----7-------

compute cell(0,30) src 227, dst 638
-----Cycle----22----------
compute cell(0,3) src 228, dst 323
compute cell(0,4) src 250, dst 708
compute cell(0,6) src 252, dst 294
compute cell(0,19) src 233, dst 595
compute cell(0,22) src 236, dst 1270
compute cell(0,27) src 241, dst 763
-----Cycle----23----------
compute cell(0,4) src 251, dst 932
compute cell(0,23) src 238, dst 1815
-----Cycle----24----------
compute cell(0,11) src 227, dst 1547
compute cell(0,14) src 230, dst 142
-----Cycle----25----------
compute cell(0,1) src 250, dst 33
compute cell(0,2) src 251, dst 482
compute cell(0,6) src 255, dst 70
compute cell(0,9) src 226, dst 617
-----Cycle----26----------
compute cell(0,0) src 250, dst 736
compute cell(0,1) src 251, dst 865
compute cell(0,5) src 255, dst 421
compute cell(0,11) src 229, dst 747
compute cell(0,14) src 232, dst 14
compute cell(0,21) src 239, dst 565
compute cell(0,29) src 247, dst 1117
-----Cycle----27----------
compute cell(0,0) src 251, dst 2560
compute cell(0,8) sr

-----Cycle----26----------
compute cell(0,4) src 510, dst 356
compute cell(0,15) src 483, dst 463
compute cell(0,30) src 504, dst 702
-----Cycle----27----------
compute cell(0,0) src 507, dst 1888
compute cell(0,1) src 508, dst 417
compute cell(0,7) src 482, dst 551
compute cell(0,8) src 483, dst 552
compute cell(0,13) src 488, dst 109
compute cell(0,14) src 489, dst 46
compute cell(0,21) src 496, dst 501
compute cell(0,28) src 503, dst 2396
-----Cycle----28----------
compute cell(0,3) src 511, dst 611
compute cell(0,13) src 489, dst 301
compute cell(0,19) src 495, dst 339
compute cell(0,21) src 503, dst 1813
compute cell(0,27) src 503, dst 1851
compute cell(0,28) src 486, dst 1148
-----Cycle----29----------
compute cell(0,6) src 483, dst 294
compute cell(0,13) src 490, dst 45
compute cell(0,14) src 491, dst 2318
compute cell(0,19) src 492, dst 2419
compute cell(0,21) src 492, dst 565
compute cell(0,23) src 500, dst 247
compute cell(0,30) src 507, dst 798
-----Cycle----30----------
com

compute cell(0,19) src 714, dst 1299
compute cell(0,20) src 715, dst 84
compute cell(0,24) src 719, dst 1112
compute cell(0,27) src 722, dst 635
-----Cycle----24----------
compute cell(0,2) src 730, dst 770
compute cell(0,21) src 717, dst 1877
compute cell(0,23) src 719, dst 87
compute cell(0,29) src 725, dst 93
-----Cycle----25----------
compute cell(0,1) src 730, dst 1889
-----Cycle----26----------
compute cell(0,1) src 719, dst 2561
compute cell(0,7) src 705, dst 519
compute cell(0,22) src 720, dst 438
-----Cycle----27----------
compute cell(0,0) src 731, dst 576
compute cell(0,4) src 735, dst 1956
compute cell(0,13) src 712, dst 1965
compute cell(0,31) src 730, dst 351
-----Cycle----28----------
compute cell(0,0) src 732, dst 640
compute cell(0,3) src 735, dst 195
compute cell(0,12) src 712, dst 1036
compute cell(0,30) src 730, dst 350
-----Cycle----29----------
compute cell(0,5) src 706, dst 453
-----Cycle----30----------
compute cell(0,26) src 728, dst 1466
-----Cycle----31------

-----Cycle----29----------
compute cell(0,2) src 1023, dst 1090
compute cell(0,19) src 1008, dst 1011
compute cell(0,26) src 1015, dst 1338
compute cell(0,30) src 1019, dst 862
-----Cycle----30----------
compute cell(0,14) src 1004, dst 2350
compute cell(0,26) src 1016, dst 1466
compute cell(0,30) src 1019, dst 926
-----Cycle----31----------
compute cell(0,8) src 999, dst 1512
compute cell(0,10) src 1001, dst 1354
compute cell(0,12) src 1003, dst 1004
compute cell(0,19) src 1010, dst 1011
compute cell(0,21) src 1012, dst 565
compute cell(0,25) src 1016, dst 1305
compute cell(0,26) src 995, dst 1690
-----Cycle----32----------
compute cell(0,10) src 1019, dst 1290
compute cell(0,12) src 1003, dst 2028
compute cell(0,19) src 1010, dst 1651
-----Cycle----33----------
-----Cycle----0----------
compute cell(0,24) src 1048, dst 984
compute cell(0,26) src 1050, dst 1306
compute cell(0,31) src 1055, dst 607
-----Cycle----1----------
compute cell(0,2) src 1027, dst 322
compute cell(0,7) src 1032

compute cell(0,15) src 1292, dst 1071
compute cell(0,19) src 1288, dst 787
compute cell(0,26) src 1303, dst 1274
compute cell(0,31) src 1308, dst 1503
-----Cycle----30----------
compute cell(0,6) src 1284, dst 230
compute cell(0,9) src 1287, dst 905
compute cell(0,11) src 1283, dst 1547
compute cell(0,12) src 1290, dst 1644
compute cell(0,13) src 1290, dst 1197
-----Cycle----31----------
compute cell(0,2) src 1281, dst 1282
compute cell(0,4) src 1283, dst 1284
compute cell(0,6) src 1285, dst 1382
compute cell(0,12) src 1291, dst 1548
compute cell(0,23) src 1302, dst 983
compute cell(0,26) src 1305, dst 1274
compute cell(0,27) src 1306, dst 1499
-----Cycle----32----------
compute cell(0,2) src 1281, dst 1346
compute cell(0,6) src 1283, dst 1318
compute cell(0,26) src 1290, dst 378
-----Cycle----33----------
compute cell(0,26) src 1303, dst 2202
-----Cycle----34----------
-----Cycle----0----------
compute cell(0,3) src 1315, dst 611
compute cell(0,8) src 1320, dst 808
compute cell(0,9) s

compute cell(0,29) src 1607, dst 1053
-----Cycle----11----------
compute cell(0,7) src 1618, dst 2343
compute cell(0,10) src 1621, dst 1194
compute cell(0,19) src 1630, dst 1203
-----Cycle----12----------
compute cell(0,10) src 1622, dst 1322
compute cell(0,21) src 1601, dst 1429
compute cell(0,24) src 1604, dst 1272
compute cell(0,26) src 1606, dst 826
-----Cycle----13----------
compute cell(0,8) src 1621, dst 1448
compute cell(0,10) src 1622, dst 1578
compute cell(0,12) src 1625, dst 1580
compute cell(0,20) src 1601, dst 1428
compute cell(0,25) src 1606, dst 409
compute cell(0,27) src 1608, dst 987
-----Cycle----14----------
compute cell(0,4) src 1618, dst 1220
compute cell(0,12) src 1626, dst 1164
compute cell(0,18) src 1600, dst 946
-----Cycle----15----------
compute cell(0,4) src 1611, dst 1380
compute cell(0,19) src 1602, dst 339
-----Cycle----16----------
compute cell(0,3) src 1619, dst 99
compute cell(0,23) src 1607, dst 1335
-----Cycle----17----------
-----Cycle----18---------

compute cell(0,25) src 1924, dst 345
-----Cycle----12----------
compute cell(0,0) src 1932, dst 576
compute cell(0,3) src 1935, dst 611
compute cell(0,5) src 1937, dst 389
compute cell(0,18) src 1950, dst 2226
compute cell(0,24) src 1924, dst 2040
-----Cycle----13----------
compute cell(0,5) src 1938, dst 1765
compute cell(0,11) src 1944, dst 1259
compute cell(0,23) src 1924, dst 2327
compute cell(0,24) src 1925, dst 2392
compute cell(0,27) src 1928, dst 667
-----Cycle----14----------
compute cell(0,4) src 1938, dst 1764
compute cell(0,18) src 1920, dst 1874
-----Cycle----15----------
compute cell(0,4) src 1933, dst 2660
compute cell(0,6) src 1941, dst 1926
compute cell(0,31) src 1934, dst 2463
-----Cycle----16----------
compute cell(0,3) src 1939, dst 2307
compute cell(0,9) src 1945, dst 1993
compute cell(0,19) src 1923, dst 2227
compute cell(0,21) src 1925, dst 2133
compute cell(0,23) src 1927, dst 1847
-----Cycle----17----------
compute cell(0,21) src 1926, dst 1941
compute cell(0,2

compute cell(0,14) src 2239, dst 2414
compute cell(0,27) src 2220, dst 2139
compute cell(0,30) src 2223, dst 190
-----Cycle----18----------
compute cell(0,13) src 2239, dst 2413
compute cell(0,16) src 2210, dst 1808
compute cell(0,21) src 2215, dst 1877
compute cell(0,25) src 2219, dst 1977
-----Cycle----19----------
compute cell(0,3) src 2230, dst 611
compute cell(0,16) src 2211, dst 1232
compute cell(0,18) src 2213, dst 2290
compute cell(0,24) src 2219, dst 2008
-----Cycle----20----------
compute cell(0,11) src 2239, dst 587
compute cell(0,30) src 2226, dst 1950
-----Cycle----21----------
compute cell(0,3) src 2232, dst 163
compute cell(0,30) src 2223, dst 510
compute cell(0,31) src 2228, dst 127
-----Cycle----22----------
compute cell(0,0) src 2230, dst 608
compute cell(0,23) src 2221, dst 1527
-----Cycle----23----------
compute cell(0,9) src 2208, dst 2313
compute cell(0,14) src 2213, dst 2126
compute cell(0,15) src 2214, dst 2223
compute cell(0,17) src 2216, dst 2705
compute cell(

compute cell(0,20) src 2614, dst 2484
compute cell(0,22) src 2616, dst 2614
-----Cycle----3----------
compute cell(0,2) src 2597, dst 1698
-----Cycle----4----------
compute cell(0,8) src 2604, dst 168
compute cell(0,20) src 2616, dst 2484
compute cell(0,25) src 2621, dst 2105
compute cell(0,28) src 2592, dst 1852
-----Cycle----5----------
compute cell(0,3) src 2600, dst 163
compute cell(0,14) src 2611, dst 2606
compute cell(0,15) src 2612, dst 1903
compute cell(0,20) src 2617, dst 1428
compute cell(0,21) src 2618, dst 1269
-----Cycle----6----------
compute cell(0,30) src 2596, dst 2686
-----Cycle----7----------
compute cell(0,1) src 2600, dst 1409
compute cell(0,6) src 2605, dst 2502
compute cell(0,26) src 2593, dst 634
compute cell(0,27) src 2594, dst 411
compute cell(0,31) src 2598, dst 1631
-----Cycle----8----------
compute cell(0,6) src 2606, dst 2662
compute cell(0,15) src 2615, dst 1647
compute cell(0,20) src 2620, dst 1972
-----Cycle----9----------
compute cell(0,3) src 2604, ds

In [13]:
cycle_list

[32,
 33,
 35,
 34,
 33,
 39,
 34,
 34,
 33,
 34,
 34,
 34,
 33,
 36,
 35,
 33,
 34,
 39,
 35,
 35,
 34,
 33,
 31,
 36,
 34,
 32,
 33,
 32,
 31,
 32,
 34,
 33,
 31,
 34,
 31,
 34,
 33,
 34,
 34,
 33,
 34,
 34,
 33,
 31,
 33,
 34,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 34,
 33,
 31,
 35,
 32,
 32,
 31,
 32,
 30,
 33,
 31,
 32,
 31,
 31,
 33,
 33,
 34,
 31,
 31,
 33,
 31,
 33,
 31,
 31,
 32,
 34,
 33,
 34,
 31,
 31,
 34,
 31]

In [37]:
# Simulate one partition on its own to inspect the trace and count the edges
# that were processed (see edge_count afterwards).
row_n = 32
col_n = 1
partition_id = 1
num_nodes = g.number_of_nodes()

# Take the inputs from the partition tables explicitly, so the cell does not
# depend on whatever src / dst / idx another cell happened to leave behind.
src, dst = zip(*Edge[partition_id])
# One process id per row; pad a copy with 0 if the partition is short.
idx = Node_index[partition_id] + [0] * (row_n - len(Node_index[partition_id]))

myArray = SystolicArray(row_n, col_n)
# Alternative ways of filling the edge banks:
#   myArray.fill_edges(num_nodes, src, dst)
#   myArray.edge_dual_ring_boardcase(row_n, src, dst)
myArray.edge_load_balance(row_n, src, dst)
myArray.fill_idx(idx)
myArray.fill_result_banks(num_nodes)
edge_count = 0
res = myArray.run(num_nodes)

-----Cycle----0----------
compute cell(0,2) src 34, dst 66
compute cell(0,11) src 43, dst 747
compute cell(0,14) src 46, dst 238
compute cell(0,20) src 52, dst 2580
compute cell(0,28) src 60, dst 252
-----Cycle----1----------
compute cell(0,2) src 35, dst 34
compute cell(0,12) src 45, dst 492
compute cell(0,13) src 46, dst 301
compute cell(0,26) src 59, dst 570
-----Cycle----2----------
compute cell(0,19) src 53, dst 243
compute cell(0,24) src 58, dst 536
-----Cycle----3----------
compute cell(0,1) src 36, dst 257
compute cell(0,10) src 45, dst 490
-----Cycle----4----------
compute cell(0,5) src 41, dst 1733
compute cell(0,21) src 57, dst 565
-----Cycle----5----------
compute cell(0,9) src 46, dst 489
compute cell(0,25) src 62, dst 2425
-----Cycle----6----------
compute cell(0,14) src 52, dst 430
compute cell(0,17) src 55, dst 1905
compute cell(0,19) src 57, dst 1459
compute cell(0,25) src 63, dst 153
-----Cycle----7----------
compute cell(0,3) src 42, dst 163
compute cell(0,18) src 57

In [18]:
edge_count

127