In [31]:
import numpy as np
import pandas as pd


In [32]:
print(np.__version__)
print(pd.__version__)


1.24.3
2.0.3


In [33]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.width', 10000)


In [34]:
# 以下参数目前需要手动改写，暂时无法从profile后的文件中获取
# 指定通过tpu_profile.py --format layer生成的profile文件路径
profile_dir = '/workspace/jira/VIT/int8_b16/test/'
# TPU的频率
tpu_freq = 1e9 # 1000MHz
# ModelAlgOps来自final.mlir中，这是模型原始的计算量，profile中的AlgOps是原本算子拆成多个指令后的，指令的有效计算量，与实现相关
ModelAlgOps = 564150211456
# 端到端runtime耗时，e2eTime来自bmrt_test的calculate time
e2eTime = 38300. # us
# 模型的quantize_type影响该模型的PeakTops的选择
quantize_type = 'int8'
# 模型输入输出数据量
ModelInputBytes = 16*3*224*224*4
ModelOutputBytes = 16*1000*4


In [35]:
PeakTops = {
  'int8': 32768 * tpu_freq,
  'f16': 16384 * tpu_freq,
  'bf16': 16384 * tpu_freq,
  'f32': 2048 * tpu_freq,
}
ModelPeakTops = PeakTops[quantize_type]

layer_df = pd.read_csv(profile_dir + 'layer.csv')
summary_df = pd.read_csv(profile_dir + 'summary.csv')


# Columns含义
1. inputBytes：输入字节数，不包含weight
2. outputBytes：输出字节数
3. weightBytes：权重系数字节数
4. s2lBytes：Globalmem -> Localmem字节数
5. l2sBytes：Localmem -> Globalmem字节数
6. s2sBytes：Globalmem -> Globalmem字节数
7. gdmaCycles：GDMA运行的实际真实cycle数
8. gdmaTime：GDMA运行的实际真实时间，即gdmaCycles * GDMAPeriod
9. gdmaTimeRatio：GDMA耗时占GDMA总耗时的比例
10. gdmaPTheoTime：根据profile中数据得出的GDMA理论耗时
11. ddrRate：DDR效率，即gdmaPTheoTime / gdmaTime
12. LoadAvgBandwidth：GDMA平均加载带宽（S2L）
13. StoreAvgBandwidth：GDMA平均保存带宽（L2S）
14. AlgOps：有效的算法计算量，**一般由于实现方案会和真实的该模型或Layer本身的计算量有一定的差异，模型本身的计算量请参考mlir中的数据**
15. uArchOps：TPU微架构的计算量，相当于微架构利用率打满情况下的AlgOps
16. uArchCModelCycles：cmodel仿真的cycle数，**如果与tiuCycles相差过大，说明有问题，需要校准cmodel中的仿真计算**
17. uArchCModelCycleRatio：cmodel仿真中，该部分cycle占比
18. tiuCycles：TIU运行时的实际真实cycle数
19. tiuTime：TIU运行时的实际真实耗时，即tiuCycles * TIUPeriod
20. tiuTimeRatio：该部分TIU耗时占TIU总耗时的占比
21. tiuPTheoTime：根据profile中的数据得出的TIU理论耗时
22. uArchRate：TPU微架构利用率，即AlgOps / uArchOps
23. totalTime：该部分实际真实的总耗时，由于GDMA和TIU的并行，一般totalTime <= tiuTime + GDMATime
24. PeakTops：该部分的峰值算力，由于使用的指令不同，不同算子对应的峰值算力也会有一定区别
25. ActualTops：等效算力或实际使用到的算力，即AlgOps / totalTime
26. Parallelism：该部分整体并行程度，可看出不并行时耗时的增加量，计算方式(tiuTime + gdmaTime) / totalTime
27. Concurrency：该部分TIU与GDMA的并行程度，最大为100%，计算方式为(tiuTime + gdmaTime - totalTime) / min(tiuTime, gdmaTime)，

In [36]:
layer_df.head()


Unnamed: 0,LayerID,Type,TPU/CPU,DataType,Function,in,ic,ih,iw,on,oc,oh,ow,kh,kw,KStrideH,KStrideW,Padding,Other info,inputBytes,outputBytes,weightBytes,s2lBytes,l2sBytes,s2sBytes,gdmaCycles,gdmaTime(us),gdmaTimeRatio,gdmaPTheoTime(us),ddrRate,LoadAvgBandwidth(GiB/s),StoreAvgBandwidth(GiB/s),AlgOps,uArchOps,uArchCModelCycles,uArchCModelCycleRatio,tiuCycles,tiuTime(us),tiuTimeRatio,tiuPTheoTime(us),uArchRate,totalTime(us),PeakTops,ActualTops,Parallelism,Concurrency
0,267,local,TPU,{'FP32'},Softmax,16,197,12,197,16,197,12,197,1,197,1,1,"[0, 0, 0, 0]","ins=[tensor_id=266 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=267 [16x197x12x197] FP32 nslice=1 hslice=12]",29805312,29805312,0,0,0,0,0,0.0,0.000%,0.0,--,0.0,0.0,246017536,326713344,541392,1.717%,564736,564.736,1.624%,240.2515,75.301%,,1.0,,,
1,337,local,TPU,{'FP32'},Softmax,16,197,12,197,16,197,12,197,1,197,1,1,"[0, 0, 0, 0]","ins=[tensor_id=336 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=337 [16x197x12x197] FP32 nslice=1 hslice=12]",29805312,29805312,0,0,0,0,0,0.0,0.000%,0.0,--,0.0,0.0,246017536,326713344,541392,1.717%,564736,564.736,1.624%,240.2515,75.301%,,1.0,,,
2,407,local,TPU,{'FP32'},Softmax,16,197,12,197,16,197,12,197,1,197,1,1,"[0, 0, 0, 0]","ins=[tensor_id=406 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=407 [16x197x12x197] FP32 nslice=1 hslice=12]",29805312,29805312,0,0,0,0,0,0.0,0.000%,0.0,--,0.0,0.0,246017536,326713344,541392,1.717%,564736,564.736,1.624%,240.2515,75.301%,,1.0,,,
3,617,local,TPU,{'FP32'},Softmax,16,197,12,197,16,197,12,197,1,197,1,1,"[0, 0, 0, 0]","ins=[tensor_id=616 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=617 [16x197x12x197] FP32 nslice=1 hslice=12]",29805312,29805312,0,0,0,0,0,0.0,0.000%,0.0,--,0.0,0.0,246017536,326713344,541392,1.717%,564736,564.736,1.624%,240.2515,75.301%,,1.0,,,
4,757,local,TPU,{'FP32'},Softmax,16,197,12,197,16,197,12,197,1,197,1,1,"[0, 0, 0, 0]","ins=[tensor_id=756 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=757 [16x197x12x197] FP32 nslice=1 hslice=12]",29805312,29805312,0,0,0,0,0,0.0,0.000%,0.0,--,0.0,0.0,246017536,326713344,541392,1.717%,564736,564.736,1.624%,240.2515,75.301%,,1.0,,,


In [37]:
summary_df


Unnamed: 0,Function,weightBytes,s2lBytes,l2sBytes,s2sBytes,gdmaCycles,gdmaTime(us),gdmaTimeRatio,gdmaPTheoTime(us),ddrRate,AlgOps,AlgOpsRatio,uArchOps,uArchOpsRatio,tiuCycles,tiuTime(us),tiuTimeRatio,tiuPTheoTime(us),uArchRate,PeakTops,DataTypes,LayerTypes,totalTime(us),Parallelism,Concurrency,1684x FPS or Token/s
0,Matmul,86631328,3785632,2424448,0,129324,129.324,2.071%,112.103865,86.685%,563790963392,99.106%,622837575680,98.967%,23054565,23054.565,66.297%,17205.534771,90.520%,32.0,"UINT8,INT8",MatMul,,0,0,0.0
1,Softmax,0,0,0,0,0,0.0,0.000%,0.0,--,2952210432,0.519%,3920560128,0.623%,6776384,6776.384,19.487%,2883.018,75.301%,1.0,FP32,Softmax,,0,0,0.0
2,Cast,0,9649792,2472448,0,224854,224.854,3.601%,207.282346,92.185%,1075664256,0.189%,1333805056,0.212%,1364047,1364.047,3.923%,577.617281,80.646%,4.0,"FP32,FP16,INT8",Cast,,0,0,0.0
3,LayerNorm,153600,0,0,0,0,0.0,0.000%,0.0,--,628983224,0.111%,772317184,0.123%,1495611,1495.611,4.301%,307.120715,81.441%,2.0,FP16,LayerNorm,,0,0,0.0
4,Eltwise,151296,0,0,0,0,0.0,0.000%,0.0,--,210604032,0.037%,226197504,0.036%,110164,110.164,0.317%,51.417,93.106%,4.0,INT8,"MulShift,Add",,0,0,0.0
5,Lut,3072,0,0,0,0,0.0,0.000%,0.0,--,116998144,0.021%,118767616,0.019%,1845258,1845.258,5.306%,1828.096,98.510%,0.0625,INT8,Lut,,0,0,0.0
6,Others,24576,7225344,7225344,12288,998330,998.33,15.987%,269.362812,26.981%,99213312,0.017%,127909888,0.020%,128499,128.499,0.370%,24.813,77.565%,4.0,"FP16,INT8","Permute,Slice,Concat,Reshape",,0,0,0.0
7,Load,85598976,170312448,0,0,3554002,3554.002,56.913%,2734.755648,76.949%,0,0.000%,0,0.000%,0,0.0,0.000%,0.0,--,4.0,"FP32,FP16,INT32,INT8",Load,,0,0,0.0
8,Store,0,0,58097664,0,1338139,1338.139,21.429%,1229.719682,91.898%,0,0.000%,0,0.000%,0,0.0,0.000%,0.0,--,4.0,INT8,Store,,0,0,0.0
9,Overall,86963872,190973216,70219904,12288,6244649,6244.649,100%,4553.224353,72.914%,568874636792,100%,629337133056,100%,34774528,34774.528,100%,22877.616768,90.393%,32.0,,,37701.045,108.801%,53.136%,28.756681


In [38]:
layer_df.groupby('Function')['Function'].count()


Function
Add           25
Cast          78
Concat         1
LayerNorm     25
Load         243
Lut           12
MatMul        98
MulShift      12
Permute        1
Reshape       37
Slice          1
Softmax       12
Store         24
Name: Function, dtype: int64

# 1. ddrDataSize优化

ddrDataSize = s2lBytes + l2sBytes + s2sBytes

ddrMinDataSize = ModelInputBytes + ModelOutputBytes + ModelWeightBytes - UnusedWeightBytes

其中，**ModelInputBytes和ModelOutputBytes暂时无法从profile中获取到，需要自行计算填入**，ModelWeightBytes = sum(weightBytes), UnusedWeightBytes = GatherOpWeightBytes - GatherOpOutputBytes

ddrMinDataSize是不考虑LocalMem等各种因素情况下运行该模型的最小数据搬运量，ddrDataSize是实际的搬运量，应当尽可能让ddrDataSize接近ddrMinDataSize，可从如下几个方面着手：
1. 图优化：消除冗余算子，transformer的图优化就比较典型，消除了大量的permute，从而减少了搬运，这一步可通过Netron查看tpu_opt.onnx来进行
2. 遍历每一个global layer，弄清楚作为global layer的原因，然后尽可能减少global layer的存在；无法避免只能作为global layer的，尽可能减少重复搬运
3. 关注Concat、Slice、Permute、Pad等几乎没有计算，只有数据搬运的算子，可能存在某些特殊的优化方法，如inplace方法或在Load、Store时完成相关操作等

In [39]:
column_names = [
    'LayerID', 'Type', 'DataType', 'Function',
    'inputBytes', 'outputBytes', 'weightBytes',
    's2lBytes', 'l2sBytes', 's2sBytes'
]


gather_df = layer_df.loc[layer_df['Function'] == 'Gather']
UnusedWeightBytes = gather_df['weightBytes'].sum() - gather_df['outputBytes'].sum()
ModelWeightBytes = layer_df['weightBytes'].sum()
ddrMinDataSize = ModelInputBytes + ModelOutputBytes + ModelWeightBytes - UnusedWeightBytes

ddrDataSize = layer_df[['s2lBytes', 'l2sBytes', 's2sBytes']].sum().sum()
print(f"ddrDataSize = {ddrDataSize} Bytes, ddrMinDataSize = {ddrMinDataSize} Bytes")


ddrDataSize = 261205408 Bytes, ddrMinDataSize = 182260640 Bytes


# 2. uArchRate优化

从下表可以看出各类型Layer的uArchRate，得到一个基本信息，可以看出耗时占比最大的是MatMul和Softmax，它们的uArchRate依然有提升空间，其他几个虽然有提升空间，但相对收益要低一些，可以放在后面在考虑

In [40]:
column_names = [
    'uArchRate', 'Function', 'AlgOps', 'AlgOpsRatio', 'uArchOps', 'uArchOpsRatio',
    'tiuTime(us)', 'tiuTimeRatio', 'tiuPTheoTime(us)',
    'PeakTops', 'DataTypes', 'LayerTypes'
]
arch_summary_df = summary_df[column_names]
arch_summary_df.sort_values(by='tiuTime(us)', axis=0, ascending=False)


Unnamed: 0,uArchRate,Function,AlgOps,AlgOpsRatio,uArchOps,uArchOpsRatio,tiuTime(us),tiuTimeRatio,tiuPTheoTime(us),PeakTops,DataTypes,LayerTypes
9,90.393%,Overall,568874636792,100%,629337133056,100%,34774.528,100%,22877.616768,32.0,,
0,90.520%,Matmul,563790963392,99.106%,622837575680,98.967%,23054.565,66.297%,17205.534771,32.0,"UINT8,INT8",MatMul
1,75.301%,Softmax,2952210432,0.519%,3920560128,0.623%,6776.384,19.487%,2883.018,1.0,FP32,Softmax
5,98.510%,Lut,116998144,0.021%,118767616,0.019%,1845.258,5.306%,1828.096,0.0625,INT8,Lut
3,81.441%,LayerNorm,628983224,0.111%,772317184,0.123%,1495.611,4.301%,307.120715,2.0,FP16,LayerNorm
2,80.646%,Cast,1075664256,0.189%,1333805056,0.212%,1364.047,3.923%,577.617281,4.0,"FP32,FP16,INT8",Cast
6,77.565%,Others,99213312,0.017%,127909888,0.020%,128.499,0.370%,24.813,4.0,"FP16,INT8","Permute,Slice,Concat,Reshape"
4,93.106%,Eltwise,210604032,0.037%,226197504,0.036%,110.164,0.317%,51.417,4.0,INT8,"MulShift,Add"
7,--,Load,0,0.000%,0,0.000%,0.0,0.000%,0.0,4.0,"FP32,FP16,INT32,INT8",Load
8,--,Store,0,0.000%,0,0.000%,0.0,0.000%,0.0,4.0,INT8,Store


## MatMul
### step1
可以看到MatMul的uArchRate的主要有7种情况：
1. 98.5%对应的tiuTime为12.951ms，由于本身uArchRate接近100%，优化空间很小；
2. 2个接近76%的利用率总计耗时8.865ms，有较大的优化空间；
3. 剩余4中情况本身耗时很短，优化提升较小，可放在最后再考虑优化。

**注：优化到最后，不论哪一种情况都应该是进行到无法再优化的程度，并列出原因，如果因为时间问题来不及做，可简略备注一下。**

In [41]:
layer_df.loc[layer_df['Function'] == 'MatMul'].groupby('uArchRate')['tiuTime(us)'].sum()


uArchRate
100.000%      133.630
24.999%         3.483
59.387%       917.006
59.388%       183.513
76.683%      2896.128
76.957%      5969.136
98.500%     12951.669
Name: tiuTime(us), dtype: float64

### step2
重点关注2个76%的MatMul的情况，如果下图查看不方便，可以导出到表格中查看，这部分可以对照final.mlir查看为什么这些MatMul的利用率低，是LayerGroup问题，还是后端实现问题。

这里的利用率低主要都是因为左矩阵的C=197，导致无法充分利用lane，需要采取策略，尽可能打满lane

In [42]:
def get_layers(layer_df, layer_type:str):
    column_names = [
        'uArchRate', 'LayerID', 'Type', 'DataType', 'Function', 'Other info',
        'AlgOps', 'uArchOps', 'tiuTime(us)', 'tiuTimeRatio', 'tiuPTheoTime(us)',
        'totalTime(us)', 'PeakTops', 'ActualTops', 'Concurrency'
    ]
    return layer_df[column_names].loc[layer_df['Function'] == layer_type].sort_values(by='uArchRate', axis=0)


In [43]:
matmul = get_layers(layer_df, 'MatMul')
matmul.to_csv(profile_dir + 'matmu.csv')
matmul.head()


Unnamed: 0,uArchRate,LayerID,Type,DataType,Function,Other info,AlgOps,uArchOps,tiuTime(us),tiuTimeRatio,tiuPTheoTime(us),totalTime(us),PeakTops,ActualTops,Concurrency
97,100.000%,19,global,{'INT8'},MatMul,"ins=[tensor_id=16 [16x196x768] INT8 ,tensor_id=17 [1x768x768] INT8 CONST ,tensor_id=18 [1x1x768] INT32 CONST ],outs=[tensor_id=19 [16x196x768] INT8 ]",3709009920,3709009920,133.63,0.384%,113.19,172.73,32.0,21.472876,62.880%
250,24.999%,878,global,{'INT8'},MatMul,"ins=[tensor_id=875 [16x768] INT8 ,tensor_id=876 [768x1000] INT8 CONST ,tensor_id=877 [1x1000] INT32 CONST ],outs=[tensor_id=878 [16x1000] INT8 ]",24640000,98562048,3.483,0.010%,0.751953,27.49,32.0,0.896326,-0.459%
109,59.387%,129,local,{'UINT8'},MatMul,"ins=[tensor_id=128 [16x197x12x197] UINT8 nslice=1 hslice=12,tensor_id=123 [16x197x12x64] INT8 nslice=1 hslice=12],outs=[tensor_id=129 [16x197x12x64] INT8 nslice=1 hslice=12]",973236736,1638809600,91.685,0.264%,29.700828,,32.0,,
110,59.387%,479,local,{'UINT8'},MatMul,"ins=[tensor_id=478 [16x197x12x197] UINT8 nslice=1 hslice=12,tensor_id=473 [16x197x12x64] INT8 nslice=1 hslice=12],outs=[tensor_id=479 [16x197x12x64] INT8 nslice=1 hslice=12]",973236736,1638809600,91.685,0.264%,29.700828,,32.0,,
111,59.387%,759,local,{'UINT8'},MatMul,"ins=[tensor_id=758 [16x197x12x197] UINT8 nslice=1 hslice=12,tensor_id=753 [16x197x12x64] INT8 nslice=1 hslice=12],outs=[tensor_id=759 [16x197x12x64] INT8 nslice=1 hslice=12]",973261952,1638842368,91.711,0.264%,29.701598,,32.0,,


## Softmax
### step1
可以看到，Softmax只有1种情况，因此只需要考虑优化这种情况即可

In [44]:
layer_df.loc[layer_df['Function'] == 'Softmax'].groupby('uArchRate')['tiuTime(us)'].sum()


uArchRate
75.301%    6776.384
Name: tiuTime(us), dtype: float64

### step2
同样，Softmax利用率低的原因是因为C=197，无法充分利用lane

In [45]:
softmax = get_layers(layer_df, 'Softmax')
softmax.to_csv(profile_dir + 'softmax.csv')
softmax


Unnamed: 0,uArchRate,LayerID,Type,DataType,Function,Other info,AlgOps,uArchOps,tiuTime(us),tiuTimeRatio,tiuPTheoTime(us),totalTime(us),PeakTops,ActualTops,Concurrency
0,75.301%,267,local,{'FP32'},Softmax,"ins=[tensor_id=266 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=267 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,
1,75.301%,337,local,{'FP32'},Softmax,"ins=[tensor_id=336 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=337 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,
2,75.301%,407,local,{'FP32'},Softmax,"ins=[tensor_id=406 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=407 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,
3,75.301%,617,local,{'FP32'},Softmax,"ins=[tensor_id=616 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=617 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,
4,75.301%,757,local,{'FP32'},Softmax,"ins=[tensor_id=756 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=757 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,
5,75.301%,127,local,{'FP32'},Softmax,"ins=[tensor_id=126 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=127 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,
6,75.301%,62,local,{'FP32'},Softmax,"ins=[tensor_id=60 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=62 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,
7,75.301%,477,local,{'FP32'},Softmax,"ins=[tensor_id=476 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=477 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,
8,75.301%,827,local,{'FP32'},Softmax,"ins=[tensor_id=826 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=827 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,
9,75.301%,197,local,{'FP32'},Softmax,"ins=[tensor_id=196 [16x197x12x197] FP32 nslice=1 hslice=12],outs=[tensor_id=197 [16x197x12x197] FP32 nslice=1 hslice=12]",246017536,326713344,564.736,1.624%,240.2515,,1.0,,


# 3. ddrRate优化

## ddrRate低的问题需要找到对应的指令，检查是否有不满足下述限制的情况：
### (1) 4DDR-interleave：DDR0，DDR1，DDRA，DDRB, 每块DDR连续4KiB数据
检查搬运的数据是否用到了每个DDR，如果只用到了2个，则性能最多只有1/2，即30GiB/s。
### (2) 地址对齐：32B，64B
指令的src_addr和dst_addr
### (3) 和TIU指令bank冲突
### (4) 数据量：大于64KB
### (5) 最小连续的长度：大于256B
### (6) stride


In [46]:
layer_df.groupby('ddrRate')['gdmaTime(us)'].sum().sort_values(ascending=False)


ddrRate
26.958%    997.675
92.537%    222.257
94.151%    105.333
92.117%     84.394
47.372%     79.971
47.424%     79.884
47.534%     79.698
47.568%     79.641
47.579%     79.623
47.631%     79.537
47.816%     79.229
47.849%     79.174
47.866%     79.145
47.950%     79.007
48.050%     78.843
53.396%     70.949
92.005%     62.836
84.058%     60.956
85.183%     60.151
85.742%     59.759
89.374%     57.330
89.900%     56.995
89.901%     56.994
90.234%     56.784
90.396%     56.682
90.506%     56.613
90.537%     56.594
90.842%     56.404
91.024%     56.291
92.911%     55.148
94.450%     54.249
94.805%     54.046
95.007%     53.931
95.042%     53.911
95.085%     53.887
95.427%     53.694
95.556%     53.621
95.653%     53.567
95.705%     53.538
95.755%     53.510
95.801%     53.484
78.147%     49.740
79.076%     49.156
79.156%     49.106
79.456%     48.921
79.503%     48.892
79.769%     48.729
79.921%     48.636
79.954%     48.616
80.467%     48.306
80.544%     48.260
80.782%     48.118
83.0

# 4 Concurrency优化

## 每个GlobalOp的Concurrency
部分Op出现Concurrency为负数的情况，这是由于指令之间存在间隙导致totalTime > tiuTime + gdmaTime

**注：指令间隙通常是因为之前的指令执行时间过短，导致后续指令尚未填充入指令buffer导致的，另外，如果是动态网络，由于指令是由CPU实时产生发送的，这种情况会更频繁。**

In [47]:
layer_df.loc[layer_df['Type'] == 'global']


Unnamed: 0,LayerID,Type,TPU/CPU,DataType,Function,in,ic,ih,iw,on,oc,oh,ow,kh,kw,KStrideH,KStrideW,Padding,Other info,inputBytes,outputBytes,weightBytes,s2lBytes,l2sBytes,s2sBytes,gdmaCycles,gdmaTime(us),gdmaTimeRatio,gdmaPTheoTime(us),ddrRate,LoadAvgBandwidth(GiB/s),StoreAvgBandwidth(GiB/s),AlgOps,uArchOps,uArchCModelCycles,uArchCModelCycleRatio,tiuCycles,tiuTime(us),tiuTimeRatio,tiuPTheoTime(us),uArchRate,totalTime(us),PeakTops,ActualTops,Parallelism,Concurrency
97,19,global,TPU,{'INT8'},MatMul,16,196,768,1,16,196,768,1,0,0,0,0,"[0, 0, 0, 0]","ins=[tensor_id=16 [16x196x768] INT8 ,tensor_id=17 [1x768x768] INT8 CONST ,tensor_id=18 [1x1x768] INT32 CONST ],outs=[tensor_id=19 [16x196x768] INT8 ]",2408448,2408448,592896,3001344,2408448,0,105333,105.333,1.687%,99.171665,94.151%,53.830825,41.999026,3709009920,3709009920,122304,0.388%,133630,133.63,0.384%,113.19,100.000%,172.73,32.0,21.472876,138.345%,62.880%
159,13,global,TPU,{'FP32'},Cast,16,3,14,16,16,3,14,16,0,0,0,0,"[0, 0, 0, 0]","ins=[tensor_id=12 [16x3x14x16x14x16] FP32 ],outs=[tensor_id=13 [16x3x14x16x14x16] INT8 ]",9633792,2408448,0,9633792,2408448,0,222257,222.257,3.559%,205.670778,92.537%,53.26499,41.682158,9633792,9699328,14208,0.045%,14269,14.269,0.041%,9.408,99.324%,222.285,1.0,0.04334,106.407%,99.804%
160,15,global,TPU,{'INT8'},Permute,16,3,14,16,16,14,14,3,0,0,0,0,"[0, 0, 0, 0]","ins=[tensor_id=13 [16x3x14x16x14x16] INT8 ,tensor_id=14 [2408448] INT8 ],outs=[tensor_id=15 [16x14x14x3x16x16] INT8 ]",4816896,2408448,0,7225344,7225344,0,997675,997.675,15.976%,268.954095,26.958%,8.056556,41.425557,7225344,7585792,55360,0.176%,12483,12.483,0.036%,1.764,95.248%,1010.184,4.0,0.007153,99.997%,-0.208%
250,878,global,TPU,{'INT8'},MatMul,16,768,1,1,16,1000,1,1,0,0,0,0,"[0, 0, 0, 0]","ins=[tensor_id=875 [16x768] INT8 ,tensor_id=876 [768x1000] INT8 CONST ,tensor_id=877 [1x1000] INT32 CONST ],outs=[tensor_id=878 [16x1000] INT8 ]",12288,16000,772000,784288,16000,0,23991,23.991,0.384%,12.932199,53.904%,31.274893,23.429499,24640000,98562048,3252,0.010%,3483,3.483,0.010%,0.751953,24.999%,27.49,32.0,0.896326,99.942%,-0.459%
326,874,global,TPU,{'INT8'},Slice,16,197,768,1,16,1,768,1,0,0,0,0,"[0, 0, 0, 0]","ins=[tensor_id=871 [16x197x768] INT8 nslice=1 hslice=768],outs=[tensor_id=874 [16x1x768] INT8 ]",2420736,12288,0,0,0,12288,655,0.655,0.010%,0.408718,62.400%,0.0,0.0,0,0,0,0.000%,0,0.0,0.000%,0.0,--,0.655,4.0,0.0,100.000%,100%
568,879,global,TPU,{'INT8'},Cast,16,1000,1,1,16,1000,1,1,0,0,0,0,"[0, 0, 0, 0]","ins=[tensor_id=878 [16x1000] INT8 ],outs=[tensor_id=879 [16x1000] FP32 ]",16000,64000,0,16000,64000,0,2597,2.597,0.042%,1.611568,62.055%,21.076607,31.536849,48000,49152,16,0.000%,29,0.029,0.000%,0.011719,97.656%,2.632,4.0,0.018237,99.772%,-20.690%


# 5. macUtil分析

### 上限1
用MacUtilUpperLimit表示：tiuModelTheoTime / sum(tiuPTheoTime)，其中tiuModelTheoTime = totalAlgOps / PeakTops，tiuLayerTheoTime = layerAlgOps / LayerPeakTops
1. 反映了不同LayerType的PeakTops的差异对于计算macUtil的影响，这个上限很难提升


In [48]:
summary_df.set_index('Function', inplace=True)
summary_df['uArchRate'] = summary_df['uArchRate'].apply(lambda x: float(x.strip('%')) if x != '--' else x)
summary_df['Concurrency'] = summary_df['Concurrency'].apply(lambda x: float(x.strip('%')) if x != '--' else x)


In [49]:
# ModelAlgOps来自final.mlir中，这是模型原始的计算量，profile中的AlgOps是原本算子拆成多个指令后的，指令的有效计算量，与实现相关
# ModelAlgOps = 564150211456
tiuModelTheoTime = ModelAlgOps / ModelPeakTops * 1e6 # us
tiuModelPTheoTime = summary_df.at['Overall', 'tiuPTheoTime(us)'] # us
MacUtilUpperLimit = tiuModelTheoTime / tiuModelPTheoTime

print(f"tiuModelTheoTime = {tiuModelTheoTime:.2f} us\ntiuModelPTheoTime = {tiuModelPTheoTime:.2f} us \nMacUtilUpperLimit = {MacUtilUpperLimit*100:.2f}%")


tiuModelTheoTime = 17216.50 us
tiuModelPTheoTime = 22877.62 us 
MacUtilUpperLimit = 75.25%


### 上限2
用MaxMacUtil表示：tiuModelTheoTime / MAX(gdmaTime, tiuTime)
1. 反映了并行度打满的情况下macUtil的上限
2. 如果这时候macUtil依然达不到期望，那就需要优化MAX(gdmaTime, tiuTime)，通常是tiuTime，因为如果gdmaTime更长，一般这会是带宽瓶颈，从ddrUtil角度优化。


In [50]:
# 端到端runtime耗时，e2eTime来自bmrt_test的calculate time
# e2eTime = 38300. # us
macUtil0 = tiuModelTheoTime / e2eTime

# 模型纯粹的运行耗时，不考虑CPU耗时及编译空间的输入输出的搬运
totalModelTime = summary_df.at['Overall', 'totalTime(us)']
macUtil1 = tiuModelTheoTime / totalModelTime

# 并行度打满
tiuModelTime = summary_df.at['Overall', 'tiuTime(us)']
macUtil2 = tiuModelTheoTime / tiuModelTime


### macUtil影响因素分析
1. e2eTime
2. Concurrency
3. 各个算子的uArchRate

In [51]:
def get_mac_util1_with_tiu_theo_time(tiuModelTheoTime, tiuTotalTime, replace_layer_type):
    alg_ops = summary_df.at[replace_layer_type, 'AlgOps']
    peak_tops = summary_df.at["Overall", "PeakTops"]
    tiu_time = summary_df.at[replace_layer_type, 'tiuTime(us)']
    tiu_theo_time = alg_ops / (peak_tops * 1024 * tpu_freq) * 1e6
    reduced_time = tiu_time - tiu_theo_time
    tiuTotalTime = tiuTotalTime - reduced_time
    macUtil = tiuModelTheoTime / tiuTotalTime
    return [replace_layer_type + f' tiuTime: {tiu_time:.2f}us -> {tiu_theo_time:.2f}us', reduced_time, tiuTotalTime , 100.00, macUtil*100, f'{replace_layer_type}的耗时用ModelPeakTops得到的理论耗时替换']


In [52]:
def get_mac_util2_with_tiu_ptheo_time(tiuModelTheoTime, tiuTotalTime, replace_layer_type):
    tiuTime = summary_df.at[replace_layer_type, 'tiuTime(us)']
    tiuPTheoTime = summary_df.at[replace_layer_type, 'tiuPTheoTime(us)']
    reduced_time =  tiuTime - tiuPTheoTime
    tiuTotalTime = tiuTotalTime - reduced_time
    macUtil = tiuModelTheoTime / tiuTotalTime
    return [replace_layer_type + f' tiuTime: {tiuTime:.2f}us -> {tiuPTheoTime:.2f}us', reduced_time, tiuTotalTime , 100.00, macUtil*100, f'{replace_layer_type}的耗时用LayerPeakTops得到的理论耗时替换']


In [53]:
def get_mac_util3_with_full_uarch_rate(tiuModelTheoTime, tiuTotalTime, replace_layer_type):
    uArchRate = summary_df.at[replace_layer_type, 'uArchRate']
    reduced_time = summary_df.at[replace_layer_type, 'tiuTime(us)'] * (1 - uArchRate / 100)
    tiuTotalTime = tiuTotalTime - reduced_time
    macUtil = tiuModelTheoTime / tiuTotalTime
    return [replace_layer_type + f' uArchRate: {uArchRate:.2f}% -> 100%', reduced_time, tiuTotalTime, 100.00, macUtil*100, f'{replace_layer_type}的耗时用uArchRate=100%时的耗时替换']


### macUtil分析总结
由于呈现给大家的macUtil是通过tiuTheoTime / e2eTime, 此处共分析macUtil的四种影响因素：
1. end2end -> origin：origin是profile中纯粹模型运行的耗时，这一步排除CPU耗时和输入输出的搬运耗时的影响，查看此时的macUtil
2. origin -> 100% Concurrency：这一步排除并行度的干扰，即假设此时GDMA耗时被完全掩盖了，查看此时的macUtil
3. 从100%并行度状态开始，依次将每种算子的tiuTime替换为tiuTheoTime（用整个模型的峰值算力计算的理论耗时），可以看出为什么macUtil无法达到100%
4. 从100%并行度状态开始，依次替换每种算子的tiuTime为tiuPTheoTime（用该类型layer对应的峰值算力计算的理论耗时），可以看到当各个Layer替换为Profile中的理论耗时时，macUtil的提升程度
5. 从100%并行度状态开始，依次将每种算子的uArchRate打满，看看当前计算方案下(这种情况与算子的实现方案强相关，可能存在更好的计算方案)，每种Layer的uArchRate打满时，macUtil的提升程度

**注：**
1. 由于每种算子的实现可能包含了其他指令，而这些指令并不能达到该算子的PeakTops，因此，通常即便uArchRate打满，其tiuTime依然会比tiuPTheoTime要长，体现在macUtil上就是macUtil很差，很难提升。
2. 由于每条指令的算力不同，uArchRate打满情况的性能估算比较复杂，此处简单的采取tiuTime * uArchRate来估算打满时的性能，实际应当对每条指令这样计算打满的性能。

In [54]:
row_names = summary_df.index.to_list()
ops = []
for op in row_names:
  if op in ['Load', 'Store', 'Overall', 'Others']:
    continue
  ops.append(op)
ops.append('Others')
print(ops)


['Matmul', 'Softmax', 'Cast', 'LayerNorm', 'Eltwise', 'Lut', 'Others']


In [55]:
columns = ['Case', 'ReducedTime(us)', 'Time(us)', 'Concurrency(%)', 'macUtil(%)', 'Remark']
infosA = [
    ['end2end', 0, e2eTime, summary_df.at['Overall', 'Concurrency'], macUtil0*100, '真正的macUtil'],
    ['origin', e2eTime - totalModelTime, totalModelTime, summary_df.at['Overall', 'Concurrency'], macUtil1*100, '排除CPU耗时及输入输出在runtime空间和用户空间的搬运耗时'],
    ['100% Concurrency', totalModelTime-tiuModelTime, tiuModelTime,  100.00, macUtil2*100, '排除并行度的干扰'],
]

infosB = []
cur_time = tiuModelTime
for op in ops:
    infosB.append(get_mac_util1_with_tiu_theo_time(tiuModelTheoTime, cur_time, op))
    cur_time = infosB[-1][2]

infosC = []
cur_time = tiuModelTime
for op in ops:
    infosC.append(get_mac_util2_with_tiu_ptheo_time(tiuModelTheoTime, cur_time, op))
    cur_time = infosC[-1][2]

infosD = []
cur_time = tiuModelTime
for op in ops:
    infosD.append(get_mac_util3_with_full_uarch_rate(tiuModelTheoTime, cur_time, op))
    cur_time = infosD[-1][2]



infos = infosA + infosB + infosC + infosD
df = pd.DataFrame(infos, columns = columns).round(2)

# 保存到csv文件中
df.to_csv(profile_dir + 'mac_util_analysis.csv', index=False)


In [56]:
df


Unnamed: 0,Case,ReducedTime(us),Time(us),Concurrency(%),macUtil(%),Remark
0,end2end,0.0,38300.0,53.14,44.95,真正的macUtil
1,origin,598.95,37701.05,53.14,45.67,排除CPU耗时及输入输出在runtime空间和用户空间的搬运耗时
2,100% Concurrency,2926.52,34774.53,100.0,49.51,排除并行度的干扰
3,Matmul tiuTime: 23054.57us -> 17205.53us,5849.03,28925.5,100.0,59.52,Matmul的耗时用ModelPeakTops得到的理论耗时替换
4,Softmax tiuTime: 6776.38us -> 90.09us,6686.29,22239.21,100.0,77.42,Softmax的耗时用ModelPeakTops得到的理论耗时替换
5,Cast tiuTime: 1364.05us -> 32.83us,1331.22,20907.99,100.0,82.34,Cast的耗时用ModelPeakTops得到的理论耗时替换
6,LayerNorm tiuTime: 1495.61us -> 19.20us,1476.42,19431.57,100.0,88.6,LayerNorm的耗时用ModelPeakTops得到的理论耗时替换
7,Eltwise tiuTime: 110.16us -> 6.43us,103.74,19327.83,100.0,89.08,Eltwise的耗时用ModelPeakTops得到的理论耗时替换
8,Lut tiuTime: 1845.26us -> 3.57us,1841.69,17486.15,100.0,98.46,Lut的耗时用ModelPeakTops得到的理论耗时替换
9,Others tiuTime: 128.50us -> 3.03us,125.47,17360.68,100.0,99.17,Others的耗时用ModelPeakTops得到的理论耗时替换
