In [None]:
import sys

sys.path.append("../..")

from prettytable import PrettyTable, ALL
from textwrap import fill
from msprof_analyze.advisor.interface.interface import Interface

In [None]:
# Point to the collected profiling data. The profiling directory given here must hold data collected by one and the same tool, and the collection level must be l0 or above.
profiling_path = r"YOUR PROFILING PATH"
interface = Interface(profiling_path=profiling_path)

### Block Dim问题识别

Block Dim问题主要为识别相关core算子AI core核未打满或者Vector 核未打满问题,主要调优手段为AOE调优,由于AOE调优依赖静态shape,所以当所有算子都为动态shape时,将不会检测相关Block Dim问题.

下列代码为样例,主要展示如何检测Block Dim类型问题,并获取相关问题检测结果:


In [None]:
# Query whether the computation analysis reports any block dim problem.
# If the profiling data was collected with a CANN version other than 8.0.RC1, run 'cat <CANN install dir>/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info' in the training/inference environment to look up the version.
block_dim_result = interface.get_result("computation", "block_dim_analysis", cann_version="7.0.RC1")

In [14]:
# Pull the overall problem summary out of the block dim analysis result.
problems = block_dim_result.get("问题综述")
if not problems:
    print("There is no suggestion related to block dim.")
else:
    # Problems were reported: render their descriptions and suggestions.
    problem_table = PrettyTable(problems.get("headers"))
    for record in problems.get("data"):
        # Wrap every cell at 80 characters so long descriptions stay readable.
        wrapped = [fill(str(cell), width=80) for cell in record]
        problem_table.add_row(wrapped)

    problem_table.align = "l"
    problem_table.hrules = ALL
    display(problem_table)

category,description,suggestion,problem count,total_time(us),time ratio,income(us),income ratio
AICore核数,"一些算子没有充分利用24个AICore核或者48个AIVector核; 任务耗时最长的10个算子如下：TransData, ArgMaxWithValue, GroupedMatmul, ConcatD, BroadcastTo, Tile, MatMulV2, Mul, Cast, Fill",,2935,2245970.2129999925,0.1078,,


In [16]:
if problems:  # problems were reported, so fetch the detection details
    # A non-empty summary does not guarantee that this particular category is
    # present in the result, so check the entry before dereferencing it.
    block_dim = block_dim_result.get("AICore核数")
    if block_dim is None:
        print("There is no detail related to block dim.")
    else:
        block_dim_table = PrettyTable(block_dim.get("headers"))
        for row in block_dim.get("data"):
            # Wrap each cell at 80 characters to keep wide columns readable.
            row = [fill(str(element), width=80) for element in row]
            block_dim_table.add_row(row)

        block_dim_table.hrules = ALL
        # Show only the first three rows as a sample.
        display(block_dim_table[:3])

op_name,op_type,task_type,task_duration,income,block_dim,mix_block_dim,input_shapes,input_data_types,input_formats,output_shapes,output_data_types,output_formats
aclnnMatmul_TransData_TransData,TransData,AI_VECTOR_CORE,4868.317,0,26,0,"""8192,155136""",DT_BF16,FORMAT_ND,"""9696,512,16,16""",DT_BF16,FRACTAL_NZ
aclnnMatmul_TransData_TransData,TransData,AI_VECTOR_CORE,4838.857,0,26,0,"""8192,155136""",DT_BF16,FORMAT_ND,"""9696,512,16,16""",DT_BF16,FRACTAL_NZ
aclnnMatmul_TransData_TransData,TransData,AI_VECTOR_CORE,4798.156,0,26,0,"""8192,155136""",DT_BF16,FORMAT_ND,"""9696,512,16,16""",DT_BF16,FRACTAL_NZ


### Operator No Bound问题识别
Operator No Bound问题主要为识别相关算子无mte, cube, vector, scalar相关bound问题,主要调优手段为AOE调优,由于AOE调优依赖静态shape,所以当所有算子都为动态shape时,将不会检测相关Operator No Bound问题.

下列代码为样例,主要展示如何检测Operator No Bound类型问题,并获取相关问题检测结果:

In [17]:
# NOTE(review): these imports and the Interface setup repeat the notebook's first two cells;
# presumably kept so this section can be run on its own — confirm.
from prettytable import PrettyTable, ALL
from textwrap import fill
from msprof_analyze.advisor.interface.interface import Interface


# Point to the collected profiling data. The profiling directory given here must hold data collected by one and the same tool, and the collection level must be l0 or above.
profiling_path = r"YOUR PROFILING PATH"
interface = Interface(profiling_path=profiling_path)

In [None]:
# Query whether the computation analysis reports any operator no bound problem.
# If the profiling data was collected with a CANN version other than 8.0.RC1, run 'cat <CANN install dir>/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info' in the training/inference environment to look up the version.
operator_no_bound_result = interface.get_result("computation", "operator_no_bound_analysis", cann_version="7.0.RC1")

In [19]:
# Fetch the overall problem summary from the operator no bound analysis result.
problems = operator_no_bound_result.get("问题综述")
if problems:  # problems were reported, so show their descriptions and suggestions
    # Build the table only inside the guard: calling problems.get() on a
    # missing summary would raise before the fallback message could be printed.
    problem_table = PrettyTable(problems.get("headers"))
    for row in problems.get("data"):
        # Wrap each cell at 80 characters to keep long descriptions readable.
        row = [fill(str(element), width=80) for element in row]
        problem_table.add_row(row)

    problem_table.align = "l"
    problem_table.hrules = ALL
    display(problem_table)
else:
    print("There is no suggestion related to operator no bound.")

category,description,suggestion,problem count,total_time(us),time ratio,income(us),income ratio
AICore核数,"一些算子没有充分利用24个AICore核或者48个AIVector核; 任务耗时最长的10个算子如下：TransData, ArgMaxWithValue, GroupedMatmul, ConcatD, BroadcastTo, Tile, MatMulV2, Mul, Cast, Fill",,2935,2245970.2129999925,0.1078,,
算子瓶颈,"mte,cube,vetor,scalar比都没有超过 80.00%，需要调整的任务执行时间最长的算子如下： RealDiv, Exp, Sub, Mul, ApplyAdamW, Cast, Add, TransData, TensorMove, ArgMaxWithValue",,6612,2245970.2129999925,0.3826,,


In [22]:
if problems:  # problems were reported, so fetch the detection details
    # A non-empty summary does not guarantee that this particular category is
    # present in the result, so check the entry before dereferencing it.
    operator_no_bound = operator_no_bound_result.get("算子瓶颈")
    if operator_no_bound is None:
        print("There is no detail related to operator no bound.")
    else:
        operator_no_bound_table = PrettyTable(operator_no_bound.get("headers"))
        for row in operator_no_bound.get("data"):
            # Wrap each cell at 80 characters to keep wide columns readable.
            row = [fill(str(element), width=80) for element in row]
            operator_no_bound_table.add_row(row)
        operator_no_bound_table.hrules = ALL
        # Show only the first three rows as a sample.
        display(operator_no_bound_table[:3])

op_name,op_type,task_type,task_duration,vec_ratio,mac_ratio,scalar_ratio,mte1_ratio,mte2_ratio,mte3_ratio,block_dim,input_shapes,input_data_types,input_formats,output_shapes,output_data_types,output_formats
aclnnDivs_RealDivAiCore_RealDiv,RealDiv,AI_VECTOR_CORE,28468.789,0.095,0.0,0.0,0.0,0.0,0.4,48,"""87351296;""",FLOAT;FLOAT,FORMAT_ND;FORMAT_ND,"""87351296""",FLOAT,FORMAT_ND
aclnnExp_ExpAiCore_Exp,Exp,AI_VECTOR_CORE,8166.943,0.06,0.0,0.0,0.0,0.0,0.505,48,"""8165,1,155136""",FLOAT,FORMAT_NCL,"""8165,1,155136""",FLOAT,FORMAT_ND
aclnnExp_ExpAiCore_Exp,Exp,AI_VECTOR_CORE,8154.463,0.06,0.0,0.0,0.0,0.0,0.509,48,"""8168,1,155136""",FLOAT,FORMAT_NCL,"""8168,1,155136""",FLOAT,FORMAT_ND


### AICPU问题识别
AICPU问题主要为识别相关算子执行时跑到AICPU上计算,并没有利用到AI CORE的计算能力的场景,主要调优手段为修改相关代码来避免AICPU算子,可参见相关资料,来避免AICPU算子的问题:
https://gitcode.com/Ascend/mstt/blob/master/profiler/msprof_analyze/advisor/doc/Samples%20of%20AI%20CPU%20Operator%20Replacement.md

下列代码为样例,主要展示如何检测AICPU类型问题,并获取相关问题检测结果:

In [23]:
# NOTE(review): these imports and the Interface setup repeat the notebook's first two cells;
# presumably kept so this section can be run on its own — confirm.
from prettytable import PrettyTable, ALL
from textwrap import fill
from msprof_analyze.advisor.interface.interface import Interface


# Point to the collected profiling data. The profiling directory given here must hold data collected by one and the same tool, and the collection level must be l0 or above.
profiling_path = r"YOUR PROFILING PATH"
interface = Interface(profiling_path=profiling_path)

In [None]:
# Query whether the computation analysis reports any aicpu problem.
# If the profiling data was collected with a CANN version other than 8.0.RC1, run 'cat <CANN install dir>/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info' in the training/inference environment to look up the version.
# NOTE(review): unlike the block dim and operator no bound queries in this notebook, no cann_version argument is passed here — confirm whether that is intentional.
aicpu_result = interface.get_result("computation", "aicpu_analysis")

In [27]:
# Fetch the overall problem summary from the aicpu analysis result.
problems = aicpu_result.get("问题综述")
if problems:  # problems were reported, so show their descriptions and suggestions
    problem_table = PrettyTable(problems.get("headers"))
    for row in problems.get("data"):
        # Wrap each cell at 80 characters to keep long descriptions readable.
        row = [fill(str(element), width=80) for element in row]
        problem_table.add_row(row)

    problem_table.align = "l"
    problem_table.hrules = ALL
    display(problem_table)
else:
    # This cell reports on the aicpu analysis, so the message names aicpu
    # (it previously said "operator no bound").
    print("There is no suggestion related to aicpu.")

category,description,suggestion,problem count,total_time(us),time ratio,income(us),income ratio
AICore核数,"一些算子没有充分利用24个AICore核或者48个AIVector核; 任务耗时最长的10个算子如下：TransData, ArgMaxWithValue, GroupedMatmul, ConcatD, BroadcastTo, Tile, MatMulV2, Mul, Cast, Fill",,2935,2245970.2129999925,0.1078,,
算子瓶颈,"mte,cube,vetor,scalar比都没有超过 80.00%，需要调整的任务执行时间最长的算子如下： RealDiv, Exp, Sub, Mul, ApplyAdamW, Cast, Add, TransData, TensorMove, ArgMaxWithValue",,6612,2245970.2129999925,0.3826,,
AICPU算子,"一些算子和任务执行时间超过了20us，比如： Min, Max, Bincount, Equal",1. 修改代码避免使用aicpu类算子 2. 尝试将double类型的算子转换成float，比如aclnnEqScalar_EqualAiCpu_Equal,100,2245970.2129999925,0.0054,,


In [29]:
if problems:  # problems were reported, so fetch the detection details
    # A non-empty summary does not guarantee that this particular category is
    # present in the result, so check the entry before dereferencing it.
    aicpu = aicpu_result.get("AICPU算子")
    if aicpu is None:
        print("There is no detail related to aicpu.")
    else:
        aicpu_table = PrettyTable(aicpu.get("headers"))
        for row in aicpu.get("data"):
            # Wrap each cell at 80 characters to keep wide columns readable.
            row = [fill(str(element), width=80) for element in row]
            aicpu_table.add_row(row)
        aicpu_table.hrules = ALL
        # Show only the first two rows as a sample.
        display(aicpu_table[:2])

op_name,op_type,task_duration,input_shapes,input_data_types,input_formats,output_shapes,output_data_types,output_formats,stack_info
aclnnBincount_BincountAiCpu_Bincount,Bincount,363.167,"""20512;;20512""",INT32;INT32;INT64,FORMAT_ND;FORMAT_ND;FORMAT_ND,"""32""",INT64,FORMAT_ND,/opt/tiger/janus/janus/megatron/gate.py(332): native_expert_histogram; /opt/tiger/janus/janus/megatron/gate.py(383): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/janus/janus/megatron/lego_moe_layer.py(405): forward_; /opt/tiger/janus/janus/megatron/lego_moe_layer.py(477): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/layers/moe.py(468): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/layers/transformer.py(735): _forward; /opt/tiger/mariana/mariana/models/layers/transformer.py(334): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/text/transformer.py(509): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/text/gpt2_megatron.py(204): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/text/gpt2_megatron.py(445): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; 
/usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/Megatron-LM/megatron/model/module.py(286): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/Megatron-LM/megatron/model/distributed.py(230): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; tasks/gpt2/unsup/model.py(615): forward; tasks/gpt2/unsup/model.py(881): forward_step; /opt/tiger/Megatron-LM/megatron/schedules.py(305): forward_step; /opt/tiger/Megatron-LM/megatron/schedules.py(470): forward_backward_no_pipelining; tasks/gpt2/unsup/model.py(961): _megatron_fwd_bwd_function; tasks/gpt2/unsup/model.py(1246): training_step; /opt/tiger/cruise/cruise/module/wrapper.py(28): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/cruise/cruise/trainer/common_trainer.py(1451): _train_one_step_forward; /opt/tiger/cruise/cruise/trainer/common_trainer.py(1771): _train_one_epoch; /opt/tiger/cruise/cruise/trainer/common_trainer.py(920): fit; tasks/gpt2/unsup/model.py(1935): main; /usr/local/lib/python3.8/site- packages/torch/distributed/elastic/multiprocessing/errors/__init__.py(346): wrapper; tasks/gpt2/unsup/model.py(1939): <module>
aclnnBincount_BincountAiCpu_Bincount,Bincount,339.527,"""20512;;20512""",INT32;INT32;INT64,FORMAT_ND;FORMAT_ND;FORMAT_ND,"""32""",INT64,FORMAT_ND,/opt/tiger/janus/janus/megatron/gate.py(332): native_expert_histogram; /opt/tiger/janus/janus/megatron/gate.py(383): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/janus/janus/megatron/lego_moe_layer.py(405): forward_; /opt/tiger/janus/janus/megatron/lego_moe_layer.py(477): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/layers/moe.py(468): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/layers/transformer.py(735): _forward; /opt/tiger/mariana/mariana/models/layers/transformer.py(334): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/text/transformer.py(509): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/text/gpt2_megatron.py(204): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/mariana/mariana/models/text/gpt2_megatron.py(445): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; 
/usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/Megatron-LM/megatron/model/module.py(286): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/Megatron-LM/megatron/model/distributed.py(230): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; tasks/gpt2/unsup/model.py(615): forward; tasks/gpt2/unsup/model.py(881): forward_step; /opt/tiger/Megatron-LM/megatron/schedules.py(305): forward_step; /opt/tiger/Megatron-LM/megatron/schedules.py(461): forward_backward_no_pipelining; tasks/gpt2/unsup/model.py(961): _megatron_fwd_bwd_function; tasks/gpt2/unsup/model.py(1246): training_step; /opt/tiger/cruise/cruise/module/wrapper.py(28): forward; /usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py(1527): _call_impl; /usr/local/lib/python3.8/site- packages/torch/nn/modules/module.py(1518): _wrapped_call_impl; /opt/tiger/cruise/cruise/trainer/common_trainer.py(1451): _train_one_step_forward; /opt/tiger/cruise/cruise/trainer/common_trainer.py(1771): _train_one_epoch; /opt/tiger/cruise/cruise/trainer/common_trainer.py(920): fit; tasks/gpt2/unsup/model.py(1935): main; /usr/local/lib/python3.8/site- packages/torch/distributed/elastic/multiprocessing/errors/__init__.py(346): wrapper; tasks/gpt2/unsup/model.py(1939): <module>
