In [1]:
import sys

# Extend the module search path with the directory two levels up
# (presumably so the msprof_analyze import below resolves from a source checkout — confirm).
sys.path.append("../..")

from prettytable import PrettyTable, ALL
from textwrap import fill
from msprof_analyze.advisor.interface.interface import Interface

In [2]:
# Location of the collected profiling data. The directory must hold data collected
# by a single tool, at profiling level l0 or above.
# Replace the placeholder string with a real path before running.
profiling_path = r"YOUR PROFILING PATH"
interface = Interface(profiling_path=profiling_path)

### Block Dim问题识别

Block Dim问题识别主要用于发现算子未打满AI Core核或Vector核的情况,主要调优手段为AOE调优。由于AOE调优依赖静态shape,当所有算子均为动态shape时,将不会检测Block Dim问题。

下列代码为样例,主要展示如何检测Block Dim类型问题,并获取相关问题检测结果:


In [3]:
# Query the computation analysis for block dim problems.
# If the profiling data was collected with a CANN version other than 8.0.RC1, run
# 'cat <CANN install dir>/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'
# in the training/inference environment to look up the version.
# NOTE(review): presumably that version is the value to pass as cann_version — confirm.
block_dim_result = interface.get_result("computation", "block_dim_analysis", cann_version="7.0.RC1")

In [4]:
# Summary of the detected problems: show each problem's description and
# suggestion, or report that nothing was found.
problems = block_dim_result.get("problems")
if not problems:
    print("There is no suggestion related to block dim.")
else:
    problem_table = PrettyTable(problems.get("headers"))
    for row in problems.get("data"):
        # Wrap every cell at 80 characters so long text stays readable.
        problem_table.add_row([fill(str(cell), width=80) for cell in row])

    problem_table.align = "l"
    problem_table.hrules = ALL
    display(problem_table)

problem,description,suggestion,problem count,total_time(us),time ratio,income(us),income ratio
block dim,"some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast","1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\personalC\code\att\profiler\advi sor\operator_tuning_file_20240613153259.cfg'",101,814.0199999999999,1.0,,


In [5]:
# Detail view of the block dim findings, built only when problems were reported.
if problems:
    block_dim = block_dim_result.get("block dim")
    block_dim_table = PrettyTable(block_dim.get("headers"))
    block_dim_table.hrules = ALL

    for detail_row in block_dim.get("data"):
        # Wrap every cell at 80 characters before adding the row.
        wrapped_cells = [fill(str(cell), width=80) for cell in detail_row]
        block_dim_table.add_row(wrapped_cells)

    # Only the first three rows are displayed.
    display(block_dim_table[:3])

op_name,op_type,task_type,task_duration,income,block_dim,mix_block_dim,input_shapes,input_data_types,input_formats,output_shapes,output_data_types,output_formats
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35,Square,AI_VECTOR_CORE,42.76,0,16,0,"""128,128""",FLOAT,NCHW,"""128,1""",FLOAT,NCHW
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78,Square,AI_VECTOR_CORE,42.24,0,16,0,"""128,128""",FLOAT,NCHW,"""128,1""",FLOAT,NCHW
Default/lm_head-Linear/MatMul-op213,MatMulV2,AI_CORE,39.02,0,20,0,"""128,128;128,32000""",FLOAT16;FLOAT16,FORMAT_ND;FORMAT_ND,"""128,32000""",FLOAT,FORMAT_ND


### Operator No Bound问题识别
Operator No Bound问题识别主要用于发现在mte、cube、vector、scalar上均未达到bound的算子,主要调优手段为AOE调优。由于AOE调优依赖静态shape,当所有算子均为动态shape时,将不会检测Operator No Bound问题。

下列代码为样例,主要展示如何检测Operator No Bound类型问题,并获取相关问题检测结果:

In [6]:
from prettytable import PrettyTable, ALL
from textwrap import fill
from msprof_analyze.advisor.interface.interface import Interface


# Location of the collected profiling data. The directory must hold data collected
# by a single tool, at profiling level l0 or above.
# Replace the placeholder string with a real path before running.
profiling_path = r"YOUR PROFILING PATH"
interface = Interface(profiling_path=profiling_path)

In [7]:
# Query the computation analysis for operator no bound problems.
# If the profiling data was collected with a CANN version other than 8.0.RC1, run
# 'cat <CANN install dir>/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'
# in the training/inference environment to look up the version.
# NOTE(review): presumably that version is the value to pass as cann_version — confirm.
operator_no_bound_result = interface.get_result("computation", "operator_no_bound_analysis", cann_version="7.0.RC1")

In [8]:
# Summary of the detected problems: show each problem's description and
# suggestion, or report that nothing was found.
problems = operator_no_bound_result.get("problems")
if problems:
    # Build the table only after the check above: calling problems.get() on a
    # missing result would raise AttributeError before the else branch could run.
    problem_table = PrettyTable(problems.get("headers"))
    for row in problems.get("data"):
        # Wrap every cell at 80 characters so long text stays readable.
        row = [fill(str(element), width=80) for element in row]
        problem_table.add_row(row)

    problem_table.align = "l"
    problem_table.hrules = ALL
    display(problem_table)
else:
    print("There is no suggestion related to operator no bound.")

problem,description,suggestion,problem count,total_time(us),time ratio,income(us),income ratio
block dim,"some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast","1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\personalC\code\att\profiler\advi sor\operator_tuning_file_20240613153259.cfg'",101,814.0199999999999,1.0,,
operator no bound,"There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast","1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\personalC\code\att\profiler\advi sor\operator_tuning_file_20240613153259.cfg'",95,814.0199999999999,0.7985,,


In [9]:
# Detail view of the operator no bound findings.
if problems:
    operator_no_bound = operator_no_bound_result.get("operator no bound")
    # The problems summary can also list findings from other analyses, so a
    # non-empty summary does not guarantee that this detail entry exists.
    if operator_no_bound:
        operator_no_bound_table = PrettyTable(operator_no_bound.get("headers"))
        for row in operator_no_bound.get("data"):
            # Wrap every cell at 80 characters so long text stays readable.
            row = [fill(str(element), width=80) for element in row]
            operator_no_bound_table.add_row(row)
        operator_no_bound_table.hrules = ALL
        # Only the first three rows are displayed.
        display(operator_no_bound_table[:3])
    else:
        print("There is no operator no bound detail to display.")

op_name,op_type,task_type,task_duration,vec_ratio,mac_ratio,scalar_ratio,mte1_ratio,mte2_ratio,mte3_ratio,block_dim,input_shapes,input_data_types,input_formats,output_shapes,output_data_types,output_formats
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/attention_norm- LlamaRMSNorm/Square-op34Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/attention_norm-LlamaRMSNorm/ReduceMean-op35,Square,AI_VECTOR_CORE,42.76,0.4654,0.0,0.0,0.0,0.0,0.0056,16,"""128,128""",FLOAT,NCHW,"""128,1""",FLOAT,NCHW
Default/model-LlamaModel/layers-CellList/0-LLamaDecodeLayer/ffn_norm- LlamaRMSNorm/Square-op77Default/model-LlamaModel/layers- CellList/0-LLamaDecodeLayer/ffn_norm-LlamaRMSNorm/ReduceMean-op78,Square,AI_VECTOR_CORE,42.24,0.466,0.0,0.0,0.0,0.0,0.0062,16,"""128,128""",FLOAT,NCHW,"""128,1""",FLOAT,NCHW
Default/lm_head-Linear/MatMul-op213,MatMulV2,AI_CORE,39.02,0.0,0.1105,0.0119,0.0857,0.4284,0.0,20,"""128,128;128,32000""",FLOAT16;FLOAT16,FORMAT_ND;FORMAT_ND,"""128,32000""",FLOAT,FORMAT_ND


### AICPU问题识别
AICPU问题主要为识别相关算子执行时跑到AICPU上计算,并没有利用到AI CORE的计算能力的场景,主要调优手段为修改相关代码来避免AICPU算子,可参见相关资料,来避免AICPU算子的问题:
https://gitee.com/ascend/mstt/blob/master/profiler/msprof_analyze/advisor/doc/Samples%20of%20AI%20CPU%20Operator%20Replacement.md

下列代码为样例,主要展示如何检测AICPU类型问题,并获取相关问题检测结果:

In [10]:
from prettytable import PrettyTable, ALL
from textwrap import fill
from msprof_analyze.advisor.interface.interface import Interface


# Location of the collected profiling data. The directory must hold data collected
# by a single tool, at profiling level l0 or above.
# Replace the placeholder string with a real path before running.
profiling_path = r"YOUR PROFILING PATH"
interface = Interface(profiling_path=profiling_path)

In [11]:
# Query the computation analysis for aicpu problems.
# If the profiling data was collected with a CANN version other than 8.0.RC1, run
# 'cat <CANN install dir>/ascend-toolkit/latest/aarch64-linux/ascend_toolkit_install.info'
# in the training/inference environment to look up the version.
# NOTE(review): no cann_version is passed in this call — confirm whether that is intended.
aicpu_result = interface.get_result("computation", "aicpu_analysis")

Please ensure only one trace_view.json in C:\personalC\profiling_data, there will analyze first timeline profiling data.
                                                                                                    

In [12]:
# Summary of the detected problems: show each problem's description and
# suggestion, or report that nothing was found.
problems = aicpu_result.get("problems")
if problems:
    problem_table = PrettyTable(problems.get("headers"))
    for row in problems.get("data"):
        # Wrap every cell at 80 characters so long text stays readable.
        row = [fill(str(element), width=80) for element in row]
        problem_table.add_row(row)

    problem_table.align = "l"
    problem_table.hrules = ALL
    display(problem_table)
else:
    # This cell reports on the AICPU analysis, so the message names AICPU
    # (it previously said "operator no bound").
    print("There is no suggestion related to AICPU operator.")

problem,description,suggestion,problem count,total_time(us),time ratio,income(us),income ratio
block dim,"some operator does not make full use of 25 ai core or 50 ai vector core; Top-10 operator of task duration are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast","1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\personalC\code\att\profiler\advi sor\operator_tuning_file_20240613153259.cfg'",101,814.0199999999999,1.0,,
operator no bound,"There is no mte, cube, vector, scalar ratio is more than 80.00%; Top task duration operators need to be tuned are as follows: Square, MatMulV2, BatchMatMul, SoftmaxV2, Mul, Transpose, Assign, GatherV2, Sigmoid, Cast","1. Optimize operator by AOE, such as: 'aoe --job_type=2 --model_path=$user_dump_path --tune_ops_file=c:\personalC\code\att\profiler\advi sor\operator_tuning_file_20240613153259.cfg'",95,814.0199999999999,0.7985,,
AICPU operator,"Some operators and task duration exceed 20 us, such as : Cast",1. Modify code to avoid aicpu operator,39,686568.860000001,0.0189,,


In [13]:
# Detail view of the AICPU operator findings.
if problems:
    aicpu = aicpu_result.get("AICPU operator")
    # The problems summary can also list findings from other analyses, so a
    # non-empty summary does not guarantee that this detail entry exists.
    if aicpu:
        aicpu_table = PrettyTable(aicpu.get("headers"))
        for row in aicpu.get("data"):
            # Wrap every cell at 80 characters so long text stays readable.
            row = [fill(str(element), width=80) for element in row]
            aicpu_table.add_row(row)
        aicpu_table.hrules = ALL
        # Only the first two rows are displayed.
        display(aicpu_table[:2])
    else:
        print("There is no AICPU operator detail to display.")

op_name,op_type,task_duration,input_shapes,input_data_types,input_formats,output_shapes,output_data_types,output_formats,stack_info
trans_Cast_5,Cast,493.64,"""""",INT32,FORMAT_ND,"""""",UINT64,FORMAT_ND,/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): dropout; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/dropout.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/module.py(184): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; ../../pretrain_gpt.py(88): forward_step; /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; /home/s30040711/Megatron- LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(96): forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): train_step; /profiling_auto_GPT3/megatron/training.py(837): train; /profiling_auto_GPT3/megatron/training.py(152): pretrain; ../../pretrain_gpt.py(122): <module>
trans_Cast_5,Cast,413.4,"""""",INT32,FORMAT_ND,"""""",UINT64,FORMAT_ND,/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/functional.py(1279): dropout; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/dropout.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(236): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/language_model.py(425): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/gpt_model.py(84): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/module.py(184): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; /profiling_auto_GPT3/megatron/model/distributed.py(58): forward; /usr/local/python3.7.5/lib/python3.7/site- packages/torch/nn/modules/module.py(1110): _call_impl; ../../pretrain_gpt.py(88): forward_step; /profiling_auto_GPT3/megatron/schedules.py(118): forward_step; /home/s30040711/Megatron- LM/megatron_npu_adaptor/megatron_npu/adaptor_schedules.py(109): forward_backward_no_pipelining; /profiling_auto_GPT3/megatron/training.py(419): train_step; /profiling_auto_GPT3/megatron/training.py(837): train; /profiling_auto_GPT3/megatron/training.py(152): pretrain; ../../pretrain_gpt.py(122): <module>
