In [1]:
import sys
sys.path.append("../..")

In [5]:
from msprof_analyze.advisor.interface.interface import Interface
import matplotlib.pyplot as plt
import numpy as np
from prettytable import PrettyTable, ALL
from textwrap import fill

# 集群调优分析
## 1. 集群分析的数据准备
当前支持PyTorch多卡大模型的集群分析。使用前需要提供集群分析的profiling_path路径，其目录结构示例如下：  
--{profiling_path}  
    -- xxxx_ascend_pt  
    -- xxxx_ascend_pt  
    -- xxxx_ascend_pt  
    ......  
    -- xxxx_ascend_pt  
其中每张卡的profiling数据目录均以ascend_pt结尾。  

## 2. 集群分析解决的问题  
当前的功能主要有四项：  
1）. 识别多卡间的计算慢卡（根据计算时间等推断）  
2）. 识别多卡间的通信慢现象（根据通信链路的带宽判断）  
3）. 对多卡间的计算算子进行统计展示（识别不同卡的算子差异）  
4）. 展示集群流水并行图（根据时间轴展示多卡间的计算和通信时间）  

In [6]:
# EDIT THE PROFILING DATA PATH: replace the placeholder below with the cluster
# profiling directory (the parent folder that holds the per-rank *_ascend_pt folders).
cluster_path = r"YOUR PROFILING PATH"
# Advisor entry point; the analysis cells below query it through get_result(...).
interface = Interface(profiling_path=cluster_path)

## 1) 识别慢卡

In [7]:
# Run the cluster "slow_rank" analysis; the result is read with .get(...) in the next cells.
slow_rank_result = interface.get_result(
    "cluster",
    "slow_rank",
    template_key="overall",
)

In [8]:
# Render the slow-rank detail table, if the analysis produced one.
slow_rank_data = slow_rank_result.get("慢卡分析")
if slow_rank_data:
    slow_rank_table = PrettyTable(slow_rank_data.get("headers"))
    # Draw a horizontal rule after every row so wrapped cells stay readable.
    slow_rank_table.hrules = ALL
    for record in slow_rank_data.get("data"):
        # Stringify each cell and wrap it at 80 characters.
        wrapped_cells = [fill(str(cell), width=80) for cell in record]
        slow_rank_table.add_row(wrapped_cells)
    # Show only the first 16 rows to keep the notebook output short.
    display(slow_rank_table[:16])

step,rank_id,compute(us),communication(us),free(us)
5,0,1725149.7719999943,520820.886,2897916.4370001294


In [9]:
# Summarise the problems reported by the slow-rank analysis.
problems = slow_rank_result.get("问题综述")
if problems:  # build the table only when a problem summary is present
    # Read the headers inside the guard: calling .get on a missing summary (None)
    # would raise AttributeError before the fallback message could be printed.
    # Only the first two columns are shown.
    headers = problems.get('headers')[:2]
    problem_table = PrettyTable(headers)
    for row in problems.get("data"):
        # Wrap just the two displayed cells at 100 characters.
        row = [fill(str(element), width=100) for element in row[:2]]
        problem_table.add_row(row)
    display(problem_table)
else:
    print("There is no suggestion related to slow rank analysis.")

category,description
慢卡分析,没有慢节点问题


## 2) 识别通信链路慢

In [10]:
# Run the cluster "slow_link" analysis; the result is read with .get(...) in the next cells.
slow_link_result = interface.get_result(
    "cluster",
    "slow_link",
    template_key="overall",
)

In [11]:
# Render the slow-link detail table, if the analysis produced one.
# NOTE(review): the other result keys in this notebook are Chinese ("慢卡分析",
# "问题综述"); confirm that "slow_link_analysis" is still the key the analyzer emits.
slow_link_data = slow_link_result.get("slow_link_analysis")
if slow_link_data:
    slow_link_table = PrettyTable(slow_link_data.get("headers"))
    for row in slow_link_data.get("data"):
        # Build a new wrapped row instead of overwriting row[i] in place, so the
        # values stored in slow_link_result are not replaced by wrapped strings.
        wrapped_row = [fill(str(element), width=60) for element in row]
        slow_link_table.add_row(wrapped_row)
    # Draw a horizontal rule after every row so wrapped cells stay readable.
    slow_link_table.hrules = ALL
    # Show only the first 16 rows to keep the notebook output short.
    display(slow_link_table[:16])

In [12]:
# Summarise the problems reported by the slow-link analysis.
problems = slow_link_result.get("问题综述")
if problems:  # build the table only when a problem summary is present
    # Read the headers inside the guard: calling .get on a missing summary (None)
    # would raise AttributeError before the fallback message could be printed.
    # Only the first two columns are shown.
    headers = problems.get('headers')[:2]
    problem_table = PrettyTable(headers)
    for row in problems.get("data"):
        # Wrap just the two displayed cells at 100 characters.
        row = [fill(str(element), width=100) for element in row[:2]]
        problem_table.add_row(row)
    display(problem_table)
else:
    print("There is no suggestion related to slow link analysis.")

category,description
慢卡分析,没有慢节点问题
慢链路分析,RDMA bandwidth(GB/s)： 平均值是 24.356， 但最大值是 24.356GB/s ， 最小值是 24.356GB/s。 差距为 0.0GB/s。 SDMA bandwidth(GB/s)： 平均值是 18.181， 但最大值是 18.181GB/s ， 最小值是 18.181GB/s。 差距为 0.0GB/s。


## 3) 分布式卡上的kernel算子统计展示

In [13]:
from msprof_analyze.advisor.advisor_backend.interface import Interface
import matplotlib.pyplot as plt
import numpy as np

In [14]:
# NOTE: this rebinds `interface` to the advisor_backend Interface imported in the
# previous cell; it replaces the advisor Interface used by the get_result(...) cells above.
interface = Interface(cluster_path)
# Fetch the per-rank kernel statistics for the cluster.
dataset = interface.get_data('cluster', 'kernel')

In [15]:
# Display the kernel statistics (one row per rank id / kernel name / shape combination,
# judging by the captured output below).
dataset

Unnamed: 0,rank id,Name,Input Shapes,Input Data Types,Output Shapes,Duration(us)_mean,Duration(us)_var,Duration(us)_max,Duration(us)_min,Duration(us)_count,Duration(us)_sum
0,0,ApplyAdamW1,"""2048;2048;2048;;;;;;;;2048""",FLOAT;FLOAT;FLOAT;FLOAT;FLOAT;FLOAT;FLOAT;FLOA...,"""2048;2048;2048""",4.060000,0.003200,4.100,4.020,2,8.120
1,0,ApplyAdamW1,"""273890816;273890816;273890816;;;;;;;;273890816""",FLOAT;FLOAT;FLOAT;FLOAT;FLOAT;FLOAT;FLOAT;FLOA...,"""273890816;273890816;273890816""",7369.147000,,7369.147,7369.147,1,7369.147
2,0,SwiGlu,"""20480,10240""",DT_BF16,"""20480,5120""",600.245391,1004.557501,680.394,540.771,64,38415.705
3,0,SwiGluGrad,"""20480,5120;20480,10240""",DT_BF16;DT_BF16,"""20480,10240""",859.961437,83.216257,885.318,849.877,32,27518.766
4,0,aclnnAbs_AbsAiCore_Abs,"""8192,1,2048""",DT_BF16,"""8192,1,2048""",26.174406,10.995655,31.521,19.660,32,837.581
...,...,...,...,...,...,...,...,...,...,...,...
619,0,aclnnSubs_SubAiCore_Sub,"""8165,1;""",INT64;INT64,"""8165,1""",5.600000,,5.600,5.600,1,5.600
620,0,aclnnSubs_SubAiCore_Sub,"""8168,1;""",INT64;INT64,"""8168,1""",5.620000,,5.620,5.620,1,5.620
621,0,aclnnSubs_SubAiCore_Sub,"""8;""",INT32;INT32,"""8""",1.580000,,1.580,1.580,1,1.580
622,0,aclnnTopk_CastAiCore_Cast,"""4096,5""",INT32,"""4096,5""",7.250000,0.010426,7.600,7.020,32,232.000


In [16]:
# 保存到csv查看， 可修改保存路径
dataset.to_csv('cluster_kernel_details.csv', index=False, sep='\t')

## 4) 展示集群流水并行图
使用说明：  
1）. 需要使用Ascend Torch Profiler采集数据，如果需要展示FP和BP需要将activities设置为采集CPU和NPU  
2）. rank_ids为要展示的rank id列表，必选参数, 可视化顺序与rank_ids的顺序一致  
3）. worker_num为多进程数量，可选参数，请根据机器配置调整，默认值为机器可用核心数的一半  
4）. 如果没有采集CPU数据，则展示Stage和Bubble的流水图  
5）. 生成的json文件可以在chrome trace中查看  

示例图：
![pipeline_view](../../profiler/msprof_analyze/test/resource/pipeline_view.png)

In [None]:
import json

# rank_ids (required): the rank ids to show; the display order follows the order
# of this list, which may also be built with a list comprehension.
# worker_num (optional): number of worker processes; tune it to the machine
# (defaults to half of the available cores).
dataset = interface.get_data(
    "cluster",
    "pipeline",
    rank_ids=[0, 1, 2, 3, 4, 5, 6, 7],
    worker_num=8,
)

# Write the pipeline events as JSON so they can be opened in chrome trace.
with open("./pipeline_view.json", "w") as trace_file:
    json.dump(dataset.get("data", []), trace_file)