In [2]:
import sys
sys.path.append("../..")

In [3]:
from msprof_analyze.advisor.interface.interface import Interface
import matplotlib.pyplot as plt
import numpy as np
from prettytable import PrettyTable, ALL
from textwrap import fill

# 集群调优分析
## 1. 集群分析的数据准备
当前支持PyTorch多卡大模型的集群分析。您需要提供集群分析的profiling_path路径，其目录结构示例如下：  
--{profiling_path}  
    -- xxxx_ascend_pt  
    -- xxxx_ascend_pt  
    -- xxxx_ascend_pt  
    ......  
    -- xxxx_ascend_pt  
其中每张卡的profiling数据均以ascend_pt结尾命名。  

## 2. 集群分析解决的问题  
当前的功能主要有四项：  
1）. 识别多卡间的计算慢卡（根据计算时间等推断）  
2）. 识别多卡间的通信慢现象（根据通信链路的带宽判断）  
3）. 对多卡间的计算算子进行统计展示（识别不同卡的算子差异）  
4）. 展示集群流水并行图（根据时间轴展示多卡间的计算和通信时间）  

In [4]:
# Configuration: replace the placeholder below with the directory that holds the
# per-rank profiling data (one *_ascend_pt entry per card, as described above).
cluster_path = r"YOUR PROFILING PATH"
# Advisor entry point bound to that path; the analysis cells below call
# interface.get_result(...) on this object.
interface = Interface(profiling_path=cluster_path)

## 1) 识别慢卡

In [5]:
# Request the "slow_rank" analysis in "cluster" mode. The result is read with
# .get("slow_rank_analysis") and .get("problems") in the following cells.
slow_rank_result = interface.get_result("cluster", "slow_rank")

[INFO]Cluster has been analyzed because of the existence of cluster analysis output directory.
[INFO]Skip Cluster analyze backend.


In [6]:
# Render the per-rank table stored under "slow_rank_analysis".
# Nothing is displayed when that entry is missing or empty.
slow_rank_data = slow_rank_result.get("slow_rank_analysis")
if slow_rank_data:
    slow_rank_table = PrettyTable(slow_rank_data.get("headers"))
    slow_rank_table.hrules = ALL  # horizontal rule after every row
    for raw_row in slow_rank_data.get("data"):
        # Stringify each cell and wrap it at 80 characters so long values
        # do not blow up the column width.
        wrapped_cells = [fill(str(cell), width=80) for cell in raw_row]
        slow_rank_table.add_row(wrapped_cells)
    # Only the first 16 rows are shown to keep the output short.
    display(slow_rank_table[:16])

rank_id,compute,communication,free
0,28976239.07999987,7586795.419999811,6836641.679994211
1,29012279.100000106,6984613.220000025,7388343.859991224
2,29019115.32300051,7489956.633000028,6881360.253991371
3,29027089.56000008,7963312.239999794,6389981.899993688
4,29044786.93699965,6533618.639000017,7780517.153990813
5,29178186.259999853,7925184.420000028,6286867.999995028
6,29025331.189999904,6386639.90799992,7941798.704992032
7,29056803.304999545,7234444.826000024,7094608.035991492
8,31383314.980000228,3973806.617,8017981.379989724
9,31360536.36200019,4757458.825000002,7277062.386991671


In [10]:
# Show the problems reported by the slow-rank analysis, if any.
problems = slow_rank_result.get("problems")
if problems:
    # Read the headers only after confirming that problems exist; doing it
    # before the guard raised AttributeError when "problems" was missing and
    # made the else-branch below unreachable.
    # Only the first two columns are displayed.
    headers = problems.get("headers")[:2]
    problem_table = PrettyTable(headers)
    for row in problems.get("data"):
        # Wrap the two displayed cells at 100 characters.
        row = [fill(str(element), width=100) for element in row[:2]]
        problem_table.add_row(row)
    display(problem_table)
else:
    print("There is no suggestion related to slow rank analysis.")

problem,description
slow_rank_analysis,"Computing has some issues in the cluster, because the max difference of Computing time has reached 2411.538ms. Communication has some issues in the cluster, because the max difference of Communication time has reached 3989.506ms."


## 2) 识别通信链路慢

In [12]:
# Request the "slow_link" analysis in "cluster" mode. The result is read with
# .get("slow_link_analysis") and .get("problems") in the following cells.
slow_link_result = interface.get_result("cluster", "slow_link")

[INFO]Cluster has been analyzed because of the existence of cluster analysis output directory.
[INFO]Skip Cluster analyze backend.


In [13]:
# Render the per-rank bandwidth table stored under "slow_link_analysis".
# Nothing is displayed when that entry is missing or empty.
slow_link_data = slow_link_result.get("slow_link_analysis")
if slow_link_data:
    slow_link_table = PrettyTable(slow_link_data.get("headers"))
    for row in slow_link_data.get("data"):
        # Build a new wrapped list instead of assigning to row[i]: the rows
        # belong to slow_link_result, and overwriting them in place would turn
        # the stored values into wrapped strings for any later cell.
        wrapped_row = [fill(str(element), width=60) for element in row]
        slow_link_table.add_row(wrapped_row)
    slow_link_table.hrules = ALL  # horizontal rule after every row
    # Only the first 16 rows are shown to keep the output short.
    display(slow_link_table[:16])

rank_id,RDMA bandwidth(GB/s),RDMA size(mb),RDMA time(ms),SDMA bandwidth(GB/s),SDMA size(mb),SDMA time(ms)
0,0,0,0,9.7668,42507.3469439998,4352.225880000002
1,0,0,0,10.1653,42507.3467759998,4181.611080000001
2,0,0,0,10.471,42507.3467759998,4059.527798999999
3,0,0,0,9.9691,42507.3467759998,4263.9230400000015
4,0,0,0,9.1469,42507.3467759998,4647.202435000001
5,0,0,0,9.4663,42507.3467759998,4490.373999999999
6,0,0,0,9.5692,42507.3467759998,4442.106745000001
7,0,0,0,9.8444,42507.3467759998,4317.931616999999
8,0,0,0,18.895,42507.389952,2249.662369
9,0,0,0,18.9112,42507.39080800006,2247.742016


In [15]:
# Show the problems reported by the slow-link analysis, if any.
problems = slow_link_result.get("problems")
if problems:
    # Read the headers only after confirming that problems exist; doing it
    # before the guard raised AttributeError when "problems" was missing and
    # made the else-branch below unreachable.
    # Only the first two columns are displayed.
    headers = problems.get("headers")[:2]
    problem_table = PrettyTable(headers)
    for row in problems.get("data"):
        # Wrap the two displayed cells at 100 characters.
        row = [fill(str(element), width=100) for element in row[:2]]
        problem_table.add_row(row)
    display(problem_table)
else:
    print("There is no suggestion related to slow link analysis.")

problem,description
slow_rank_analysis,"Computing has some issues in the cluster, because the max difference of Computing time has reached 2411.538ms. Communication has some issues in the cluster, because the max difference of Communication time has reached 3989.506ms."
slow_link_analysis,"SDMA bandwidth(GB/s): The average is 14.332, while the maximum is 18.972GB/s and the minimum is 9.147GB/s. the difference is 9.825GB/s."


## 3) 分布式卡上的kernel算子统计展示

In [66]:
from msprof_analyze.advisor.advisor_backend.interface import Interface
import matplotlib.pyplot as plt
import numpy as np

In [68]:
# NOTE: this rebinds `interface`. Interface now refers to the class imported in
# the preceding cell (msprof_analyze.advisor.advisor_backend.interface), not the
# one used for the slow-rank / slow-link cells above.
interface = Interface(cluster_path)
# Fetch the kernel statistics for the cluster; the next cells display the
# result and export it with .to_csv().
dataset = interface.get_data('cluster', 'kernel')

In [69]:
# Display the kernel statistics as the cell output.
dataset

Unnamed: 0,rank id,Name,Input Shapes,Input Data Types,Output Shapes,Duration(us)_mean,Duration(us)_var,Duration(us)_max,Duration(us)_min,Duration(us)_count,Duration(us)_sum
0,0,Add100,"""4096,10880;4096,10880""",FLOAT;FLOAT,"""4096,10880""",478.210918,237.729252,721.420,449.80,1024,489687.980
1,0,Add102,"""21760;21760""",FLOAT;FLOAT,"""21760""",4.390391,0.011915,4.820,3.98,1024,4495.760
2,0,Add106,"""21760,4096;21760,4096""",FLOAT;FLOAT,"""21760,4096""",933.504395,462.979321,1257.140,927.38,1024,955908.500
3,0,Add111,"""4096,4096;4096,4096""",FLOAT;FLOAT,"""4096,4096""",91.267363,2.158275,97.120,85.12,1024,93457.780
4,0,Add118,"""12288,4096;12288,4096""",FLOAT;FLOAT,"""12288,4096""",526.312012,1462.617511,787.780,424.24,1024,538943.500
...,...,...,...,...,...,...,...,...,...,...,...
2513,15,trans_Cast_12,"""4096,1,1,128""",FLOAT,"""4096,1,1,128""",8.486495,0.060174,9.820,8.20,2048,17380.342
2514,15,trans_Cast_13,"""4096,1,1,128""",FLOAT,"""4096,1,1,128""",10.534564,0.166380,12.900,9.48,2048,21574.787
2515,15,trans_Cast_14,"""4096,1,1,128""",FLOAT,"""4096,1,1,128""",9.784551,0.295368,13.021,8.56,2048,20038.761
2516,15,trans_Cast_15,"""4096,1,1,128""",DT_BF16,"""4096,1,1,128""",8.342211,0.120471,10.220,7.86,2048,17084.848


In [5]:
# Save the kernel statistics to a file for inspection; change the path as needed.
# NOTE: sep='\t' makes the file tab-delimited despite the .csv extension, and
# index=False omits the row index.
dataset.to_csv('cluster_kernel_details.csv', index=False, sep='\t')

## 4) 展示集群流水并行图
使用说明：  
1）. 需要使用Ascend Torch Profiler采集数据，如果需要展示FP和BP需要将activities设置为采集CPU和NPU  
2）. rank_ids为要展示的rank id列表，必选参数, 可视化顺序与rank_ids的顺序一致  
3）. worker_num为多进程数量，可选参数，请根据机器配置调整，默认值为机器可用核心数的一半  
4）. 如果没有采集CPU数据，则展示Stage和Bubble的流水图  
5）. 生成的json文件可以在chrome trace中查看  

示例图：
![pipeline_view](../../profiler/msprof_analyze/test/resource/pipeline_view.png)

In [70]:
import json

# rank_ids (required): the ranks to render; the view follows the order of this
# list, so a range or list comprehension is a convenient way to build it.
# worker_num (optional): number of worker processes; tune it to the machine
# (defaults to half of the available cores).
dataset = interface.get_data(
    "cluster",
    "pipeline",
    rank_ids=list(range(8)),
    worker_num=8,
)

# Dump the pipeline data to JSON so it can be opened in chrome trace.
pipeline_events = dataset.get("data", [])
with open("./pipeline_view.json", "w") as trace_file:
    json.dump(pipeline_events, trace_file)

[INFO] Start to process 8 rank profiling data with 8 workers.
[INFO] Pipline view data process finished, cost 98.48s.
