In [1]:
from glob import glob
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): glob() returns paths in filesystem order, so the position of a
# given run inside profile_results is not guaranteed to be stable across machines.
file_list = glob("data/a100/*/*output_ncu.csv")

# Positional indices of the CSV columns that are kept. The code below relies on
# these including 'ID', 'Kernel Name', 'Section Name', 'Metric Name' and
# 'Metric Value' (presumably an Nsight Compute CSV export -- verify the layout).
used_columns = [0,6,13,14,15,16]
metrics = ['Duration', 'Memory Throughput', 'Compute (SM) Throughput']
section_names = ["GPU Speed Of Light Throughput"]

# (metric name in the CSV, output column name, divisor applied after parsing).
# NOTE(review): the divisors assume throughputs are percentages and Duration is
# in nanoseconds -- TODO confirm against the unit column of the CSV.
metric_columns = [
    ('Memory Throughput', 'memory', 100),
    ('Duration', 'duration', 1e9),
    ('Compute (SM) Throughput', 'compute', 100),
]


def load_kernel_metrics(csv_path):
    """Load one profile CSV into a frame with one row per (ID, Kernel Name).

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID', 'Kernel Name', 'memory', 'duration', 'compute', where the
        last three are the parsed 'Metric Value' of the corresponding metric
        divided by the divisor listed in ``metric_columns``.

    Raises
    ------
    KeyError
        If the kept columns do not contain the names referenced here.
    """
    raw = pd.read_csv(csv_path)
    # Keep only the columns at the configured positions (positions beyond the
    # width of the file are ignored, as before).
    kept_positions = [i for i in used_columns if i < raw.shape[1]]
    table = raw.iloc[:, kept_positions]
    selected = table[table['Metric Name'].isin(metrics) & table['Section Name'].isin(section_names)]

    keys = ['ID', 'Kernel Name']
    result = None
    for metric_name, column, _ in metric_columns:
        # Select each metric by name rather than by row stride, so the result
        # does not depend on the order of the metric rows within a kernel.
        part = (
            selected.loc[selected['Metric Name'] == metric_name, keys + ['Metric Value']]
            .rename(columns={'Metric Value': column})
        )
        result = part if result is None else pd.merge(result, part, on=keys)

    for _, column, divisor in metric_columns:
        # Values may carry thousands separators ("1,234.5"); astype(str) keeps
        # this working when pandas already parsed the column as numeric.
        result[column] = result[column].astype(str).str.replace(',', '').astype(float) / divisor
    return result


# One GroupBy (keyed by kernel name) per profiled CSV file.
profile_results = []
for file in file_list:
    df = load_kernel_metrics(file)
    profile_results.append(df.groupby('Kernel Name'))

In [3]:
# Three kernels with the largest summed 'duration' in profile_results[0].
(
    profile_results[0]['duration']
    .sum()
    .sort_values(ascending=False)
    .iloc[0:3]
)

Kernel Name
fmha_cutlassF_f32_aligned_64x64_rf_sm80(PyTorchMemEffAttention::AttentionKernel<float, cutlass::arch::Sm80, (bool)1, (int)64, (int)64, (int)64, (bool)1, (bool)1>::Params)                                                                                                                                                  0.112427
std::enable_if<!T7, void>::type internal::gemvx::kernel<int, int, float, float, float, float, (bool)0, (bool)1, (bool)1, (bool)0, (int)7, (bool)0, cublasGemvParamsEx<int, cublasGemvTensorStridedBatched<const float>, cublasGemvTensorStridedBatched<const float>, cublasGemvTensorStridedBatched<float>, float>>(T13)    0.020275
ampere_sgemm_32x128_tn                                                                                                                                                                                                                                                                                                      0.016155
Name: duration, dtype: float64

In [4]:
# Three kernels with the largest summed 'duration' in profile_results[1].
(
    profile_results[1]['duration']
    .sum()
    .sort_values(ascending=False)
    .iloc[0:3]
)

Kernel Name
void at::native::<unnamed>::RowwiseMomentsCUDAKernel<float>(long, T1, const T1 *, T1 *, T1 *)                                                                 0.073837
sm80_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize256x128x32_stage3_warpsize4x2x1_g1_tensor16x8x8_execute_kernel_cudnn_infer            0.049340
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize128x128x16_stage4_warpsize2x2x1_g1_tensor16x8x8_execute_kernel_cudnn_infer    0.036558
Name: duration, dtype: float64

In [5]:
# Three kernels with the largest summed 'duration' in profile_results[2].
(
    profile_results[2]['duration']
    .sum()
    .sort_values(ascending=False)
    .iloc[0:3]
)

Kernel Name
void at::native::<unnamed>::RowwiseMomentsCUDAKernel<float>(long, T1, const T1 *, T1 *, T1 *)                                                         0.073906
sm80_xmma_fprop_implicit_gemm_tf32f32_tf32f32_f32_nhwckrsc_nchw_tilesize256x128x32_stage3_warpsize4x2x1_g1_tensor16x8x8_execute_kernel_cudnn_infer    0.049593
ampere_fp16_s16816gemm_fp16_128x256_ldg8_relu_f2f_stages_64x3_tn                                                                                      0.037065
Name: duration, dtype: float64

In [6]:
# Three kernels with the largest summed 'duration' in profile_results[3].
(
    profile_results[3]['duration']
    .sum()
    .sort_values(ascending=False)
    .iloc[0:3]
)

Kernel Name
ampere_sgemm_128x32_tn                                                                                                                                                                                                                           0.005882
ampere_sgemm_128x32_sliced1x4_tn                                                                                                                                                                                                                 0.001073
void splitKreduce_kernel<(int)32, (int)16, int, float, float, float, float, (bool)1, (bool)1, (bool)0>(cublasSplitKParams<T6>, const T4 *, const T5 *, T5 *, const T6 *, const T6 *, const T7 *, const T4 *, T7 *, void *, long, T6 *, int *)    0.000726
Name: duration, dtype: float64