In [20]:
import pandas as pd
# Let pandas render up to 90 rows before it truncates a displayed frame.
pd.options.display.max_rows = 90

# CUDA kernel durations 1 node vs 8 nodes

There is no difference in the kernels used (or in their durations) between the 1-node and 8-node experiments, for both dali-gpu and dali-cpu-to-gpu.

In [21]:
def kernel_durations(file, direction='fprop'):
    """Total the 'Sil(ns)' column per kernel for one pass direction.

    Parameters
    ----------
    file : path of the parquet file handed to ``pd.read_parquet``.
    direction : 'fprop' or 'bprop'; only rows whose 'Direction' column
        equals this value are kept.

    Returns
    -------
    DataFrame indexed by 'Kernel' with a single 'Sil(ns)' column holding the
    per-kernel sum multiplied by 1E-9, ordered from largest to smallest.
    The column keeps its 'Sil(ns)' label even though the values are scaled
    (presumably nanoseconds converted to seconds -- confirm against the
    profiler export).

    Raises
    ------
    AssertionError if ``direction`` is not one of the two accepted values.
    """
    assert direction in ['fprop', 'bprop']
    # NOTE(review): `index=0` is forwarded to the parquet engine as an extra
    # keyword; confirm the engine in use accepts it.
    records = pd.read_parquet(file, index=0)

    matches_direction = records['Direction'] == direction
    selected = records.loc[matches_direction, ['Kernel', 'Sil(ns)']]

    totals = selected.groupby('Kernel').sum()
    ranked = totals.sort_values(by='Sil(ns)', ascending=False)
    return ranked * 1E-9

## dali-cpu-to-gpu

In [22]:
# Per-kernel totals for the dali-cpu-to-gpu runs, using kernel_durations'
# default direction.
filename = 'data/pcm/run_0_config_1_2_ib_nvprof_2_resnet50_dali-cpu-to-gpu_32_fp16_fp32_ret_0_0.gzip'
node1 = kernel_durations(filename)
filename = 'data/pcm/run_0_config_8_2_ib_nvprof_2_resnet50_dali-cpu-to-gpu_32_fp16_fp32_ret_0_0.gzip'
node8 = kernel_durations(filename)

# Default (inner) merge on the kernel name: a kernel that appears in only one
# of the two runs is dropped from the comparison.
duration_kernels = pd.merge(
    node1,
    node8,
    left_on='Kernel',
    right_on='Kernel',
    suffixes=('_1node', '_8node'),
)
# rename() returns a new frame that is only displayed as the cell output;
# duration_kernels itself keeps the suffixed column names.
duration_kernels.rename(
    columns={
        'Sil(ns)_1node': '1 node',
        'Sil(ns)_8node': '8 nodes',
    }
)

Unnamed: 0_level_0,1 node,8 nodes
Kernel,Unnamed: 1_level_1,Unnamed: 2_level_1
cudnn::detail::implicit_convolve_sgemm,6.037096,5.98701
elementwise_kernel,3.279977,3.276931
cudnn::detail::wgrad_alg0_engine,3.183431,3.465039
sgemm_sm35_ldg_nn_64x16x64x16x16,2.57824,2.552302
sgemm_sm35_ldg_nt_64x16x64x16x16,2.168583,2.145964
sgemm_largek_lds64,1.732891,1.710166
cudnn::detail::dgrad_engine,1.574014,1.562378
cudnn::detail::dgrad_alg1_engine,1.228442,1.219383
cudnn::detail::bn_bw_1C11_kernel_new,1.165348,1.162638
cudnn::detail::bn_fw_tr_1C11_kernel_NCHW,1.038966,1.038086


## dali-gpu

In [23]:
# Per-kernel totals for the dali-gpu runs, using kernel_durations' default
# direction.
filename = 'data/pcm/run_0_config_1_2_ib_nvprof_2_resnet50_dali-gpu_32_fp16_fp32_ret_0_0.gzip'
node1 = kernel_durations(filename)
filename = 'data/pcm/run_0_config_8_2_ib_nvprof_2_resnet50_dali-gpu_32_fp16_fp32_ret_0_0.gzip'
node8 = kernel_durations(filename)

# Default (inner) merge on the kernel name: a kernel that appears in only one
# of the two runs is dropped from the comparison.
duration_kernels = pd.merge(
    node1,
    node8,
    left_on='Kernel',
    right_on='Kernel',
    suffixes=('_1node', '_8node'),
)
# rename() returns a new frame that is only displayed as the cell output;
# duration_kernels itself keeps the suffixed column names.
duration_kernels.rename(
    columns={
        'Sil(ns)_1node': '1 node',
        'Sil(ns)_8node': '8 nodes',
    }
)

Unnamed: 0_level_0,1 node,8 nodes
Kernel,Unnamed: 1_level_1,Unnamed: 2_level_1
cudnn::detail::implicit_convolve_sgemm,6.036142,5.755177
elementwise_kernel,3.283388,3.283576
cudnn::detail::wgrad_alg0_engine,3.184476,3.168416
sgemm_sm35_ldg_nn_64x16x64x16x16,2.582502,2.550685
sgemm_sm35_ldg_nt_64x16x64x16x16,2.173917,2.140859
sgemm_largek_lds64,1.725359,1.706284
cudnn::detail::dgrad_engine,1.573836,1.789154
cudnn::detail::dgrad_alg1_engine,1.228878,1.032957
cudnn::detail::bn_bw_1C11_kernel_new,1.163904,1.162276
cudnn::detail::bn_fw_tr_1C11_kernel_NCHW,1.039746,1.041324
