# Reading, processing and comparing data from CSV files

In [1]:
%matplotlib inline
import re
import numpy as np
import matplotlib
#matplotlib.use("Agg")
import matplotlib.pyplot as plt
import os.path
import datetime
from cycler import cycler
import pandas as pd

In [2]:
# Column positions. None of the visible cells reference them; presumably they
# are zero-based column indexes of the nvprof trace CSV (TODO confirm).
time_field_index = 0
duration_field_index = 1
SSMem_field_index = 9
DSMem_field_index = 10
size_field_index = 11
throughput_field_index = 12
src_field_index = 13
dst_field_index = 14
context_index = 16
stream_index = 17
name_field_index = 18

# Raw strings keep the backslashes literal for the regex engine instead of
# relying on Python passing unknown string escapes such as "\[" through.
# title_pattern: any text that starts with "Start" or "s".
title_pattern = re.compile(r"^(Start|s).*")
# cuda_pattern: a bracketed "[CUDA ...]" fragment anywhere in the text.
cuda_pattern = re.compile(r"\[CUDA .*\]")

# Input files of the Chainer run.
filename_smi_chainer = "Chainer/nvidia-smi-cifar_M60_combined.csv"
filename_nvprof_chainer = "Chainer/nvprof-trace-cifar_M60_combined-5766.csv"

# Input files of the HPCG run.
filename_nvprof_flops_hpcg = "HPCG/hpcg_m60_flops.csv"
filename_smi_hpcg = "HPCG/nvidia-smi-hpcg.csv"
filename_nvprof_hpcg = "HPCG/nvprof-trace-hpcg.csv"


# Passed to pd.read_csv(nrows=...); None means "read every row".
maxrows = None


In [3]:
# Profile to analyse in the cells below.
file1 = filename_nvprof_flops_hpcg
# One pre-joined argument prints the same text under Python 2 and Python 3
# (the old `print "Reading",file1` statement is a syntax error on Python 3).
print("Reading " + file1)

# columns = [time_field_index, duration_field_index,size_field_index, throughput_field_index,
#            src_field_index, dst_field_index,
#            context_index, stream_index,
#            name_field_index]


Reading HPCG/hpcg_m60_flops.csv


In [4]:
# Single-argument print() calls behave identically on Python 2 and Python 3.
print(file1)
# File lines 0-4 are skipped; the first remaining line becomes the header row.
df = pd.read_csv(file1, header=0, skiprows=[0, 1, 2, 3, 4], nrows=maxrows)
print(df.shape)

HPCG/hpcg_m60_flops.csv
(108, 8)


In [5]:
# Preview the first ten rows as read from the file.
df.head(10)

Unnamed: 0,Device,Kernel,Invocations,Metric Name,Metric Description,Min,Max,Avg
0,Tesla M60 (0),"sort_matrix_kernel(double*, int*, int)",4,flop_dp_efficiency,FLOP Efficiency(Peak Double),0.000000%,0.000000%,0.000000%
1,Tesla M60 (0),"sort_matrix_kernel(double*, int*, int)",4,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.000000%,0.000000%,0.000000%
2,Tesla M60 (0),"void dot_kernel<double, double, double, int=12...",361,flop_dp_efficiency,FLOP Efficiency(Peak Double),9.694148%,20.679366%,10.337052%
3,Tesla M60 (0),"void dot_kernel<double, double, double, int=12...",361,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.000000%,0.000000%,0.000000%
4,Tesla M60 (0),void thrust::cuda_cub::core::_kernel_agent<thr...,32,flop_dp_efficiency,FLOP Efficiency(Peak Double),0.000000%,0.000000%,0.000000%
5,Tesla M60 (0),void thrust::cuda_cub::core::_kernel_agent<thr...,32,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.000000%,0.000000%,0.000000%
6,Tesla M60 (0),void thrust::cuda_cub::cub::RadixSortScanBinsK...,15,flop_dp_efficiency,FLOP Efficiency(Peak Double),0.000000%,0.000000%,0.000000%
7,Tesla M60 (0),void thrust::cuda_cub::cub::RadixSortScanBinsK...,15,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.000000%,0.000000%,0.000000%
8,Tesla M60 (0),"void reduce_1Block_kernel<double, double, doub...",361,flop_dp_efficiency,FLOP Efficiency(Peak Double),0.034196%,0.042483%,0.039521%
9,Tesla M60 (0),"void reduce_1Block_kernel<double, double, doub...",361,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.000000%,0.000000%,0.000000%


## Parse data to float

In [7]:
# Optional sign, then digits with an optional fractional part (or a bare
# fraction such as ".5"), then an optional exponent. At least one digit is
# required, so a run of dots alone can no longer match.
pat = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")


def parseFloat(s):
    """Extract the first number embedded in a string and return it as a float.

    Args:
        s: Value to convert. Anything that is not a ``str`` (for example a
            value pandas already parsed as a number, or NaN) is returned
            unchanged.

    Returns:
        The first number found in ``s`` as a float (``"9.694148%"`` gives
        ``9.694148``), ``None`` when ``s`` contains no number, or ``s``
        itself when it is not a string.
    """
    if not isinstance(s, str):
        return s
    match = pat.search(s)
    if match is None:
        return None
    return float(match.group())

In [8]:
# Run parseFloat over every cell of the three value columns, replacing the
# percentage strings with numbers.
value_columns = ['Min', 'Max', 'Avg']
df[value_columns] = df[value_columns].applymap(parseFloat)

In [9]:
# Preview the first ten rows after the numeric conversion.
df.head(10)

Unnamed: 0,Device,Kernel,Invocations,Metric Name,Metric Description,Min,Max,Avg
0,Tesla M60 (0),"sort_matrix_kernel(double*, int*, int)",4,flop_dp_efficiency,FLOP Efficiency(Peak Double),0.0,0.0,0.0
1,Tesla M60 (0),"sort_matrix_kernel(double*, int*, int)",4,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.0,0.0,0.0
2,Tesla M60 (0),"void dot_kernel<double, double, double, int=12...",361,flop_dp_efficiency,FLOP Efficiency(Peak Double),9.694148,20.679366,10.337052
3,Tesla M60 (0),"void dot_kernel<double, double, double, int=12...",361,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.0,0.0,0.0
4,Tesla M60 (0),void thrust::cuda_cub::core::_kernel_agent<thr...,32,flop_dp_efficiency,FLOP Efficiency(Peak Double),0.0,0.0,0.0
5,Tesla M60 (0),void thrust::cuda_cub::core::_kernel_agent<thr...,32,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.0,0.0,0.0
6,Tesla M60 (0),void thrust::cuda_cub::cub::RadixSortScanBinsK...,15,flop_dp_efficiency,FLOP Efficiency(Peak Double),0.0,0.0,0.0
7,Tesla M60 (0),void thrust::cuda_cub::cub::RadixSortScanBinsK...,15,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.0,0.0,0.0
8,Tesla M60 (0),"void reduce_1Block_kernel<double, double, doub...",361,flop_dp_efficiency,FLOP Efficiency(Peak Double),0.034196,0.042483,0.039521
9,Tesla M60 (0),"void reduce_1Block_kernel<double, double, doub...",361,flop_sp_efficiency,FLOP Efficiency(Peak Single),0.0,0.0,0.0


# Aggregate values for min, max and avg over SP and DP

## Remove unused columns

In [21]:
# Drop the descriptive columns that the aggregation below does not use.
unused_columns = ['Device', 'Kernel', 'Invocations', 'Metric Description']
df_parts = df.drop(columns=unused_columns)
df_parts.head(10)

Unnamed: 0,Metric Name,Min,Max,Avg
0,flop_dp_efficiency,0.0,0.0,0.0
1,flop_sp_efficiency,0.0,0.0,0.0
2,flop_dp_efficiency,9.694148,20.679366,10.337052
3,flop_sp_efficiency,0.0,0.0,0.0
4,flop_dp_efficiency,0.0,0.0,0.0
5,flop_sp_efficiency,0.0,0.0,0.0
6,flop_dp_efficiency,0.0,0.0,0.0
7,flop_sp_efficiency,0.0,0.0,0.0
8,flop_dp_efficiency,0.034196,0.042483,0.039521
9,flop_sp_efficiency,0.0,0.0,0.0


## Remove rows containing zeros (any of Min, Max or Avg equal to 0)

In [29]:
# Keep a row only when Max, Avg and Min are all non-zero, i.e. a row is
# dropped as soon as any one of the three values is zero.
# NOTE(review): if only rows where all three values are zero should be
# dropped, the masks must be combined with | instead of & - confirm intent.
max_nonzero = df_parts['Max'] != 0
avg_nonzero = df_parts['Avg'] != 0
min_nonzero = df_parts['Min'] != 0
df_nonzero = df_parts[max_nonzero & avg_nonzero & min_nonzero]

In [32]:
# Display the rows that passed the zero filter.
df_nonzero

Unnamed: 0,Metric Name,Min,Max,Avg
2,flop_dp_efficiency,9.694148,20.679366,10.337052
8,flop_dp_efficiency,0.034196,0.042483,0.039521
10,flop_dp_efficiency,6.658286,15.181595,11.678461
12,flop_dp_efficiency,10.437802,15.808632,10.638688
18,flop_dp_efficiency,1.208751,11.639881,7.765638
20,flop_dp_efficiency,1.176237,11.671563,7.754313
22,flop_dp_efficiency,6.660512,11.669874,9.920701
26,flop_dp_efficiency,16.674279,17.094977,16.898752
30,flop_dp_efficiency,1.509759,11.66038,8.067769
34,flop_dp_efficiency,1.639263,11.207828,7.925831


# Pivot table aggregating values for Min, Max and Avg over DP and SP rows 

In [34]:
# One output row per metric name: smallest Min, largest Max and the mean of
# the per-row Avg values. The mean is unweighted - the Invocations column was
# dropped earlier, so every remaining row counts equally.
aggregators = {'Min': np.min, 'Max': np.max, 'Avg': np.mean}
df_piv = pd.pivot_table(
    df_nonzero,
    index='Metric Name',
    values=['Min', 'Max', 'Avg'],
    aggfunc=aggregators,
)
df_piv

Unnamed: 0_level_0,Avg,Max,Min
Metric Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
flop_dp_efficiency,7.840107,20.679366,0.013125


We see that:

<div style="display:block;margin:50px;font-size:200%;font-family:cursive;">
    Maximum DP efficiency for HPCG is only about 20%.
</div>

# Remove rows with no Throughput

In [None]:
# Keep only the rows that carry a throughput value.
# NOTE(review): the flops CSV loaded earlier in this notebook has no
# "Throughput" column; presumably df has to be re-read from an nvprof trace
# file before this cell can run - confirm.
has_throughput = df["Throughput"].notnull()
df = df[has_throughput]
df.head(5)

In [None]:
# List the distinct operation names; single-argument print() prints the same
# text on Python 2 and Python 3.
print(df.Name.unique())

In [None]:
# Label = operation name, a space, then source and destination memory types.
# Only a missing DstMemType is replaced by an empty string; a missing Name or
# SrcMemType still yields a missing FullName.
mem_types = df['SrcMemType'] + df['DstMemType'].fillna("")
df['FullName'] = df['Name'] + " " + mem_types

In [None]:
# Preview the first three rows with the new FullName column.
df.head(3)

In [None]:
def mergeColumnNames(df_org):
    """Turn every distinct "FullName" value into a column of its own.

    Args:
        df_org: DataFrame with "Start", "FullName" and "Throughput" columns.

    Returns:
        A new DataFrame indexed by "Start" with one column per distinct
        "FullName", holding the matching "Throughput" values. Cells without a
        value are filled with 0. ``df_org`` itself is not modified.

    Raises:
        ValueError: raised by ``DataFrame.pivot`` when the same
            ("Start", "FullName") pair occurs more than once.
    """
    wide = df_org.pivot(index='Start', columns='FullName', values='Throughput')
    return wide.fillna(0)

In [None]:
# Wide table: one row per Start value, one throughput column per FullName.
df_throughput = mergeColumnNames(df)

In [None]:
# Shows four rows near the end: the slice stops at -1, so the very last row
# is excluded.
# NOTE(review): use .iloc[-5:] if the final row should be shown as well.
df_throughput.iloc[-5:-1]

In [None]:
# The negative look-ahead is tested at every character, so the pattern only
# matches labels that do not contain "memset" anywhere.
without_memset = "^((?!memset).)*$"
df_memcpy = df_throughput.filter(regex=without_memset)
df_memcpy.head(3)

In [None]:
def appendMaxValues2ColumnNames(df,series):
    """Rename every column to "<series><short name> <column maximum>".

    The short name is the original column name with "[CUDA memcpy " and "]"
    removed; the maximum is formatted with three decimals.

    Args:
        df: DataFrame whose column names are strings. Its columns are
            renamed in place.
        series: Prefix put in front of every new column name.

    Returns:
        The same ``df`` object, now carrying the new column names.
    """
    def relabel(position, original):
        short = original.replace("[CUDA memcpy ", "").replace("]", "")
        # Positional access, so duplicated column names are still handled
        # one column at a time.
        peak = df.iloc[:, position].max()
        return series + short + " " + '{:.3f}'.format(peak)

    df.columns = [relabel(i, label) for i, label in enumerate(df.columns)]
    return df

In [None]:
# Put each column's maximum into its label; the empty string means no prefix.
df_memcpy = appendMaxValues2ColumnNames(df_memcpy,"")

In [None]:
# Default figure size (width, height) for the figures created below.
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Line plot of every memcpy column, with light dotted grid lines on both axes.
df_memcpy.plot()
ax = plt.gca()
grid_style = dict(color="#e0e0e0", linestyle=":", linewidth=0.5)
for axis in (ax.xaxis, ax.yaxis):
    axis.grid(**grid_style)
#ax.xaxis.set_major_locator(plt.MaxNLocator(24))
ax.set_xlabel('Start (s)')
ax.set_ylabel('Throughput (GB/s)')
plt.show()

In [None]:
# Horizontal box plot of every column, log-scaled value axis, grid on.
# NOTE(review): df_full is not defined in any visible cell of this notebook -
# confirm where it comes from before running this cell.
df_full.plot.box(logx = True, grid = True, vert=False)

### Without memset

In [None]:
# Collect the non-zero values of every memcpy column (x_arr) together with
# the matching column labels (names), printing one summary line per column.
x_arr = []
names = []
for column in df_memcpy:
    x = df_memcpy[column]
    x = x[x != 0]  # keep only the non-zero entries
    arr = x.values
    x_arr.append(arr)
    names.append(column)
    # A column with no non-zero entry has no first value; report NaN instead
    # of failing with an IndexError.
    first = x.iloc[0] if len(x) else float('nan')
    # One print() call emits the same line the three comma-terminated Python 2
    # print statements produced. "!s" converts the shape tuple to text first,
    # because tuples reject the "8.8" format spec on Python 3.
    print('{:45.43} {!s:8.8} st.: {:<7.3f} elm.: {:<6d} {:3d}'.format(
        column, x.shape, first, len(arr), len(x_arr)))

In [None]:
# Taller default figure size (width, height) for the box plot below.
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# One box per memcpy column. notch=0 and sym='' are the values the original
# call passed positionally as the second and third arguments.
plt.figure()
plt.boxplot(x_arr, notch=0, sym='', labels=names)
ax = plt.gca()
#ax.set_yscale("log")
ax.yaxis.grid(color="#e0e0e0", linestyle=":", linewidth=0.5)
# Column labels are long, so draw them vertically.
plt.xticks(rotation=90)
#plt.savefig("memcpy_box_.pdf", bbox_inches='tight')
plt.show()

### Split df_throughput into several dataframes for plotting

In [None]:
# Columns whose label contains "DtoD".
# NOTE(review): df_tf_throughput is not defined in any visible cell - confirm
# where it is created.
dtod_columns = ".*DtoD.*"
df_DD_TF = df_tf_throughput.filter(regex=dtod_columns)
df_DD_TF.head(3)

In [None]:
# Columns whose label contains "DtoD".
# NOTE(review): df_hpcg_throughput is not defined in any visible cell -
# confirm where it is created.
dtod_columns = ".*DtoD.*"
df_DD_HPCG = df_hpcg_throughput.filter(regex=dtod_columns)
df_DD_HPCG.head(3)

## TF vs HPCG DtoD memcpy

In [None]:
# Two stacked step plots: df_DD_TF on the upper axes, df_DD_HPCG on the lower.
fig, axarr = plt.subplots(2)
for frame, axis in zip((df_DD_TF, df_DD_HPCG), axarr):
    frame.plot(drawstyle="steps-post", ax=axis)
    axis.legend()
    axis.xaxis.grid(color="#e0e0e0", linestyle=":", linewidth=0.5)
    # Allow at most 24 major tick intervals on the x axis.
    axis.xaxis.set_major_locator(plt.MaxNLocator(24))
fig.show()

In [None]:
# Columns whose label contains "HtoD" or "DtoH".
# NOTE(review): df_full is not defined in any visible cell - confirm where it
# is created.
host_device_columns = ".*(HtoD|DtoH).*"
df_DH = df_full.filter(regex=host_device_columns)
df_DH.head(3)

In [None]:
# Split the columns by label prefix: "TF..." versus "HPCG...".
df_DH_TF, df_DH_HPCG = (
    df_DH.filter(regex=prefix) for prefix in ("^TF.*", "^HPCG.*")
)
df_DH_TF.head(3)

In [None]:
# Two stacked step plots: df_DH_TF on the upper axes, df_DH_HPCG on the lower.
fig, axarr = plt.subplots(2)
for frame, axis in zip((df_DH_TF, df_DH_HPCG), axarr):
    frame.plot(drawstyle="steps-post", ax=axis)
    axis.legend()
    axis.xaxis.grid(color="#e0e0e0", linestyle=":", linewidth=0.5)
    # Allow at most 24 major tick intervals on the x axis.
    axis.xaxis.set_major_locator(plt.MaxNLocator(24))
fig.show()

In [None]:
# Columns whose label contains "DtoH", with missing values replaced by 0.
dtoh_only = df_DH_TF.filter(regex=".*DtoH.*")
df_D2H_TF = dtoh_only.fillna(0)
df_D2H_TF.head(3)

In [None]:
# Keep the rows whose first column is non-zero. The column is selected by
# position because its label embeds a maximum value that differs per data set.
#df_D2H_TF_nonzero = df_D2H_TF[df_D2H_TF["TF[CUDA memcpy DtoH] DevicePinned 0.006"] != 0]
first_column = df_D2H_TF.iloc[:, 0]
df_D2H_TF_nonzero = df_D2H_TF[first_column != 0]
df_D2H_TF_nonzero.head(5)

In [None]:
# Line plot of every column of df_D2H_TF, zero rows included.
df_D2H_TF.plot()

# Kernels

## Remove memcpy operations from profiles

In [None]:
# Rows whose Name contains "[CUDA" are dropped, leaving the kernel rows.
# The raw string keeps "\[" as a regex escape rather than an invalid Python
# string escape. na=False makes a missing Name count as "no match", so the
# mask stays boolean and can be inverted with ~; such rows are kept.
cuda_marker = r'\[CUDA'
df_tf_kernel = df_tf.loc[~df_tf['Name'].str.contains(cuda_marker, na=False)]
df_hpcg_kernel = df_hpcg.loc[~df_hpcg['Name'].str.contains(cuda_marker, na=False)]
# Only the HPCG frame is re-indexed by start time here.
df_hpcg_kernel = df_hpcg_kernel.set_index('Start')
df_hpcg_kernel.iloc[:10]