# Chainer CIFAR100 GPU and API trace

### Profiled with
```
22Feb1713peter@mouse:~$ ./profile_cifar.sh --print-gpu-trace --print-api-trace --csv --log-file chainer_api_gpu_trace.csv
```

batch size 128, epoch 1

[nvprof docs](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#gpu-trace-and-api-trace-modes)

In [1]:
%matplotlib inline
import re
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os.path
import datetime
from cycler import cycler
import pandas as pd

In [5]:
# List the CSV files present in the current working directory.
!ls *.csv

K80_cifar_b128_metrics.csv
chainer_api_gpu_trace.csv
chainer_mouse_flops_nvvp_export.csv
chainer_trace_mouse_flops.csv
kernel_export.csv
mouse_cifar_b128_metrics.csv
nvidia-smi-chainer_mouse_combined.csv
nvidia-smi-chainer_mouse_nvsmi_25.csv
nvidia-smi-chainer_mouse_nvsmi_50.csv
nvprof-trace-chainer_mouse_combined-29174.csv


In [11]:
filename = "chainer_api_gpu_trace.csv"
# Skip the first three lines of the log; the two rows that follow (column
# name, unit) are read as a two-level column header.
nvprof = pd.read_csv(
    filename,
    skiprows=[0, 1, 2],
    header=[0, 1],
)
# Preview the first five records.
nvprof.head(5)

Unnamed: 0_level_0,Start,Duration,Grid X,Grid Y,Grid Z,Block X,Block Y,Block Z,Registers Per Thread,Static SMem,Dynamic SMem,Size,Throughput,SrcMemType,DstMemType,Device,Context,Stream,Name
Unnamed: 0_level_1,s,ms,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,KB,KB,MB,GB/s,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0.112938,0.001085,,,,,,,,,,,,,,,,,cuDeviceGetCount
1,0.112939,0.000115,,,,,,,,,,,,,,,,,cuDeviceGetCount
2,0.113051,0.000452,,,,,,,,,,,,,,,,,cuDeviceGet
3,0.113052,0.000329,,,,,,,,,,,,,,,,,cuDeviceGetAttribute
4,0.113069,0.000153,,,,,,,,,,,,,,,,,cuDeviceGetAttribute


### Rename columns: flatten the two-level MultiIndex header into single-level names

In [39]:
# Inspect both levels of the column MultiIndex and the integer codes that map
# each column position to an entry of those levels.
# `MultiIndex.labels` was renamed to `codes` in pandas 0.24 and removed in
# pandas 1.0, so use whichever attribute the installed pandas provides.
header = nvprof.columns
level_codes = header.codes if hasattr(header, "codes") else header.labels
print(header.levels[0])
print(header.levels[1])
print(level_codes[0])
print(level_codes[1])

Index([u'Block X', u'Block Y', u'Block Z', u'Context', u'Device',
       u'DstMemType', u'Duration', u'Dynamic SMem', u'Grid X', u'Grid Y',
       u'Grid Z', u'Name', u'Registers Per Thread', u'Size', u'SrcMemType',
       u'Start', u'Static SMem', u'Stream', u'Throughput'],
      dtype='object')
Index([u'GB/s', u'KB', u'MB', u'Unnamed: 13_level_1', u'Unnamed: 14_level_1',
       u'Unnamed: 15_level_1', u'Unnamed: 16_level_1', u'Unnamed: 17_level_1',
       u'Unnamed: 18_level_1', u'Unnamed: 2_level_1', u'Unnamed: 3_level_1',
       u'Unnamed: 4_level_1', u'Unnamed: 5_level_1', u'Unnamed: 6_level_1',
       u'Unnamed: 7_level_1', u'Unnamed: 8_level_1', u'ms', u's'],
      dtype='object')
FrozenNDArray([15, 6, 8, 9, 10, 0, 1, 2, 12, 16, 7, 13, 18, 14, 5, 4, 3, 17, 11], dtype='int8')
FrozenNDArray([17, 16, 9, 10, 11, 12, 13, 14, 15, 1, 1, 2, 0, 3, 4, 5, 6, 7, 8], dtype='int8')


In [45]:
# Flatten the two-level header into plain strings: "Name (unit)" when the unit
# row holds a real value, just "Name" when pandas filled the unit row with an
# "Unnamed: ..." placeholder.
# Iterating a MultiIndex yields one (name, unit) tuple per column, in column
# order, so there is no need to go through `MultiIndex.labels` (removed in
# pandas 1.0) to pair the two levels up.
columns = [
    name if "Unnamed" in unit else name + " (" + unit + ")"
    for name, unit in nvprof.columns
]
print(columns)

['Start (s)', 'Duration (ms)', 'Grid X', 'Grid Y', 'Grid Z', 'Block X', 'Block Y', 'Block Z', 'Registers Per Thread', 'Static SMem (KB)', 'Dynamic SMem (KB)', 'Size (MB)', 'Throughput (GB/s)', 'SrcMemType', 'DstMemType', 'Device', 'Context', 'Stream', 'Name']


In [46]:
# Replace the two-level column header with the single-level names in `columns`.
nvprof.columns = columns

In [47]:
# Preview the first five records under the current column names.
nvprof.head(5)

Unnamed: 0,Start (s),Duration (ms),Grid X,Grid Y,Grid Z,Block X,Block Y,Block Z,Registers Per Thread,Static SMem (KB),Dynamic SMem (KB),Size (MB),Throughput (GB/s),SrcMemType,DstMemType,Device,Context,Stream,Name
0,0.112938,0.001085,,,,,,,,,,,,,,,,,cuDeviceGetCount
1,0.112939,0.000115,,,,,,,,,,,,,,,,,cuDeviceGetCount
2,0.113051,0.000452,,,,,,,,,,,,,,,,,cuDeviceGet
3,0.113052,0.000329,,,,,,,,,,,,,,,,,cuDeviceGetAttribute
4,0.113069,0.000153,,,,,,,,,,,,,,,,,cuDeviceGetAttribute


In [48]:
# Display the "Name" column of the trace table.
nvprof["Name"]

0               cuDeviceGetCount
1               cuDeviceGetCount
2                    cuDeviceGet
3           cuDeviceGetAttribute
4           cuDeviceGetAttribute
5           cuDeviceGetAttribute
6               cuDeviceGetCount
7                    cuDeviceGet
8                cuDeviceGetName
9               cuDeviceTotalMem
10          cuDeviceGetAttribute
11          cuDeviceGetAttribute
12          cuDeviceGetAttribute
13          cuDeviceGetAttribute
14          cuDeviceGetAttribute
15          cuDeviceGetAttribute
16          cuDeviceGetAttribute
17          cuDeviceGetAttribute
18          cuDeviceGetAttribute
19          cuDeviceGetAttribute
20          cuDeviceGetAttribute
21          cuDeviceGetAttribute
22          cuDeviceGetAttribute
23          cuDeviceGetAttribute
24          cuDeviceGetAttribute
25          cuDeviceGetAttribute
26          cuDeviceGetAttribute
27          cuDeviceGetAttribute
28          cuDeviceGetAttribute
29          cuDeviceGetAttribute
          

In [54]:
# Keep only the records whose name mentions "wgrad_alg0_engine".
# - regex=False: the keyword is a literal substring, not a pattern.
# - na=False: a missing name counts as "no match" instead of putting NaN in
#   the boolean mask (which pandas refuses to index with).
# reset_index() is chained rather than applied in place so that the filtered
# slice is never mutated; the original row numbers are preserved in the new
# "index" column.
wgrad = nvprof[
    nvprof["Name"].str.contains("wgrad_alg0_engine", regex=False, na=False)
].reset_index()
print(wgrad["Name"].unique())
wgrad.head(10)

[ 'cudaLaunch (void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>*,'
 'void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int, int, int) [3561]'
 'cudaLaunch (void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, int=3, bool=1, int=512>*,'
 ...,
 'void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detai

Unnamed: 0,index,Start (s),Duration (ms),Grid X,Grid Y,Grid Z,Block X,Block Y,Block Z,Registers Per Thread,Static SMem (KB),Dynamic SMem (KB),Size (MB),Throughput (GB/s),SrcMemType,DstMemType,Device,Context,Stream,Name
0,3854,1.443855,0.033852,,,,,,,,,,,,,,,,,cudaLaunch (void cudnn::detail::wgrad_alg0_eng...
1,3957,1.444518,0.010972,,,,,,,,,,,,,,,,,cudaLaunch (void cudnn::detail::wgrad_alg0_eng...
2,4062,1.445149,0.010915,,,,,,,,,,,,,,,,,cudaLaunch (void cudnn::detail::wgrad_alg0_eng...
3,4267,1.446409,38.672432,72.0,2.0,128.0,8.0,32.0,1.0,105.0,10.25,0.0,,,,,Quadro P2000 (0),1.0,7.0,"void cudnn::detail::wgrad_alg0_engine<float, i..."
4,5065,1.453125,0.022996,,,,,,,,,,,,,,,,,cudaLaunch (void cudnn::detail::wgrad_alg0_eng...
5,5083,1.486445,40.110806,72.0,2.0,128.0,8.0,32.0,1.0,105.0,10.25,0.0,,,,,Quadro P2000 (0),1.0,7.0,"void cudnn::detail::wgrad_alg0_engine<float, i..."
6,5091,1.527918,39.381715,72.0,2.0,128.0,8.0,32.0,1.0,105.0,10.25,0.0,,,,,Quadro P2000 (0),1.0,7.0,"void cudnn::detail::wgrad_alg0_engine<float, i..."
7,5182,1.650861,1.029093,1.0,2.0,256.0,8.0,8.0,1.0,79.0,3.25,0.0,,,,,Quadro P2000 (0),1.0,7.0,"void cudnn::detail::wgrad_alg0_engine<float, i..."
8,7609,1.671483,0.020013,,,,,,,,,,,,,,,,,cudaLaunch (void cudnn::detail::wgrad_alg0_eng...
9,7712,1.672116,0.01076,,,,,,,,,,,,,,,,,cudaLaunch (void cudnn::detail::wgrad_alg0_eng...


In [58]:
# Full, untruncated "Name" of row 2 of the wgrad selection.
wgrad.at[2, "Name"]

'cudaLaunch (void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>*,'

In [59]:
# Full, untruncated "Name" of row 3 of the wgrad selection.
wgrad.at[3, "Name"]

'void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, int, int, int) [3561]'

In [87]:
# Search every text column of a dataframe for a keyword
def search_keyword(df, word):
    """Return the rows of ``df`` in which any text column contains ``word``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search. Only columns of ``object`` dtype are inspected.
    word : str
        Keyword to look for. It is matched as a literal substring
        (``regex=False``), so regex metacharacters have no special meaning.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, in their original order and each at most
        once. The result is empty when nothing matches or when ``df`` has no
        ``object`` columns.
    """
    # Use the builtin `object`: the `np.object` alias was removed from NumPy.
    text_columns = df.select_dtypes(include=[object]).columns
    # One flag per row; a row is kept as soon as any text column matches.
    row_hits = np.zeros(len(df), dtype=bool)
    for col in text_columns:
        # na=False: missing values count as "no match".
        col_hits = df[col].str.contains(word, na=False, regex=False)
        row_hits |= np.asarray(col_hits, dtype=bool)
    return df[row_hits]

In [90]:
# Literal substring search over the text columns, so any longer number that
# contains "3561" matches as well.
search_keyword(nvprof,"3561")

Unnamed: 0,Start (s),Duration (ms),Grid X,Grid Y,Grid Z,Block X,Block Y,Block Z,Registers Per Thread,Static SMem (KB),Dynamic SMem (KB),Size (MB),Throughput (GB/s),SrcMemType,DstMemType,Device,Context,Stream,Name
4267,1.446409,38.672432,72.0,2.0,128.0,8.0,32.0,1.0,105.0,10.250000,0.0,,,,,Quadro P2000 (0),1.0,7.0,"void cudnn::detail::wgrad_alg0_engine<float, i..."
41970,3.709265,0.180417,256.0,1.0,1.0,512.0,1.0,1.0,58.0,0.140625,12.0,,,,,Quadro P2000 (0),1.0,7.0,void cudnn::detail::bn_fw_tr_1C11_singleread<f...
148737,10.405964,0.010966,,,,,,,,,,,,,,,,,cudaLaunch (cudnn::maxwell::gemm::computeOffse...
150508,10.509191,0.002240,1.0,1.0,1.0,128.0,1.0,1.0,16.0,0.000000,0.0,,,,,Quadro P2000 (0),1.0,7.0,cudnn::maxwell::gemm::computeOffsetsKernel(cud...
201496,13.743434,0.006158,,,,,,,,,,,,,,,,,cuLaunchKernel (cupy_random_1_minus_x [183561])
202958,13.752820,0.275553,32768.0,1.0,1.0,128.0,1.0,1.0,15.0,0.000000,0.0,,,,,Quadro P2000 (0),1.0,7.0,cupy_random_1_minus_x [183561]
234397,15.757627,0.010283,,,,,,,,,,,,,,,,,cudaLaunch (void gen_sequenced<curandStateXORW...
237159,15.782674,0.040064,64.0,1.0,1.0,64.0,1.0,1.0,32.0,0.000000,0.0,,,,,Quadro P2000 (0),1.0,7.0,"void gen_sequenced<curandStateXORWOW, float, i..."
258405,17.109196,0.006447,,,,,,,,,,,,,,,,,cuLaunchKernel (weight_decay [235610])
258411,17.109225,0.006448,,,,,,,,,,,,,,,,,cuLaunchKernel (weight_decay [235616])


In [62]:
# Print every column label of the trace table, one per line.
for column_label in nvprof.columns:
    print(column_label)

Start (s)
Duration (ms)
Grid X
Grid Y
Grid Z
Block X
Block Y
Block Z
Registers Per Thread
Static SMem (KB)
Dynamic SMem (KB)
Size (MB)
Throughput (GB/s)
SrcMemType
DstMemType
Device
Context
Stream
Name
