# NSight System trace analysis

Parsing Nsight System trace exported as JSON

In [1]:
import pandas as pd
import json
from pandas import json_normalize
import numpy as np
from matplotlib import pyplot as plt 
import re

## Functions

In [2]:
# Get all rows from DF with the given correlation ID.
# Search in all df columns with 'correlationId' in the name.
def LookupCorrelationID(corrId, df):  # nvtx, cuda, kernels, sync):
    corrid_columns = [
        c for c in df.columns if c.lower().find('correlationid') >= 0
    ]
    dfcorr = None
    for c in corrid_columns:
        df_ = df[df[c] == corrId]
        if dfcorr is None:
            dfcorr = df_
        else:
            dfcorr = dfcorr.append(df_)
    return dfcorr.dropna(axis=1, how='all')


# Convert columns StartNs and EndNs to
# start and end in seconds.
def convertStartEndTimes(df):
    df_ = df.copy()
    df_['start'] = None
    df_['end'] = None
    start_cols = [c for c in df.columns if c.lower().find('startns') >= 0]
    end_cols = [c for c in df.columns if c.lower().find('endns') >= 0]
    for c in start_cols:
        rows = df_[c].notna()
        df_.loc[rows, 'start'] = df_.loc[rows, c] * 10e-10
    for c in end_cols:
        rows = df_[c].notna()
        df_.loc[rows, 'end'] = df_.loc[rows, c] * 10e-10
    return df_


# Get CUDA kernel names for events with the given correlationIDs
def LookupNamebyCorrID(corrid, df, names):
    dfcorr = LookupCorrelationID(corrid, df)
    if dfcorr.shape[0] == 0:
        return []
    namestrings = []
    if 'CudaEvent.kernel.shortName' in dfcorr.columns:
        if dfcorr['CudaEvent.kernel.shortName'].notna().any():
            shortnames = dfcorr['CudaEvent.kernel.shortName']
            shortnames = shortnames[shortnames.notna()].values
            for ID in shortnames:
                try:
                    n = int(ID)
                except:
                    print('Cannot convert {} to int.'.format(n))
                    continue
                namestrings.append(names[names['id'] == n]['value'].values[0])
    return namestrings


# Return rows that contain string
def searchRowsContaining(s, df):
    mask = df.applymap(lambda x: s.lower() in str(x).lower())
    df_ = df[mask.any(axis=1)]
    return df_


# Search events from df within the time range.
# DF must have 'start' and 'end' columns.
def lookupTimeRange(start, end, df):
    startdf = df[df['start'] >= start]
    rangedf = startdf[startdf['end'] <= end]
    return rangedf


# Combine trace evenets within time range and cuda kernels lookup
def lookupAPIandKernelsInTimerange(start, end, traces, kernels, names):
    # Lookup traces (API) events in the given range
    startdf = traces[traces['start'] >= start]
    rangedf = startdf[startdf['end'] <= end]
    # Store results in the DF
    results = pd.DataFrame(columns=[
        'correlationId', 'api_start', 'api_end', 'kernel', 'start', 'end',
        'duration'
    ])

    for i, row in rangedf.iterrows():
        # Get correlation ID from the trace event
        corrID = row['TraceProcessEvent.correlationId']
        if corrID == 0:
            continue
        # Get CUDA kernel by correlation ID
        kernel_event = LookupCorrelationID(corrID, kernels)
        if kernel_event is None or kernel_event.shape[0] == 0:
            # No kernels for trace event with the corrID
            continue
        # Get the name of the CUDA kernel
        name = LookupNamebyCorrID(corrID, kernels, names)
        # Append to results DF
        results.loc[results.shape[0]] = [
            corrID, row['start'], row['end'], name[0],
            kernel_event['start'].values[0], kernel_event['end'].values[0],
            kernel_event['duration'].values[0]
        ]
    return results


# Find NVTX event which encompasses given trace event
def NVTXforAPIevent(trace_event,nvtx,debug=False):
    if nvtx is None:
        return None

    # Start and end in seconds
    try:
        start = trace_event.loc['TraceProcessEvent.startNs'] * 10e-10
        end = trace_event.loc['TraceProcessEvent.endNs'] * 10e-10
    except KeyError as e:
        print(e)
        print('columns: {}'.format(trace_event))
    if debug:
        print('{} - {}'.format(start,end))
    # Search NVTX object encompassing events
    nvtxranges = nvtx[nvtx['end'].notna()].copy()
    nvtxranges = nvtxranges[nvtxranges['start'] <= start]
    nvtxranges = nvtxranges[nvtxranges['end'] >= end]
    names = nvtxranges['NvtxEvent.Text'].values
    return names


# Return True if the row value contains any of event name patterns
def searchEventPattern(row, event_names=None, debug=False):
    s = None
    if 'value' in row.index:
        s = row.loc['value']
    elif 'NvtxEvent.Text' in row.index:
        s = row.loc['NvtxEvent.Text']
    else:
        print('Can search only Names and NVTX dataframes.')
        return False
    for pattern in event_names:
        m = re.match(pattern, s, re.I)
        if m is not None:
            return True
        else:
            if debug:
                print("{} not found in {}".format(pattern, s))
    return False


# Parse an array of nvtx range names for iteration number
def GetIterationNumber(nvtx_arr):
    nvtx_name = [n for n in nvtx_arr if 'iteration' in n.lower()]
    if len(nvtx_name) == 0:
        return None
    nvtx_name = nvtx_name[0]  # Convert list to string
    s = nvtx_name.replace('Iteration ', '')
    try:
        i = int(s)
    except:
        print('Cannot convert {} to int'.format(s))
        return None
    return i

## Read CSV

In [3]:
df = pd.read_csv("logs/p3.2xlarge.cont/traces/20210108/nsys_trace_10_raw.csv")
int_columns = [
    'NvtxEvent.Timestamp', 'NvtxEvent.EndTimestamp', 'CudaEvent.startNs',
    'CudaEvent.endNs', 'CudaEvent.correlationId', 'CudaEvent.sync.eventId',
    'TraceProcessEvent.correlationId', 'TraceProcessEvent.name',
    'TraceProcessEvent.startNs', 'TraceProcessEvent.endNs', 'id'
]
for c in int_columns:
    df[c] = df[c].fillna(-1).astype(int).replace(-1, np.nan)
#     df[c] = df[c].astype(int)

df.sample(n=5)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,type,id,value,globalPid,filename,content,Type,MmapEvent.Timestamp,MmapEvent.Cpu,MmapEvent.KernelMode,...,CudaEvent.kernel.localMemoryTotal,CudaEvent.kernel.gridId,CudaEvent.kernel.registersPerThread,CudaEvent.kernel.sharedMemoryExecuted,CudaEvent.kernel.cacheConfig,CudaEvent.kernel.launched,CudaEvent.kernel.sharedMemoryConfig,globalTid,nameId,priority
631703,,,,,,,31.0,,,,...,,,,,,,,,,
486196,,,,,,,48.0,,,,...,,,,,,,,,,
684748,,,,,,,79.0,,,,...,193986560.0,14960.0,118.0,32768.0,1.0,1.0,0.0,,,
655563,,,,,,,48.0,,,,...,,,,,,,,,,
654192,,,,,,,48.0,,,,...,,,,,,,,,,


In [4]:
print('\n'.join(df.columns))

type
id
value
globalPid
filename
content
Type
MmapEvent.Timestamp
MmapEvent.Cpu
MmapEvent.KernelMode
MmapEvent.FilenameId
MmapEvent.VMA
MmapEvent.Start
MmapEvent.Size
MmapEvent.Pgoffset
MmapEvent.IdForBuildId
MmapEvent.GlobalTid
MmapEvent.NsTime
MmapEvent.DebuglinkFilenameId
MmapEvent.DebuglinkCRC
CommEvent.Timestamp
CommEvent.GlobalPid
CommEvent.NumOfCpus
CommEvent.Command
CommEvent.Commname
CommEvent.PathToExe
CommEvent.WorkDir
CommEvent.Args
CommEvent.Bitness
CommEvent.NsTime
CommEvent.EnvironId
DiagnosticEvent.Timestamp.Time
DiagnosticEvent.Timestamp.Type
DiagnosticEvent.Timestamp.NsTime
DiagnosticEvent.Source
DiagnosticEvent.Level
DiagnosticEvent.Text
DiagnosticEvent.GlobalProcess
SchedEvent.Timestamp
SchedEvent.Cpu
SchedEvent.SchedIn
SchedEvent.ThreadState
SchedEvent.GlobalTid
SchedEvent.NsTime
CompositeEvent.Timestamp
CompositeEvent.Cpu
CompositeEvent.Callchain
CompositeEvent.Events
CompositeEvent.State
CompositeEvent.FpUnwindResult
CompositeEvent.DwarfUnwindResult
CompositeEven

## Create DF for each type of objects: NVTX, trace, sync, CUDA events and CUDA kernels

In [5]:
debug = True
# Create detaframes for each event type
# NVTX objects that have NvtxEvent Timestamp
if 'NvtxEvent.Timestamp' in df.columns:
    nvtx = df[df['NvtxEvent.Timestamp'].notna()].dropna(axis=1, how='all')
    # Convert to seconds as displayed in the Nsight System window
    nvtx['start'] = nvtx['NvtxEvent.Timestamp'] * 10e-10
    nvtx['end'] = nvtx['NvtxEvent.EndTimestamp'].fillna(0) * 10e-10
    nvtx['end'] = nvtx['end'].replace(0, np.nan)
    print('NVTX: {}'.format(nvtx['NvtxEvent.Text'].unique()))
else:
    nvtx = None

traces = df[df['TraceProcessEvent.startNs'].notna()].dropna(axis=1, how='all')
traces['start'] = traces['TraceProcessEvent.startNs'] * 10e-10
traces['end'] = traces['TraceProcessEvent.endNs'] * 10e-10
traces['duration'] = (traces['TraceProcessEvent.endNs'] -
                      traces['TraceProcessEvent.startNs']) * 10e-10
if debug:
    print("Traces DF has {} rows.".format(traces.shape[0]))
    print(traces.head())
    print("." * 50)
    print(traces.dtypes)
    print("-" * 50)

sync = df[df['CudaEvent.sync.eventId'].notna()].dropna(axis=1, how='all')
# Convert to seconds
sync['start'] = sync['CudaEvent.startNs'] * 10e-10
sync['end'] = sync['CudaEvent.endNs'] * 10e-10
sync['duration'] = (sync['CudaEvent.endNs'] -
                    sync['CudaEvent.startNs']) * 10e-10
if debug:
    print("Sync DF has {} rows.".format(sync.shape[0]))

# CUDA event kernels objects
kernels = df[df['CudaEvent.kernel.shortName'].notna()].dropna(axis=1,
                                                              how='all')
# Convert to seconds
kernels['start'] = kernels['CudaEvent.startNs'] * 10e-10
kernels['end'] = kernels['CudaEvent.endNs'] * 10e-10
kernels['duration'] = (kernels['CudaEvent.endNs'] -
                       kernels['CudaEvent.startNs']) * 10e-10

if debug:
    print("Kernels DF has {} rows.".format(kernels.shape[0]))
    print("Kernels has {} columns:".format(kernels.shape[1]))
    print(kernels.columns)

# Names
names = df[df['value'].notna()].dropna(axis=1, how='all')
if debug:
    print("Names DF has {} rows.".format(names.shape[0]))
    print("Names:")
    print(names.head())
    print('-' * 50)

print('Names')
display(names.head())
print('NVTX')
display(nvtx.head())
print('traces')
display(traces[traces['duration'] > 0.00002].head(5))
print(sorted(list(traces['TraceProcessEvent.name'].unique())))
print('sync')
display(sync.head())
# print('CUDA')
# display(cuda.head())
print('kernels')
display(kernels.head())

NVTX: ['Moving model to GPU' 'Epoch 0 start' 'Training epoch start'
 'Iteration 1' 'FWD pass' 'Prediction and loss' 'BWD pass'
 'Optimizer update' 'Iteration 2' 'Iteration 3' 'Iteration 4'
 'Iteration 5' 'Iteration 6' 'Iteration 7' 'Iteration 8' 'Iteration 9'
 'Iteration 10' 'Iteration 11' 'Done calculating loss' 'Epoch 0 done']
Traces DF has 477236 rows.
       Type MmapEvent.Start DiagnosticEvent.GlobalProcess  \
21450  48.0                                                 
21496  47.0                                                 
21545  47.0                                                 
21602  48.0                                                 
21611  48.0                                                 

       TraceProcessEvent.startNs  TraceProcessEvent.endNs  \
21450                  2902082.0                9803258.0   
21496                  3949143.0                3949143.0   
21545                  6051792.0                6051792.0   
21602                 10293213.

Unnamed: 0,type,id,value,MmapEvent.Start,DiagnosticEvent.GlobalProcess
0,String,0.0,[Unknown],,
1,String,1.0,[kernel.kallsyms],,
2,String,2.0,[Max depth],,
3,String,3.0,[Broken backtraces],,
4,String,4.0,[Called from Java],,


NVTX


Unnamed: 0,Type,DiagnosticEvent.GlobalProcess,NvtxEvent.Type,NvtxEvent.Timestamp,NvtxEvent.Text,NvtxEvent.GlobalTid,NvtxEvent.EndTimestamp,NvtxEvent.DomainId,NvtxEvent.NsTime,NvtxEvent.Color,start,end
559194,59.0,,59.0,3207448000.0,Moving model to GPU,281483900000000.0,12774900000.0,0.0,True,,3.207448,12.774896
625598,34.0,,34.0,12776200000.0,Epoch 0 start,281483900000000.0,,0.0,True,861260800.0,12.776205,
625599,34.0,,34.0,12776210000.0,Training epoch start,281483900000000.0,,0.0,True,,12.776215,
631121,59.0,,59.0,13117150000.0,Iteration 1,281483900000000.0,13297160000.0,0.0,True,4278223000.0,13.117152,13.297165
631516,59.0,,59.0,13128510000.0,FWD pass,281483900000000.0,13219280000.0,0.0,True,4286513000.0,13.128511,13.219283


traces


Unnamed: 0,Type,MmapEvent.Start,DiagnosticEvent.GlobalProcess,TraceProcessEvent.startNs,TraceProcessEvent.endNs,TraceProcessEvent.correlationId,TraceProcessEvent.eventClass,TraceProcessEvent.name,TraceProcessEvent.returnValue,TraceProcessEvent.globalTid,TraceProcessEvent.nestingLevel,TraceProcessEvent.callchain,start,end,duration
21450,48.0,,,2902082.0,9803258.0,0.0,3.0,34.0,0.0,281483900000000.0,,,0.002902,0.009803,0.006901
21646,48.0,,,15437255.0,15459609.0,0.0,27.0,40.0,0.0,281483900000000.0,0.0,,0.015437,0.01546,2.2e-05
21774,48.0,,,26974212.0,27010146.0,0.0,27.0,40.0,0.0,281483900000000.0,0.0,,0.026974,0.02701,3.6e-05
21887,48.0,,,37885991.0,37907988.0,0.0,27.0,37.0,0.0,281483900000000.0,0.0,,0.037886,0.037908,2.2e-05
21944,48.0,,,43581865.0,43622782.0,0.0,27.0,40.0,0.0,281483900000000.0,0.0,,0.043582,0.043623,4.1e-05


[27.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 51.0, 1262.0, 1263.0, 1264.0, 1265.0, 1991.0, 1992.0, 2147.0, 3424.0, 3450.0, 4246.0, 4247.0, 4250.0, 4252.0, 4253.0, 4254.0, 4255.0, 4256.0, 4257.0, 4258.0, 4259.0, 5000.0, 5431.0, 6053.0, 6144.0, 6145.0, 6146.0, 6152.0, 6153.0, 6348.0, 6349.0, 9444.0, 9502.0, 9555.0, 9792.0, 9864.0, 16027.0, 16599.0, 16953.0, 16986.0, 17014.0, 17068.0, 17284.0, 17285.0, 17502.0, 17503.0, 17504.0, 17548.0, 17549.0, 17703.0, 20031.0, 20208.0, 20209.0, 20516.0, 20519.0, 20649.0, 20650.0, 20717.0, 20719.0, 20720.0, 20827.0, 20828.0, 20829.0, 20832.0, 20833.0, 20834.0, 20835.0, 20837.0, 20841.0, 20861.0, 20871.0, 20897.0, 20898.0, 21203.0, 21204.0, 21205.0, 21206.0, 21209.0, 21210.0, 21211.0, 21212.0]
sync


Unnamed: 0,Type,DiagnosticEvent.GlobalProcess,CudaEvent.startNs,CudaEvent.endNs,CudaEvent.correlationId,CudaEvent.deviceId,CudaEvent.contextId,CudaEvent.streamId,CudaEvent.eventClass,CudaEvent.globalPid,CudaEvent.sync.eventId,CudaEvent.sync.syncType,start,end,duration
623805,106.0,,12729080000.0,12729090000.0,233.0,0.0,1.0,7.0,5.0,281483900000000.0,4294967000.0,3.0,12.729081,12.729093,1.1e-05
623812,106.0,,12729440000.0,12729450000.0,242.0,0.0,1.0,7.0,5.0,281483900000000.0,4294967000.0,3.0,12.729441,12.729452,1e-05
623817,106.0,,12729540000.0,12729550000.0,251.0,0.0,1.0,7.0,5.0,281483900000000.0,4294967000.0,3.0,12.72954,12.72955,1e-05
623822,106.0,,12729660000.0,12729670000.0,260.0,0.0,1.0,7.0,5.0,281483900000000.0,4294967000.0,3.0,12.729663,12.729669,6e-06
623826,106.0,,12729730000.0,12729740000.0,269.0,0.0,1.0,7.0,5.0,281483900000000.0,4294967000.0,3.0,12.729726,12.729737,1.1e-05


kernels


Unnamed: 0,Type,DiagnosticEvent.GlobalProcess,CudaEvent.startNs,CudaEvent.endNs,CudaEvent.correlationId,CudaEvent.deviceId,CudaEvent.contextId,CudaEvent.streamId,CudaEvent.eventClass,CudaEvent.globalPid,...,CudaEvent.kernel.localMemoryTotal,CudaEvent.kernel.gridId,CudaEvent.kernel.registersPerThread,CudaEvent.kernel.sharedMemoryExecuted,CudaEvent.kernel.cacheConfig,CudaEvent.kernel.launched,CudaEvent.kernel.sharedMemoryConfig,start,end,duration
631850,79.0,,13139490000.0,13139490000.0,3846.0,0.0,1.0,7.0,3.0,281483900000000.0,...,193986560.0,5.0,18.0,0.0,1.0,1.0,0.0,13.139486,13.139489,2e-06
631854,79.0,,13139560000.0,13139750000.0,3849.0,0.0,1.0,7.0,3.0,281483900000000.0,...,193986560.0,6.0,128.0,65536.0,1.0,1.0,0.0,13.139557,13.139752,0.000196
631865,79.0,,13140020000.0,13140020000.0,3861.0,0.0,1.0,7.0,3.0,281483900000000.0,...,193986560.0,7.0,16.0,0.0,1.0,1.0,0.0,13.140022,13.140024,2e-06
631894,79.0,,13141060000.0,13141210000.0,3902.0,0.0,1.0,7.0,3.0,281483900000000.0,...,193986560.0,8.0,32.0,8192.0,1.0,1.0,0.0,13.141062,13.141213,0.000151
631901,79.0,,13141290000.0,13141370000.0,3910.0,0.0,1.0,7.0,3.0,281483900000000.0,...,193986560.0,9.0,16.0,0.0,1.0,1.0,0.0,13.141289,13.141371,8.2e-05


In [6]:
traces[(traces['duration']> 0.001) & (traces['start']>14)].sort_values(['start']).rename_axis("n", axis="columns").head()

n,Type,MmapEvent.Start,DiagnosticEvent.GlobalProcess,TraceProcessEvent.startNs,TraceProcessEvent.endNs,TraceProcessEvent.correlationId,TraceProcessEvent.eventClass,TraceProcessEvent.name,TraceProcessEvent.returnValue,TraceProcessEvent.globalTid,TraceProcessEvent.nestingLevel,TraceProcessEvent.callchain,start,end,duration
670141,48.0,,,14002940000.0,14070360000.0,0.0,27.0,3450.0,0.0,281483900000000.0,0.0,"[{'Symbol': 3450, 'Module': 58, 'OriginalIP': ...",14.002944,14.070356,0.067412
671354,48.0,,,14024260000.0,14025430000.0,119132.0,0.0,20720.0,0.0,281483900000000.0,,,14.024264,14.025426,0.001162
671357,48.0,,,14024280000.0,14377670000.0,0.0,27.0,9502.0,0.0,281483900000000.0,0.0,"[{'Symbol': 9551, 'Module': 79, 'OriginalIP': ...",14.02428,14.377668,0.353388
672134,48.0,,,14040300000.0,14140490000.0,0.0,27.0,9792.0,0.0,281483900000000.0,0.0,"[{'Symbol': 9793, 'Module': 58, 'OriginalIP': ...",14.040296,14.140495,0.100199
673323,48.0,,,14070280000.0,14112060000.0,0.0,27.0,3450.0,0.0,281483900000000.0,0.0,"[{'Symbol': 3450, 'Module': 58, 'OriginalIP': ...",14.070276,14.112062,0.041785


In [7]:
# Search for correlation ID
correlation_cols = [c for c in df.columns if 'correlationId' in c]
print(correlation_cols)

['TraceProcessEvent.correlationId', 'CudaEvent.correlationId']


In [10]:
id = 18979  # elementwise_kernel
dfid = df[(df['TraceProcessEvent.correlationId'] == id) |
          (df['CudaEvent.correlationId'] == id)].dropna(axis=1, how='all')
print(dfid.columns)
display(dfid.rename_axis("n", axis="columns"))
display(names[names['id'].isin([20719])].rename_axis("n", axis="columns"))

Index(['Type', 'TraceProcessEvent.startNs', 'TraceProcessEvent.endNs',
       'TraceProcessEvent.correlationId', 'TraceProcessEvent.eventClass',
       'TraceProcessEvent.name', 'TraceProcessEvent.returnValue',
       'TraceProcessEvent.globalTid', 'CudaEvent.startNs', 'CudaEvent.endNs',
       'CudaEvent.correlationId', 'CudaEvent.deviceId', 'CudaEvent.contextId',
       'CudaEvent.streamId', 'CudaEvent.eventClass', 'CudaEvent.globalPid',
       'CudaEvent.memcpy.sizebytes', 'CudaEvent.memcpy.copyKind',
       'CudaEvent.memcpy.srcKind', 'CudaEvent.memcpy.dstKind'],
      dtype='object')


n,Type,TraceProcessEvent.startNs,TraceProcessEvent.endNs,TraceProcessEvent.correlationId,TraceProcessEvent.eventClass,TraceProcessEvent.name,TraceProcessEvent.returnValue,TraceProcessEvent.globalTid,CudaEvent.startNs,CudaEvent.endNs,CudaEvent.correlationId,CudaEvent.deviceId,CudaEvent.contextId,CudaEvent.streamId,CudaEvent.eventClass,CudaEvent.globalPid,CudaEvent.memcpy.sizebytes,CudaEvent.memcpy.copyKind,CudaEvent.memcpy.srcKind,CudaEvent.memcpy.dstKind
637892,48.0,13297360000.0,13297380000.0,18979.0,0.0,20719.0,0.0,281483900000000.0,,,,,,,,,,,,
637895,80.0,,,,,,,,13297390000.0,13298530000.0,18979.0,0.0,1.0,7.0,1.0,281483900000000.0,6021120.0,1.0,1.0,2.0


n,type,id,value,MmapEvent.Start,DiagnosticEvent.GlobalProcess
20719,String,20719.0,cudaMemcpyAsync_v3020,,


## parseOneTrace

In [75]:
# Search Events by pattern
event_name_patterns =  ['cublasSgemm_.*'] #,'.*convolutionbackwarddata.*','prediction and loss']
event_names_df = names[names.apply(searchEventPattern, event_names=event_name_patterns,
                                   axis=1)]
ids = event_names_df['id'].values
eventnames = event_names_df['value'].values

event_names_df[['id','value']]

Unnamed: 0,id,value
17285,17285.0,cublasSgemm_v2


In [82]:
# Just for check
print(ids)
mask = None
show_columns = [
    'Type', 'TraceProcessEvent.startNs', 'TraceProcessEvent.endNs',
    'TraceProcessEvent.correlationId', 
    'TraceProcessEvent.name', 'CudaEvent.startNs', 'CudaEvent.endNs',
    'CudaEvent.correlationId', 'CudaEvent.kernel.shortName','name'
]
for id, name in zip(ids, eventnames):
    f = df.eq(id).any(1)
    print("{} corr ids: {}, types: {}".format(
        name,
        df.loc[f,
               ['TraceProcessEvent.correlationId', 'CudaEvent.correlationId']].
        drop_duplicates(), df.loc[f, 'Type'].unique()))
    df_ = df[f]
    df_ = df_[df_['Type'].notna()].dropna(how='all', axis=1)
    df_.loc[:,'name'] = df_['TraceProcessEvent.name'].apply(lambda s: names[names['id']==s]['value'].values[0])
    display(df_[[c for c in df_.columns if c in show_columns]].rename_axis("n", axis="columns"))
    if mask is None:
        mask = f
    else:
        mask = mask | f
df_ = df[mask].dropna(axis=1, how='all').copy()
print("-" * 12)
print(df_['Type'].unique())
if 'CudaEvent.kernel.shortName' in df_.columns:
    print(df_['CudaEvent.kernel.shortName'].unique())
    df_ = df_[(df_['Type'].notna()) |
              (df_['CudaEvent.kernel.shortName'].notna())]
display(df_[[
    'Type', 'TraceProcessEvent.startNs', 'TraceProcessEvent.endNs',
    'TraceProcessEvent.correlationId', 'TraceProcessEvent.name',
    'CudaEvent.startNs', 'CudaEvent.endNs', 'CudaEvent.correlationId',
    'CudaEvent.kernel.shortName'
]].rename_axis("n", axis="columns"))

[17285.]
cublasSgemm_v2 corr ids:         TraceProcessEvent.correlationId  CudaEvent.correlationId
17285                               NaN                      NaN
632904                              0.0                      NaN, types: [nan 48.]


n,Type,TraceProcessEvent.startNs,TraceProcessEvent.endNs,TraceProcessEvent.correlationId,TraceProcessEvent.name,name
632904,48.0,1.316972e+10,1.317178e+10,0.0,17285.0,cublasSgemm_v2
632978,48.0,1.317245e+10,1.317254e+10,0.0,17285.0,cublasSgemm_v2
634233,48.0,1.320285e+10,1.320295e+10,0.0,17285.0,cublasSgemm_v2
634254,48.0,1.320327e+10,1.320331e+10,0.0,17285.0,cublasSgemm_v2
634816,48.0,1.321903e+10,1.321918e+10,0.0,17285.0,cublasSgemm_v2
...,...,...,...,...,...,...
683982,48.0,1.430057e+10,1.430065e+10,0.0,17285.0,cublasSgemm_v2
684004,48.0,1.430108e+10,1.430112e+10,0.0,17285.0,cublasSgemm_v2
684473,48.0,1.431328e+10,1.431335e+10,0.0,17285.0,cublasSgemm_v2
684530,48.0,1.431511e+10,1.431517e+10,0.0,17285.0,cublasSgemm_v2


------------
[nan 48.]


KeyError: "['CudaEvent.kernel.shortName', 'CudaEvent.correlationId', 'CudaEvent.endNs', 'CudaEvent.startNs'] not in index"

In [83]:
# Searching for fused_dropout CPU-side events (corr id = 6138)
corrid = 3619
df_ = df[df.eq(corrid).any(1)]
df_ = df_[df_['Type'].notna()].dropna(how='all', axis=1)
print(df_.columns)
display(df_[[
    'Type', 'TraceProcessEvent.startNs', 'TraceProcessEvent.endNs',
    'TraceProcessEvent.correlationId', 
    'TraceProcessEvent.name', 'CudaEvent.startNs', 'CudaEvent.endNs',
    'CudaEvent.correlationId', 'CudaEvent.kernel.shortName'
]].rename_axis("n", axis="columns"))

Index(['Type', 'TraceProcessEvent.startNs', 'TraceProcessEvent.endNs',
       'TraceProcessEvent.correlationId', 'TraceProcessEvent.eventClass',
       'TraceProcessEvent.name', 'TraceProcessEvent.returnValue',
       'TraceProcessEvent.globalTid', 'CudaEvent.startNs', 'CudaEvent.endNs',
       'CudaEvent.correlationId', 'CudaEvent.deviceId', 'CudaEvent.contextId',
       'CudaEvent.streamId', 'CudaEvent.eventClass', 'CudaEvent.globalPid',
       'CudaEvent.sync.eventId', 'CudaEvent.sync.syncType',
       'CudaEvent.kernel.demangledName', 'CudaEvent.kernel.shortName',
       'CudaEvent.kernel.eventCategory', 'CudaEvent.kernel.gridX',
       'CudaEvent.kernel.gridY', 'CudaEvent.kernel.gridZ',
       'CudaEvent.kernel.blockX', 'CudaEvent.kernel.blockY',
       'CudaEvent.kernel.blockZ', 'CudaEvent.kernel.staticSharedMemory',
       'CudaEvent.kernel.dynamicSharedMemory',
       'CudaEvent.kernel.localMemoryPerThread',
       'CudaEvent.kernel.localMemoryTotal', 'CudaEvent.kernel.gridId',

n,Type,TraceProcessEvent.startNs,TraceProcessEvent.endNs,TraceProcessEvent.correlationId,TraceProcessEvent.name,CudaEvent.startNs,CudaEvent.endNs,CudaEvent.correlationId,CudaEvent.kernel.shortName
631385,48.0,13122850000.0,13124000000.0,3619.0,20720.0,,,,
631386,106.0,,,,,13122850000.0,13124000000.0,3619.0,
646997,79.0,,,,,13508500000.0,13508510000.0,46289.0,20936.0


In [90]:
df_ = df.loc[20841]
df_

type                                                   String
id                                                      20841
value                                  cudaLaunchKernel_v7000
globalPid                                                 NaN
filename                                                  NaN
                                                ...          
CudaEvent.kernel.launched                                 NaN
CudaEvent.kernel.sharedMemoryConfig                       NaN
globalTid                                                 NaN
nameId                                                    NaN
priority                                                  NaN
Name: 20841, Length: 109, dtype: object

In [37]:
# CUDA API and kernels
print("ID {}".format(ids))
matched_kernels = kernels[kernels['CudaEvent.kernel.shortName'].isin(
    ids)].copy()
print(matched_kernels['Type'].unique())
if matched_kernels.shape[0]>0:
    matched_kernels = matched_kernels[[
        'Type','CudaEvent.kernel.shortName', 'CudaEvent.startNs', 'CudaEvent.endNs', 'CudaEvent.correlationId',
        'start', 'end', 'duration'
    ]]
    matched_kernels.loc[:,'name'] = matched_kernels['CudaEvent.kernel.shortName'].apply(lambda s: event_names_df[event_names_df['id']==s]['value'].values[0])
    matched_kernels.loc[:,'NVTX'] = np.nan
    matched_kernels.loc[:,'GPU side'] = True
    display(matched_kernels)

ID [17285. 17504.]
[]


In [39]:
if matched_kernels.shape[0]>0:
    # Search corresponding CUDA API (traces)
    matched_traces = traces[traces['TraceProcessEvent.correlationId'].isin(
        matched_kernels['CudaEvent.correlationId'].unique())].copy()
    # TraceProcessEvent.name is not importnat (all same?)
    # using corresponding CUDA kernel names.
    matched_traces.loc[:, 'name'] = matched_traces[
        'TraceProcessEvent.correlationId'].apply(lambda s: matched_kernels[
            matched_kernels['CudaEvent.correlationId'] == s]['name'].values[0])
    matched_traces.loc[:, 'NVTX'] = matched_traces.apply(NVTXforAPIevent,
                                                         nvtx=nvtx,
                                                         axis=1)
    matched_traces.loc[:,'GPU side'] = False
    display(matched_traces[['TraceProcessEvent.startNs', 'TraceProcessEvent.endNs',
           'TraceProcessEvent.correlationId','TraceProcessEvent.name','TraceProcessEvent.globalTid','start', 'end', 'duration']])
    print("TraceProcessEvent.name")
    display(names[names['id'].isin(matched_traces['TraceProcessEvent.name'].unique())])

In [40]:
matched_traces.columns

Index(['Type', 'MmapEvent.Start', 'DiagnosticEvent.GlobalProcess',
       'TraceProcessEvent.startNs', 'TraceProcessEvent.endNs',
       'TraceProcessEvent.correlationId', 'TraceProcessEvent.eventClass',
       'TraceProcessEvent.name', 'TraceProcessEvent.returnValue',
       'TraceProcessEvent.globalTid', 'TraceProcessEvent.nestingLevel',
       'TraceProcessEvent.callchain', 'start', 'end', 'duration', 'name'],
      dtype='object')

In [32]:
# Concat API events (traces) and CUDA kernels
merged = pd.concat([
    matched_kernels[['start', 'end', 'duration', 'name', 'NVTX', 'GPU side']],
    matched_traces[['start', 'end', 'duration', 'name', 'NVTX', 'GPU side']]
],ignore_index=True)
merged

Unnamed: 0,start,end,duration,name,NVTX,GPU side
0,13.143182,13.143256,0.000073,max_pool_forward_nchw,,True
1,13.148306,13.148362,0.000056,max_pool_forward_nchw,,True
2,13.153331,13.153368,0.000037,max_pool_forward_nchw,,True
3,13.159789,13.159838,0.000049,max_pool_forward_nchw,,True
4,13.161288,13.161326,0.000038,max_pool_forward_nchw,,True
...,...,...,...,...,...,...
575,14.346310,14.346323,0.000014,max_pool_backward_nchw,"[Iteration 10, BWD pass]",False
576,14.350483,14.350498,0.000015,max_pool_backward_nchw,"[Iteration 10, BWD pass]",False
577,14.353922,14.353933,0.000011,max_pool_backward_nchw,"[Iteration 10, BWD pass]",False
578,14.355359,14.355371,0.000012,max_pool_backward_nchw,"[Iteration 10, BWD pass]",False


In [77]:
# Explore NVTX tables
# CPU-side vs GPU side?
nvtx_name = 'Iteration 1'
nvtx_ = nvtx[nvtx['NvtxEvent.Text']==nvtx_name].copy()
nvtx_.loc[:,'duration'] = nvtx_['end'] - nvtx_['start']
display(nvtx_)
# CPU-side!

Unnamed: 0,Type,DiagnosticEvent.GlobalProcess,NvtxEvent.Type,NvtxEvent.Timestamp,NvtxEvent.Text,NvtxEvent.GlobalTid,NvtxEvent.EndTimestamp,NvtxEvent.DomainId,NvtxEvent.NsTime,NvtxEvent.Color,start,end,duration
631121,59.0,,59.0,13117150000.0,Iteration 1,281483900000000.0,13297160000.0,0.0,True,4278223000.0,13.117152,13.297165,0.180012


In [76]:
print("Searching NVTX ...")
nvtx_events_df = nvtx[nvtx.apply(searchEventPattern, event_names=event_name_patterns,
                                 axis=1)].copy()
print('Matched Events:')
nvtx_events_df

Searching NVTX ...
Matched Events:


Unnamed: 0,Type,NvtxEvent.Type,NvtxEvent.Timestamp,NvtxEvent.Text,NvtxEvent.GlobalTid,NvtxEvent.EndTimestamp,NvtxEvent.DomainId,NvtxEvent.NsTime,NvtxEvent.Color,start,end


In [16]:
# Search trace events (cuDNN, cuBLAS API events, CPU side) with the names
# that were found earlier
df_ = traces.copy()
API_events = df_[df_['TraceProcessEvent.name'].isin(event_names_df['id'])].dropna(
    axis=1, how='all')
print("Found {} API events".format(API_events.shape[0]))
API_events.head()

Found 0 API events


In [63]:
# Store API event names
API_events['name'] = API_events['TraceProcessEvent.name'].apply(
    lambda x: event_names_df[event_names_df['id'] == x]['value'].values[0])

print("Unique API events:")
print(API_events['name'].unique())

# Search NVTX reagons encompassing API events
API_events['NVTX'] = API_events.apply(NVTXforAPIevent, axis=1)
API_events.sample(n=5)

Unique API events:
['cudaStreamSynchronize_v3020' 'cudnnConvolutionBackwardData']


Unnamed: 0,Type,TraceProcessEvent.startNs,TraceProcessEvent.endNs,TraceProcessEvent.correlationId,TraceProcessEvent.eventClass,TraceProcessEvent.name,TraceProcessEvent.returnValue,TraceProcessEvent.globalTid,start,end,duration,name,NVTX
7501,48.0,6068720000.0,6068771000.0,0.0,28.0,50.0,0.0,282054495340364,6.06872,6.068771,5.1e-05,cudnnConvolutionBackwardData,"[Iteration 2, BWD pass]"
4271,48.0,5953114000.0,5953174000.0,0.0,28.0,50.0,0.0,282054495340364,5.953114,5.953174,6e-05,cudnnConvolutionBackwardData,"[Iteration 1, BWD pass]"
6316,48.0,6040928000.0,6040994000.0,0.0,28.0,50.0,0.0,282054495340364,6.040928,6.040994,6.6e-05,cudnnConvolutionBackwardData,"[Iteration 2, BWD pass]"
1673,48.0,5526812000.0,5526905000.0,1913.0,0.0,63.0,0.0,282054495340270,5.526812,5.526905,9.3e-05,cudaStreamSynchronize_v3020,[Moving model to GPU]
6896,48.0,6055918000.0,6055984000.0,0.0,28.0,50.0,0.0,282054495340364,6.055918,6.055984,6.6e-05,cudnnConvolutionBackwardData,"[Iteration 2, BWD pass]"


In [73]:
events = pd.DataFrame(columns=['name', 'NVTX', 'duration', 'GPU side'])

# Search CUDA kernels for API events
# for _, row in API_events.iterrows():
row = API_events.loc[1673]

print(row[['name', 'TraceProcessEvent.correlationId']])
start = row.loc['start']
end = row.loc['end']
duration = end - start
APIname = row['name']
NVTX_arr = row['NVTX']
NVTX_s = ','.join(row['NVTX'])
# Add CPU-side event
events.loc[events.shape[0]] = [row['name'], NVTX_s, duration, False]

# Search CUDA API events in the time range,
# return CUDA kernels
df_ = lookupAPIandKernelsInTimerange(start, end, traces, kernels, names)
print('{} kernels for {:} nvtx:{} ({:.5f}-{:.5f})'.format(
    df_.shape[0], APIname, NVTX_s, df_['start'].min(), df_['end'].max()))
if df_.shape[0] > 0:
    # Execution time of all kernels from the first to the last
    duration = df_['end'].max() - df_['start'].min()
    print('CUDA kernels found by time range. Duration: {:5f}-{:5f}={:5f}'.format(
        df_['end'].max(), df_['start'].min(), duration))
    events.loc[events.shape[0]] = [APIname, NVTX_s, duration, True]
    print("API events and kernels found by time range")
    display(events)

# Search by correlationID
if row['TraceProcessEvent.correlationId'] != 0:
    dfcorr = LookupCorrelationID(row['TraceProcessEvent.correlationId'], df)
    dfcorr = convertStartEndTimes(dfcorr)
    if dfcorr.shape[0] > 0:
        print('dfcorr')
        display(dfcorr)
        try:
            # Leave only CUDA (GPU-side) events
            dfcorr = dfcorr[dfcorr['CudaEvent.startNs'].notna()]
            dfcorr = convertStartEndTimes(dfcorr)

            dfcorr = dfcorr[['CudaEvent.correlationId', 'start', 'end']]
            duration = dfcorr['end'].max() - dfcorr['start'].min()
            print("Events with correlationID {}: duration {:5f}-{:5f}={:5f}".format(
                row['TraceProcessEvent.correlationId'],dfcorr['end'].max(), dfcorr['start'].min(), duration ))

        except:
            print("Exception. No CudaEvent.startNs in ")
            print(dfcorr.columns)

        events.loc[events.shape[0]] = [APIname, NVTX_s, duration, True]

print(
    "API events and kernels found by time range, NVTX ranges and corresponding CUDA kernels"
)
display(events)

name                               cudaStreamSynchronize_v3020
TraceProcessEvent.correlationId                           1913
Name: 1673, dtype: object
0 kernels for cudaStreamSynchronize_v3020 nvtx:Moving model to GPU (nan-nan)
dfcorr


Unnamed: 0,Type,TraceProcessEvent.startNs,TraceProcessEvent.endNs,TraceProcessEvent.correlationId,TraceProcessEvent.eventClass,TraceProcessEvent.name,TraceProcessEvent.returnValue,TraceProcessEvent.globalTid,CudaEvent.startNs,CudaEvent.endNs,CudaEvent.correlationId,CudaEvent.deviceId,CudaEvent.contextId,CudaEvent.streamId,CudaEvent.eventClass,CudaEvent.globalPid,CudaEvent.sync.eventId,CudaEvent.sync.syncType,start,end
1673,48.0,5526812000.0,5526905000.0,1913.0,0.0,63.0,0.0,282054495340270.0,,,,,,,,,,,5.52681,5.5269
1674,106.0,,,,,,,,5526813000.0,5526904000.0,1913.0,0.0,1.0,7.0,5.0,282054495305728.0,4294967000.0,3.0,5.52681,5.5269


Events with correlationID 1913.0: duration 5.526904-5.526813=0.000091
API events and kernels found by time range, NVTX ranges and corresponding CUDA kernels


Unnamed: 0,name,NVTX,duration,GPU side
0,cudaStreamSynchronize_v3020,Moving model to GPU,9.3e-05,False
1,cudaStreamSynchronize_v3020,Moving model to GPU,9.1e-05,True


In [54]:
# NVTX events
# use_columns = ['NVTX', 'duration', 'start', 'end']
# nvtx_events_df = nvtx_events_df[use_columns]
nvtx_events_df.head()

Unnamed: 0,Type,NvtxEvent.Type,NvtxEvent.Timestamp,NvtxEvent.Text,NvtxEvent.GlobalTid,NvtxEvent.EndTimestamp,NvtxEvent.DomainId,NvtxEvent.NsTime,NvtxEvent.Color,start,end
3178,59.0,59.0,5914788000.0,Prediction and loss,282054495340270,5915680000.0,0,True,4294902000.0,5.914788,5.91568
6279,59.0,59.0,6039374000.0,Prediction and loss,282054495340270,6039682000.0,0,True,4294902000.0,6.039374,6.039682
10070,59.0,59.0,6175950000.0,Prediction and loss,282054495340270,6176267000.0,0,True,4294902000.0,6.17595,6.176267


In [55]:
nvtx_event = nvtx_events_df.loc[3178]
print(nvtx_event)
# Find encompassing NVTX ranges
nvtxranges =nvtx[nvtx['end'].notna()].copy()
nvtxranges = nvtxranges[nvtxranges['start'] <= nvtx_event['start']]
nvtxranges = nvtxranges[nvtxranges['end'] > nvtx_event['end']]
nvtx_names = ','.join(nvtxranges['NvtxEvent.Text'].values)

# Add NVTX event to events DF
events.loc[events.shape[0]] = [nvtx_event['NvtxEvent.Text'], nvtx_names, duration, False]
display(events.tail())


Type                                       59
NvtxEvent.Type                             59
NvtxEvent.Timestamp               5.91479e+09
NvtxEvent.Text            Prediction and loss
NvtxEvent.GlobalTid           282054495340270
NvtxEvent.EndTimestamp            5.91568e+09
NvtxEvent.DomainId                          0
NvtxEvent.NsTime                         True
NvtxEvent.Color                    4.2949e+09
start                                 5.91479
end                                   5.91568
Name: 3178, dtype: object


Unnamed: 0,name,NVTX,duration,GPU side
0,cudnnBatchNormalizationForwardTrainingEx,"Iteration 2,FWD pass",2.2e-05,False
1,cudnnBatchNormalizationForwardTrainingEx,"Iteration 2,FWD pass",2.7e-05,True
2,Prediction and loss,Iteration 1,2.2e-05,False


In [74]:
# Find CUDA kernel time (start, end, duration) for each NVTX event
start = nvtx_event['start']
end = nvtx_event['end']
cuda_kernels = lookupAPIandKernelsInTimerange(start, end, traces, kernels, names)
print('CUDA Kernels')
display(cuda_kernels.head())
cuda_start = cuda_kernels['start'].min()
cuda_end = cuda_kernels['end'].max()
duration = cuda_end - cuda_start
print('CUDA times: {:.5f}-{:.5f} ({:.5f}s)'.format(cuda_start, cuda_end,
 duration))
df_cuda = pd.DataFrame(columns=['name', 'NVTX', 'duration', 'GPU side'],
                       data=[[nvtx_event['NvtxEvent.Text'], nvtx_names, duration, True]])
print('CUDA kernels:')
display(df_cuda)
events = events.append(df_cuda, ignore_index=True)
print('Result')
display(events)

CUDA Kernels


Unnamed: 0,correlationId,api_start,api_end,kernel,start,end,duration
0,8174.0,5.915019,5.915082,kernelTransformReduceInnermostDimIndex,5.915077,5.915107,3e-05
1,8187.0,5.915318,5.91537,cunn_SoftMaxForward,5.915372,5.915383,1e-05
2,8204.0,5.91559,5.915644,cunn_ClassNLLCriterion_updateOutput_kernel,5.915647,5.915653,6e-06


CUDA times: 5.91508-5.91565 (0.00058s)
CUDA kernels:


Unnamed: 0,name,NVTX,duration,GPU side
0,Prediction and loss,Iteration 1,0.000576,True


Result


Unnamed: 0,name,NVTX,duration,GPU side
0,cudaStreamSynchronize_v3020,Moving model to GPU,9.3e-05,False
1,cudaStreamSynchronize_v3020,Moving model to GPU,9.1e-05,True
2,Prediction and loss,Iteration 1,0.000576,True


In [None]:
# List CUDA events started after 9.3576s
events_aftertime = kernels[kernels['start']>4.3576].sort_values(by=['start'])
columns = ['Type','CudaEvent.correlationId','CudaEvent.kernel.shortName','start','end','duration']
display(events_aftertime[columns].head(10))

Get name of the kernel with correlation ID 2703

In [None]:
shortNameID = df[df['CudaEvent.correlationId']==233]['CudaEvent.kernel.shortName'].values[0]
print(shortNameID)

In [None]:
dfcorr = LookupCorrelationID(row[233], df)

In [None]:
events = names.loc[names['id'] == shortNameID] #['value'].values
print(events)

In [None]:
df_ = df[df['id'].notna()].copy()
# df_['id'] = df_['id'].astype(int)
df_.loc[df_['id']==shortNameID]['value']

In [None]:
corrid_columns = [c for c in df.columns if c.lower().find('correlationid')>=0]
corrid_columns

In [None]:
ind1 = (df['TraceProcessEvent.correlationId'] == 4070)
count1 = len([i for i in ind1 if i is True])
ind2 = (df['CudaEvent.correlationId'] == 4070)
count2 = len([i for i in ind2 if i is True])
print(count1, count2)


In [None]:
df_ = None
df_ = df[df['TraceProcessEvent.correlationId'] == 4070].dropna(axis=1,how='all')
display(df_)
df_2 = df[df['CudaEvent.correlationId'] == 4070].dropna(axis=1,how='all')
df_ = pd.merge(df_,df_2, how='outer')
display(df_)

In [None]:
# All trace events ConvolutionForward
# Find sync event with event ID 37 (cudnnConvolutionForward)
df37 = df[(df['TraceProcessEvent.name']==37)].dropna(axis=1,how='all')
convertStartEndTimes(df37)

In [None]:
# Search all events with CudaEvent.sync.eventID == 37 : are they related to cudnnConvolutionForward?
df_ = convertStartEndTimes(df[(df['CudaEvent.sync.eventId']==37)])
# Print full 1st row
for c in df_.columns:    
    print('{}: \t{}'.format(c,df_.iloc[0][c]))

### Lookup event Name by correlationID

In [None]:
corrid = 2703
dfcorr = LookupCorrelationID(corrid,df)
display(dfcorr)
for c in dfcorr.columns:
    print('{}: \t {}, {}'.format(c,dfcorr.iloc[0][c],dfcorr.iloc[1][c]))

In [None]:
print(dfcorr['CudaEvent.kernel.shortName'].notna().any())
shortnames = dfcorr['CudaEvent.kernel.shortName']
shortnames = shortnames[shortnames.notna()].values
print(shortnames)
for n in shortnames:
    try:
        n = int(n)        
    except:
        print('Cannot convert {} to int.'.format(n))
        continue
    print(names[names['id']==n]['value'].values)
    
display(names[names['id']==106])

In [None]:
LookupNamebyCorrID(2703, df, names)

#### Find trace event and CUDA event for a given correlation ID

In [None]:
corrID = 2703
trace_event = LookupCorrelationID(corrID, traces)
kernel_event = LookupCorrelationID(corrID, kernels)
display(trace_event)
display(kernel_event)

### Search rows containing a string

In [None]:
searchres = searchRowsContaining('generateWinogradTiles', df).dropna(axis=1,how='all')
display(searchres)

### Search trace events within given time range

In [None]:
# Search the first occurance of convolutionForward
df37 = convertStartEndTimes(df[(df['TraceProcessEvent.name']==37)].dropna(axis=1,how='all'))
display(df37.head(1))
timerange = (df37.iloc[0]['start'],df37.iloc[0]['end'])
print(timerange)

In [None]:
df_ = lookupTimeRange(timerange[0],timerange[1],traces)
df_

### Search for CUDA kernels corresponding to trace events

In [None]:
# Store results in the DF
result_kernels = pd.DataFrame(
    columns=['correlationId', 'shortname', 'start', 'end', 'duration'])
# Take corrIDs from the lookup results (from previous block)
for corrID in df_['TraceProcessEvent.correlationId'].unique():
    if corrID == 0:
        continue
    kernel_event = LookupCorrelationID(corrID, kernels)
    name = LookupNamebyCorrID(corrID, df, names)
#     display(kernel_event)
    # Append to results DF
    result_kernels.loc[result_kernels.shape[0]] = [
        corrID, name[0], kernel_event['start'].values[0],
        kernel_event['end'].values[0], kernel_event['duration'].values[0]
    ]
    print('{} {}'.format(corrID,name))
display(result_kernels)

In [None]:
print(timerange)
results = lookupAPIandKernelsInTimerange(timerange[0], timerange[1], traces, kernels,
                               names)
display(results)

### All ConvolutionForward events by iteration number

In [None]:
# Find NVTX event which encompasses given trace event
def NVTXforAPIevent(corrID, traces, nvtx):
    events = traces[traces['TraceProcessEvent.correlationId']==corrID]
    if events.shape[0]==0:
        print('No events with correlation ID {}'.format(corrID))
        return None
    start = events['start'].min()
    end = events['start'].max()
#     print('Have {} events from {} till {}'.format(events.shape[0],start,end))
    # Search NVTX object encompassing events
    nvtxranges = nvtx[nvtx['end'].notna()]
    nvtxranges = nvtxranges[nvtxranges['start'] <= start]
    nvtxranges = nvtxranges[nvtxranges['end'] >= end]
    return nvtxranges

In [None]:
NVTXforAPIevent(9860, traces,nvtx)

## Time of all ConvolutionFoward events

In [None]:
# Find ID for the given event name
APIevent_name = 'convolutionForward'
APIevent = searchRowsContaining(APIevent_name,names)
# display(APIevent)
APIevent_id = APIevent.iloc[0]['id'].astype(int)
print(APIevent_id)
# Find events with this ID
df_APIevents = traces[traces['TraceProcessEvent.name']==APIevent_id].dropna(axis=1, how='all')
df_APIevents

In [None]:
def getAPIevent(corrId):
#     print('Got corrID {}'.format(corrId))
    nvtxevent = NVTXforAPIevent(corrId, traces, nvtx)
    return nvtxevent.iloc[0]['NvtxEvent.Text']

In [None]:
# DF for storing results
# One row for each CUDA kernel
# df_results = pd.DataFrame(columns=['NVTX range','API start','API end','kernel','start','end','duration'])
df_results = None
for i, api_range in df_APIevents.iterrows():
    #     if i > 3400:
    #         break
    #     print(i)
    api_start = api_range['start']
    api_end = api_range['end']
    kernels_ = lookupAPIandKernelsInTimerange(api_start, api_end, traces,
                                              kernels, names)
    kernels_['nvtx'] = kernels_['correlationId'].apply(getAPIevent)
    if df_results is None:
        df_results = kernels_
    else:
        df_results = df_results.append(kernels_, ignore_index=True)
display(df_results.head())

In [None]:
def GetIterationNumber(nvtx_name):
    s = nvtx_name.replace('Iteration ','')
    try:
        i = int(s)
    except:
        print('Cannot convert {} to int'.format(s))
        return None
    return i

In [None]:
itertimes = df_results[['duration','nvtx']].groupby(['nvtx'],as_index=False).sum()
itertimes['iteration'] = itertimes['nvtx'].apply(GetIterationNumber)
itertimes.sort_values(by=['iteration'],inplace=True)
itertimes.head()

In [None]:
fig, ax = plt.subplots(figsize=(10, 5), dpi=120)
itertimes.plot(x='iteration', y='duration', marker='o', ms=4, mfc='w', ax=ax)
ax.set_ylim(0, 0.016)
ax.grid(ls=':', lw=0.5, alpha=0.9)
ax.set_title('FWD convolution time per itertaion')
ax.set_ylabel('time (s)')
plt.show()

In [None]:
.
iters