In [24]:
import pandas as pd
import numpy as np
import json
import traceback, sys
from tqdm import tqdm

In [25]:
# Constants
IGNORE_TIME = 15.9e9  # Time in nanoseconds to ignore

# Helper Functions
def filter_data_after_time(df, time_column):
    """Filter the DataFrame to ignore rows before a specified time."""
    return df[df[time_column] > IGNORE_TIME]

In [26]:
BASE_PATH = "/Users/shamweelmohammed/Desktop/Masters/Research/trace/analysis/output/june_7/inference_after_model_load/"

In [27]:
REPORTS_PATH = "/Users/shamweelmohammed/Desktop/Masters/Research/trace/apr19/reports_inference_profile_llama_apr19_3_bsz_1/"

# Data

In [28]:
cuda_api_trace_csv_path = REPORTS_PATH + "cuda_api_trace.csv"
df_cuda_api_trace = pd.read_csv(cuda_api_trace_csv_path)

In [29]:
df_cuda_api_trace

Unnamed: 0,Start (ns),Duration (ns),Name,Result,CorrID,Pid,Tid,T-Pri,Thread Name
0,3340891939,2470,cuModuleGetLoadingMode,0,1,734,734,20,python
1,3381961391,2258288,cudaGetDeviceProperties_v2_v12000,0,127,734,734,20,python
2,9240404820,15439,cudaStreamIsCapturing_v10000,0,137,734,734,20,python
3,9240428409,556777,cudaMalloc,0,138,734,734,20,python
4,9240428649,556357,cudaMalloc,0,138,734,734,20,python
...,...,...,...,...,...,...,...,...,...
179034,67363811896,3800,cudaStreamSynchronize,0,1075793,734,734,20,python
179035,67363812036,3580,cudaStreamSynchronize,0,1075793,734,734,20,python
179036,67363992815,12210,cudaDeviceSynchronize,0,1075798,734,734,20,python
179037,67363992895,12040,cudaDeviceSynchronize,0,1075798,734,734,20,python


In [30]:
cuda_gpu_trace_csv_path = REPORTS_PATH + "cuda_gpu_trace.csv"
df_cuda_gpu_trace = pd.read_csv(cuda_gpu_trace_csv_path)

In [31]:
# Drop "Device" column
df_cuda_gpu_trace = df_cuda_gpu_trace.drop(columns=["Device"])

df_cuda_gpu_trace.columns

Index(['Start (ns)', 'Duration (ns)', 'CorrId', 'GrdX', 'GrdY', 'GrdZ', 'BlkX',
       'BlkY', 'BlkZ', 'Reg/Trd', 'StcSMem (MB)', 'DymSMem (MB)', 'Bytes (MB)',
       'Throughput (MB/s)', 'SrcMemKd', 'DstMemKd', 'Ctx', 'GreenCtx', 'Strm',
       'Name'],
      dtype='object')

In [32]:
df_cuda_gpu_trace[200:450]

Unnamed: 0,Start (ns),Duration (ns),CorrId,GrdX,GrdY,GrdZ,BlkX,BlkY,BlkZ,Reg/Trd,StcSMem (MB),DymSMem (MB),Bytes (MB),Throughput (MB/s),SrcMemKd,DstMemKd,Ctx,GreenCtx,Strm,Name
200,10963874588,6850323,5306,,,,,,,,,,90.178,13075.743,Pageable,Device,1,,7,[CUDA memcpy Host-to-Device]
201,11001394375,6796659,5315,,,,,,,,,,90.178,13256.098,Pageable,Device,1,,7,[CUDA memcpy Host-to-Device]
202,11013875947,2592,5324,,,,,,,,,,0.008,3160.490,Pageable,Device,1,,7,[CUDA memcpy Host-to-Device]
203,11014172076,2688,5333,,,,,,,,,,0.008,3047.612,Pageable,Device,1,,7,[CUDA memcpy Host-to-Device]
204,11027801523,3324169,5342,,,,,,,,,,33.554,10066.330,Pageable,Device,1,,7,[CUDA memcpy Host-to-Device]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,16428832589,6176,7585,128.0,1.0,1.0,128.0,1.0,1.0,16.0,0.000,0.000,,,,,1,,7,"void at::native::elementwise_kernel<(int)128, ..."
446,16429379055,7456,7598,64.0,1.0,1.0,128.0,1.0,1.0,32.0,0.000,0.000,,,,,1,,7,void at::native::unrolled_elementwise_kernel<a...
447,16429933136,7296,7609,64.0,1.0,1.0,128.0,1.0,1.0,16.0,0.000,0.000,,,,,1,,7,"void at::native::elementwise_kernel<(int)128, ..."
448,16684915911,47424,9088,64.0,1.0,1.0,128.0,1.0,1.0,146.0,0.049,0.049,,,,,1,,7,ampere_fp16_s16816gemm_fp16_64x64_sliced1x2_ld...


In [33]:
cuda_kernel_exec_trace_csv_path = REPORTS_PATH + "cuda_kern_exec_trace.csv"
df_cuda_kernel_exec_trace = pd.read_csv(cuda_kernel_exec_trace_csv_path)

In [34]:
df_cuda_kernel_exec_trace

Unnamed: 0,API Start (ns),API Dur (ns),Queue Start (ns),Queue Dur (ns),Kernel Start (ns),Kernel Dur (ns),Total Dur (ns),PID,TID,DevId,API Function,GridXYZ,BlockXYZ,Kernel Name
0,9246594615,8518743,9.255113e+09,376.0,9255113734,8896,8528015,734,734,0,cudaLaunchKernel,4096 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...
1,9246594695,8518493,9.255113e+09,546.0,9255113734,8896,8527935,734,734,0,cudaLaunchKernel,4096 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...
2,9255722164,13590,9.255736e+09,2622.0,9255738376,8864,25076,734,734,0,cudaLaunchKernel,4096 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...
3,9255722284,13360,9.255736e+09,2732.0,9255738376,8864,24956,734,734,0,cudaLaunchKernel,4096 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...
4,9260371969,13560,9.260386e+09,2013.0,9260387542,2976,18549,734,734,0,cudaLaunchKernel,8 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166043,67362472893,11900,6.736248e+10,1601.0,67362486394,3712,17213,734,734,0,cudaLaunchKernel,1 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...
166044,67362754532,10630,6.736277e+10,1969.0,67362767131,3616,16215,734,734,0,cudaLaunchKernel,1 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...
166045,67362754632,10430,6.736277e+10,2069.0,67362767131,3616,16115,734,734,0,cudaLaunchKernel,1 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...
166046,67363072620,11840,6.736308e+10,1872.0,67363086332,3808,17520,734,734,0,cudaLaunchKernel,1 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...


In [35]:
# Get unique Kernel Names
df_unique_kernel_names = pd.read_csv(REPORTS_PATH + "unique_kernel_names.csv")
unique_kernel_names = df_unique_kernel_names['kernel_name'].tolist()

In [36]:
# Function to map long names to short names
def map_kernel_name(long_name):
    for short_name in unique_kernel_names:
        if short_name in long_name:
            return short_name
    return long_name

In [37]:
# Get short names
df_cuda_kernel_exec_trace['Short Kernel Name'] = df_cuda_kernel_exec_trace['Kernel Name'].apply(map_kernel_name)

In [38]:
df_cuda_kernel_exec_trace

Unnamed: 0,API Start (ns),API Dur (ns),Queue Start (ns),Queue Dur (ns),Kernel Start (ns),Kernel Dur (ns),Total Dur (ns),PID,TID,DevId,API Function,GridXYZ,BlockXYZ,Kernel Name,Short Kernel Name
0,9246594615,8518743,9.255113e+09,376.0,9255113734,8896,8528015,734,734,0,cudaLaunchKernel,4096 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...,vectorized_elementwise_kernel
1,9246594695,8518493,9.255113e+09,546.0,9255113734,8896,8527935,734,734,0,cudaLaunchKernel,4096 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...,vectorized_elementwise_kernel
2,9255722164,13590,9.255736e+09,2622.0,9255738376,8864,25076,734,734,0,cudaLaunchKernel,4096 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...,vectorized_elementwise_kernel
3,9255722284,13360,9.255736e+09,2732.0,9255738376,8864,24956,734,734,0,cudaLaunchKernel,4096 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...,vectorized_elementwise_kernel
4,9260371969,13560,9.260386e+09,2013.0,9260387542,2976,18549,734,734,0,cudaLaunchKernel,8 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...,vectorized_elementwise_kernel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166043,67362472893,11900,6.736248e+10,1601.0,67362486394,3712,17213,734,734,0,cudaLaunchKernel,1 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...,vectorized_elementwise_kernel
166044,67362754532,10630,6.736277e+10,1969.0,67362767131,3616,16215,734,734,0,cudaLaunchKernel,1 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...,vectorized_elementwise_kernel
166045,67362754632,10430,6.736277e+10,2069.0,67362767131,3616,16115,734,734,0,cudaLaunchKernel,1 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...,vectorized_elementwise_kernel
166046,67363072620,11840,6.736308e+10,1872.0,67363086332,3808,17520,734,734,0,cudaLaunchKernel,1 1 1,128 1 1,void at::native::vectorized_elementwise_kernel...,vectorized_elementwise_kernel


In [42]:
# Unique CUDA API Calls
unique_cuda_api_calls = df_cuda_api_trace['Name'].unique()
# Convert to DataFrame
df_unique_cuda_api_calls = pd.DataFrame(unique_cuda_api_calls, columns=["Name"])
df_unique_cuda_api_calls

Unnamed: 0,Name
0,cuModuleGetLoadingMode
1,cudaGetDeviceProperties_v2_v12000
2,cudaStreamIsCapturing_v10000
3,cudaMalloc
4,cudaLaunchKernel
5,cudaMemcpyAsync
6,cudaStreamSynchronize
7,cudaProfilerStart
8,cudaDeviceSynchronize
9,cuGetProcAddress_v2


In [43]:
# Unique short Kernel Names
# Shorten Kernel Names
df_api_kernel_mapping['Short Kernel Name'] = df_api_kernel_mapping['Kernels'].apply(map_kernel_name)
unique_short_kernel_names = df_api_kernel_mapping['Short Kernel Name'].unique()
# Convert to DataFrame
df_unique_short_kernel_names = pd.DataFrame(unique_short_kernel_names, columns=["Short Kernel Name"])
df_unique_short_kernel_names

Unnamed: 0,Short Kernel Name
0,vectorized_elementwise_kernel
1,elementwise_kernel_with_index
2,unrolled_elementwise_kernel
3,elementwise_kernel
4,indexSelectSmallIndex
5,triu_tril_kernel
6,CatArrayBatchedCopy
7,reduce_kernel
8,ampere_fp16_s16816gemm_fp16_64x64_sliced1x2_ld...
9,Kernel


# Map CUDA APIs with Kernel and Memory Operation

In [44]:
# Initialize lists to hold the mappings
api_kernel_mapping = []
api_memory_mapping = []

# Iterate through each API call
for _, api_row in tqdm(df_cuda_api_trace.iterrows(), total=df_cuda_api_trace.shape[0], desc="Processing API calls"):
    api_corr_id = api_row['CorrID']
    api_name = api_row['Name']
    
    # Find kernels associated with this API call from gpu trace where Bytes (MB) is null
    mapped_kernels = df_cuda_gpu_trace[(df_cuda_gpu_trace['CorrId'] == api_corr_id) & (df_cuda_gpu_trace['Bytes (MB)'].isnull())]['Name'].tolist()
    if mapped_kernels:
        api_kernel_mapping.append({
            'CUDA API': api_name,
            'Kernels': mapped_kernels
        })
    
    # Find memory operations associated with this API call from gpu trace where Bytes (MB) is not null
    mapped_memory_ops = df_cuda_gpu_trace[(df_cuda_gpu_trace['CorrId'] == api_corr_id) & (df_cuda_gpu_trace['Bytes (MB)'].notnull())]['Name'].tolist()
    if mapped_memory_ops:
        api_memory_mapping.append({
            'CUDA API': api_name,
            'Memory Operations': mapped_memory_ops
        })

# Convert mappings to DataFrames
df_api_kernel_mapping = pd.DataFrame(api_kernel_mapping)
df_api_memory_mapping = pd.DataFrame(api_memory_mapping)

Processing API calls: 100%|██████████| 179039/179039 [01:05<00:00, 2729.69it/s]


In [45]:
df_api_kernel_mapping

Unnamed: 0,CUDA API,Kernels
0,cudaLaunchKernel,[void at::native::vectorized_elementwise_kerne...
1,cudaLaunchKernel,[void at::native::vectorized_elementwise_kerne...
2,cudaLaunchKernel,[void at::native::vectorized_elementwise_kerne...
3,cudaLaunchKernel,[void at::native::vectorized_elementwise_kerne...
4,cudaLaunchKernel,[void at::native::vectorized_elementwise_kerne...
...,...,...
166043,cudaLaunchKernel,[void at::native::vectorized_elementwise_kerne...
166044,cudaLaunchKernel,[void at::native::vectorized_elementwise_kerne...
166045,cudaLaunchKernel,[void at::native::vectorized_elementwise_kerne...
166046,cudaLaunchKernel,[void at::native::vectorized_elementwise_kerne...


In [46]:
# Explode Kernels and remove duplicates
df_api_kernel_mapping_exploded = df_api_kernel_mapping.explode('Kernels').drop_duplicates()
df_api_kernel_mapping_exploded

Unnamed: 0,CUDA API,Kernels
0,cudaLaunchKernel,void at::native::vectorized_elementwise_kernel...
258,cudaLaunchKernel,void <unnamed>::elementwise_kernel_with_index<...
260,cudaLaunchKernel,void at::native::unrolled_elementwise_kernel<a...
262,cudaLaunchKernel,void at::native::vectorized_elementwise_kernel...
264,cudaLaunchKernel,void at::native::vectorized_elementwise_kernel...
...,...,...
41860,cudaLaunchKernel,"void gemv2T_kernel_val<long, long, __half, __h..."
41870,cudaLaunchKernel,void cutlass::Kernel<cutlass_80_wmma_tensorop_...
65158,cudaLaunchKernel,"void <unnamed>::softmax_warp_forward<float, fl..."
106560,cudaLaunchKernel,"void gemv2T_kernel_val<int, int, __half, __hal..."


In [47]:
# Drop duplicates
df_api_memory_mapping_exploded = df_api_memory_mapping.explode('Memory Operations').drop_duplicates()
df_api_memory_mapping_exploded

Unnamed: 0,CUDA API,Memory Operations
0,cudaMemcpyAsync,[CUDA memcpy Host-to-Device]
584,cudaMemcpyAsync,[CUDA memcpy Device-to-Device]
718,cudaMemsetAsync,[CUDA memset]
730,cudaMemcpyAsync,[CUDA memcpy Device-to-Host]


In [48]:
df_api_memory_mapping

Unnamed: 0,CUDA API,Memory Operations
0,cudaMemcpyAsync,[[CUDA memcpy Host-to-Device]]
1,cudaMemcpyAsync,[[CUDA memcpy Host-to-Device]]
2,cudaMemcpyAsync,[[CUDA memcpy Host-to-Device]]
3,cudaMemcpyAsync,[[CUDA memcpy Host-to-Device]]
4,cudaMemcpyAsync,[[CUDA memcpy Host-to-Device]]
...,...,...
10185,cudaMemcpyAsync,[[CUDA memcpy Device-to-Device]]
10186,cudaMemcpyAsync,[[CUDA memcpy Device-to-Host]]
10187,cudaMemcpyAsync,[[CUDA memcpy Device-to-Host]]
10188,cudaMemcpyAsync,[[CUDA memcpy Device-to-Host]]


In [49]:
# Explode Memory Operations and remove duplicates
df_api_memory_mapping_exploded = df_api_memory_mapping.explode('Memory Operations').drop_duplicates()
df_api_memory_mapping_exploded

Unnamed: 0,CUDA API,Memory Operations
0,cudaMemcpyAsync,[CUDA memcpy Host-to-Device]
584,cudaMemcpyAsync,[CUDA memcpy Device-to-Device]
718,cudaMemsetAsync,[CUDA memset]
730,cudaMemcpyAsync,[CUDA memcpy Device-to-Host]
