In [18]:
import pandas as pd
import os,natsort


In [19]:
# Base directory of the e2e benchmark, plus the two CSV output paths that
# live directly inside it.
e2e_dir = "/mydata/profiler_benchmark/e2e/"
raw_e2e_file, normalized_e2e_file = (
    os.path.join(e2e_dir, name) for name in ("raw_e2e.csv", "normalized_e2e.csv")
)

In [20]:
# Recursively walk e2e_dir and gather the full path of every file whose name
# ends in ".log". The list is still called csv_files because each log is
# later loaded with pd.read_csv.
csv_files = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(e2e_dir)
    for file in files
    if file.endswith(".log")
]


In [21]:
# Display the collected log paths so the set of discovered runs can be checked by eye.
csv_files

['/mydata/profiler_benchmark/e2e/custom_log_b1024_gpu4.log',
 '/mydata/profiler_benchmark/e2e/pytorch_profiler_imagenet_subset_b512_gpu1.log',
 '/mydata/profiler_benchmark/e2e/austin_imagenet_subset_b512_gpu1.log',
 '/mydata/profiler_benchmark/e2e/no_profiler_imagenet_subset_b512_gpu1.log',
 '/mydata/profiler_benchmark/e2e/scalene_b512_gpu1.log',
 '/mydata/profiler_benchmark/e2e/no_profiler_b1024_gpu4.log',
 '/mydata/profiler_benchmark/e2e/custom_log_imagenet_subset_b512_gpu1.log',
 '/mydata/profiler_benchmark/e2e/no_profiler_b512_gpu1.log',
 '/mydata/profiler_benchmark/e2e/py-spy_b512_gpu1.log',
 '/mydata/profiler_benchmark/e2e/custom_log_b512_gpu1.log',
 '/mydata/profiler_benchmark/e2e/py-spy_b1024_gpu4.log']

In [22]:
# Parse every collected log with pd.read_csv, tag its rows with the base name
# of the file they came from, and stack the results into a single frame with
# a fresh 0..n-1 index.
per_file_frames = [
    pd.read_csv(path).assign(file_name=os.path.basename(path))
    for path in csv_files
]
df = pd.concat(per_file_frames, ignore_index=True)


In [23]:
# Write the combined frame to raw_e2e_file; index=False keeps the row index
# out of the CSV.
df.to_csv(path_or_buf=raw_e2e_file, index=False)

In [24]:
# Reorder the rows by natural sort of file_name (natsort), then renumber the
# index from 0 so that positions and labels agree again.
df = (
    df.iloc[natsort.index_natsorted(df['file_name'])]
    .reset_index(drop=True)
)

In [25]:
# Index label of the first row whose file_name starts with "no_profiler".
# If several rows match, only the first one in the current row order is used.
no_profiler_index = df.index[df['file_name'].str.startswith("no_profiler")][0]

In [26]:
# Express every column except "file_name" as a ratio to the baseline row
# (the row at position no_profiler_index). The result is a plain ratio, so the
# baseline row itself becomes 1.0; no percentage scaling or offset is applied.
# NOTE(review): all rows, whatever run configuration their file name encodes,
# are divided by this single baseline row - confirm that is intended.

# Work on a copy so df keeps the raw measurements.
df_norm = df.copy()

# Snapshot the baseline values before any column is overwritten.
baseline_row = df_norm.iloc[no_profiler_index]

for col in df_norm.columns.drop("file_name"):
    df_norm[col] = df_norm[col] / baseline_row[col]

        

In [27]:
# Write the normalized frame to normalized_e2e_file; index=False keeps the
# row index out of the CSV.
df_norm.to_csv(path_or_buf=normalized_e2e_file, index=False)

In [28]:
# head(len(...)) asks for as many rows as the frame has, i.e. the whole normalized frame.
df_norm.head(len(df_norm))

Unnamed: 0,wall(s),user(s),kernel(s),max_rss(KB),file_name
0,0.019781,0.029214,0.137278,0.963792,austin_imagenet_subset_b512_gpu1.log
1,1.001416,1.011553,1.06218,1.00976,custom_log_b512_gpu1.log
2,0.247356,1.048468,1.30517,3.059655,custom_log_b1024_gpu4.log
3,0.019541,0.02246,0.025107,0.96342,custom_log_imagenet_subset_b512_gpu1.log
4,1.0,1.0,1.0,1.0,no_profiler_b512_gpu1.log
5,0.244783,1.038044,1.314219,3.060932,no_profiler_b1024_gpu4.log
6,0.01915,0.022121,0.023485,0.962314,no_profiler_imagenet_subset_b512_gpu1.log
7,1.085003,1.310344,5.404364,0.992813,py-spy_b512_gpu1.log
8,0.309684,0.262709,2.998675,0.149951,py-spy_b1024_gpu4.log
9,0.035696,0.026644,0.042059,,pytorch_profiler_imagenet_subset_b512_gpu1.log


In [29]:
# Show the full raw (un-normalized) frame for comparison with the ratios.
df.head(len(df))

Unnamed: 0,wall(s),user(s),kernel(s),max_rss(KB),file_name
0,186.85,240.7,92.24,4539876.0,austin_imagenet_subset_b512_gpu1.log
1,9459.55,8334.3,713.7,4756408.0,custom_log_b512_gpu1.log
2,2336.57,8638.44,876.97,14412296.0,custom_log_b1024_gpu4.log
3,184.59,185.05,16.87,4538124.0,custom_log_imagenet_subset_b512_gpu1.log
4,9446.17,8239.11,671.92,4710432.0,no_profiler_b512_gpu1.log
5,2312.26,8552.56,883.05,14418312.0,no_profiler_b1024_gpu4.log
6,180.89,182.26,15.78,4532916.0,no_profiler_imagenet_subset_b512_gpu1.log
7,10249.12,10796.07,3631.3,4676580.0,py-spy_b512_gpu1.log
8,2925.33,2164.49,2014.87,706332.0,py-spy_b1024_gpu4.log
9,337.19,219.52,28.26,,pytorch_profiler_imagenet_subset_b512_gpu1.log


In [30]:
# Restrict to the rows whose file_name contains "imagenet_subset" and
# renumber the index from 0.
df_imagenet = df.loc[
    lambda frame: frame['file_name'].str.contains("imagenet_subset")
].reset_index(drop=True)

In [31]:
# Show every row of the imagenet_subset selection.
df_imagenet.head(len(df_imagenet))

Unnamed: 0,wall(s),user(s),kernel(s),max_rss(KB),file_name
0,186.85,240.7,92.24,4539876.0,austin_imagenet_subset_b512_gpu1.log
1,184.59,185.05,16.87,4538124.0,custom_log_imagenet_subset_b512_gpu1.log
2,180.89,182.26,15.78,4532916.0,no_profiler_imagenet_subset_b512_gpu1.log
3,337.19,219.52,28.26,,pytorch_profiler_imagenet_subset_b512_gpu1.log


In [32]:
# Index label of the first df_imagenet row whose file_name starts with
# "no_profiler". This rebinds no_profiler_index, replacing the value that was
# computed earlier for the full frame.
no_profiler_index = df_imagenet.index[
    df_imagenet['file_name'].str.startswith("no_profiler")
][0]

In [33]:
# Express every column except "file_name" as a ratio to the baseline row of
# df_imagenet (the row at position no_profiler_index). The result is a plain
# ratio, so the baseline row itself becomes 1.0; no percentage scaling or
# offset is applied.

# Work on a copy so df_imagenet keeps the raw measurements.
df_imagenet_norm = df_imagenet.copy()

# Snapshot the baseline values before any column is overwritten.
imagenet_baseline_row = df_imagenet_norm.iloc[no_profiler_index]

for col in df_imagenet_norm.columns.drop("file_name"):
    df_imagenet_norm[col] = df_imagenet_norm[col] / imagenet_baseline_row[col]

        

In [34]:
# Show every row of the normalized imagenet_subset frame.
df_imagenet_norm.head(len(df_imagenet_norm))

Unnamed: 0,wall(s),user(s),kernel(s),max_rss(KB),file_name
0,1.032948,1.320641,5.845374,1.001535,austin_imagenet_subset_b512_gpu1.log
1,1.020454,1.015308,1.069075,1.001149,custom_log_imagenet_subset_b512_gpu1.log
2,1.0,1.0,1.0,1.0,no_profiler_imagenet_subset_b512_gpu1.log
3,1.864061,1.204433,1.790875,,pytorch_profiler_imagenet_subset_b512_gpu1.log
