In [1]:
import os
import statistics
from typing import Dict, Tuple
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
def _summarize_benchmark_csv(filepath):
    """Reduce one benchmark CSV to a single row of median durations.

    Reads ``filepath`` with pandas, computes the median of each of the four
    ``total_exec_duration_*`` columns, derives the overhead of instrumented
    execution with tracking over the plain main-function run (difference of
    the two medians), and returns a one-row frame with those values rounded
    to two decimals plus the ``dataset``, ``data_loading``, ``featurization``
    and ``model`` columns taken from the first row of the file.

    Raises whatever ``pd.read_csv`` raises (e.g. ``FileNotFoundError``) and
    ``KeyError`` if an expected column is missing.
    """
    runs = pd.read_csv(filepath)
    duration_columns = [
        'total_exec_duration_with_instrum_with_tracking',
        'total_exec_duration_with_instrum_without_tracking',
        'total_exec_duration_without_instrum_main_func',
        'total_exec_duration_without_instrum_load_ast_compile',
    ]
    # The scalar median is broadcast to every row; only the first row is kept below.
    for column in duration_columns:
        runs[f'median_{column}'] = runs[column].median()
    # Overhead is computed from the unrounded medians and rounded afterwards.
    runs['median_overhead_with_tracking_vs_main_func'] = (
        runs['median_total_exec_duration_with_instrum_with_tracking']
        - runs['median_total_exec_duration_without_instrum_main_func']
    )
    summary_columns = [f'median_{column}' for column in duration_columns] + [
        'median_overhead_with_tracking_vs_main_func',
        'dataset', 'data_loading', 'featurization', 'model',
    ]
    return runs[summary_columns].head(1).round(2)


def load_csv_results(results_dir=None):
    """Load every instrumentation benchmark CSV into one summary DataFrame.

    One file named
    ``results-instrumentation-{dataset}-{data_loading}-{featurization}-{model}.csv``
    is read for every combination of the tabular datasets, data loadings,
    featurizations and models, followed by the ``sneakers`` image dataset,
    which uses the fixed featurization/model name ``"image"``.

    Parameters
    ----------
    results_dir : str, optional
        Directory containing the CSV files. Defaults to
        ``<current working directory>/instrumentation-benchmark-results``.

    Returns
    -------
    pandas.DataFrame
        One row per CSV file, in iteration order (tabular datasets first,
        then the image dataset). Each row keeps the index label ``0`` of the
        first row of its source file.

    Raises
    ------
    FileNotFoundError
        If any of the expected CSV files does not exist.
    """
    if results_dir is None:
        results_dir = os.path.join(os.getcwd(), "instrumentation-benchmark-results")

    normal_datasets = ["healthcare", "folktables", "cardio", "reviews"]
    image_datasets = ["sneakers"]
    data_loadings = ["fast_loading", "slow_loading"]
    featurizations = ["featurization_0", "featurization_1", "featurization_2",
                      "featurization_3", "featurization_4"]
    models = ["logistic_regression", "xgboost", "neural_network"]

    configurations = [
        (dataset, data_loading, featurization, model)
        for dataset in normal_datasets
        for data_loading in data_loadings
        for featurization in featurizations
        for model in models
    ]
    configurations += [
        (dataset, data_loading, "image", "image")
        for dataset in image_datasets
        for data_loading in data_loadings
    ]

    # Collect the one-row summaries and concatenate once instead of growing
    # the frame with a concat per file.
    summaries = []
    for dataset, data_loading, featurization, model in configurations:
        filename = f"results-instrumentation-{dataset}-{data_loading}-{featurization}-{model}.csv"
        summaries.append(_summarize_benchmark_csv(os.path.join(results_dir, filename)))
    return pd.concat(summaries, axis=0)

In [3]:
# One row per benchmark CSV with the rounded median durations and the derived
# overhead column; reads from the current working directory.
median_results = load_csv_results()

In [4]:
# Rank the configurations from smallest to largest median tracking overhead,
# persist the ranking next to the raw benchmark results, and display it.
median_results_ordered_by_median_overhead = (
    median_results
    .sort_values(by=['median_overhead_with_tracking_vs_main_func'])
)
overview_csv_path = f"{os.getcwd()}/instrumentation-benchmark-results/instrumentation_overhead_overview_ordered_by_median_overhead.csv"
median_results_ordered_by_median_overhead.to_csv(overview_csv_path, index=True)
median_results_ordered_by_median_overhead


Unnamed: 0,median_total_exec_duration_with_instrum_with_tracking,median_total_exec_duration_with_instrum_without_tracking,median_total_exec_duration_without_instrum_main_func,median_total_exec_duration_without_instrum_load_ast_compile,median_overhead_with_tracking_vs_main_func,dataset,data_loading,featurization,model
0,4678.50,4689.94,4496.86,4495.83,181.64,cardio,fast_loading,featurization_0,neural_network
0,504.51,505.44,312.85,319.40,191.66,cardio,fast_loading,featurization_0,logistic_regression
0,591.28,589.11,392.30,402.95,198.99,cardio,slow_loading,featurization_0,logistic_regression
0,951.04,950.99,750.52,758.99,200.52,cardio,slow_loading,featurization_0,xgboost
0,875.52,875.18,674.72,683.19,200.79,cardio,fast_loading,featurization_0,xgboost
...,...,...,...,...,...,...,...,...,...
0,45388.62,45696.68,15854.24,15901.86,29534.38,reviews,slow_loading,featurization_3,neural_network
0,39003.89,39009.84,9448.42,9447.84,29555.47,reviews,slow_loading,featurization_3,logistic_regression
0,50918.54,50760.20,21316.58,21306.22,29601.96,reviews,slow_loading,featurization_3,xgboost
0,50415.60,50428.75,20797.25,20785.37,29618.34,reviews,fast_loading,featurization_3,xgboost
