In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import glob
import os

%matplotlib inline

# Build Time Evaluation

In [None]:
# Collect every build-time CSV below the measurement directory. The list is
# sorted because glob returns matches in arbitrary order, which would make the
# row order of the concatenated frame differ between machines/runs.
build_time_paths = sorted(glob.glob("data/20230310/**/*build_times*", recursive=True))
if not build_time_paths:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" further down.
    raise FileNotFoundError(
        "No build time files matching 'data/20230310/**/*build_times*' were found "
        f"(working directory: {os.getcwd()})"
    )

build_time_dataframes = []

for path in build_time_paths:
    # The benchmark name is the part of the file name before the first "-".
    # os.path.basename handles the platform's path separator, unlike split("/").
    benchmark_name = os.path.basename(path).split("-")[0]
    frame = pd.read_csv(path)
    # Tag every row with the benchmark it was read from (as the first column).
    frame.insert(0, column="BENCHMARK_NAME", value=benchmark_name)
    build_time_dataframes.append(frame)

build_times = pd.concat(build_time_dataframes)
# Rows describing string columns are excluded from the evaluation.
build_times = build_times.query('COLUMN_DATA_TYPE != "string"')

## Summed Build Time per Benchmark

In [None]:
def sum_grouping(df: pd.DataFrame) -> pd.DataFrame:
    """Sum the numeric columns of ``df`` per benchmark, data type and histogram.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``BENCHMARK_NAME``, ``COLUMN_DATA_TYPE``,
        ``HISTOGRAM_NAME`` and ``COLUMN_ID``. ``df`` itself is not modified.

    Returns
    -------
    pd.DataFrame
        One row per (BENCHMARK_NAME, COLUMN_DATA_TYPE, HISTOGRAM_NAME) group,
        indexed by those three keys, holding the per-group sums.
    """
    group_keys = ["BENCHMARK_NAME", "COLUMN_DATA_TYPE", "HISTOGRAM_NAME"]
    # COLUMN_ID is removed before aggregating so it does not show up as a
    # summed column (presumably an identifier -- verify against the CSV schema).
    without_ids = df.drop(columns=["COLUMN_ID"])
    # numeric_only=True is explicit because pandas 2.0 changed the default to
    # False, which makes sum() string-concatenate any remaining text column.
    return without_ids.groupby(group_keys).sum(numeric_only=True)

# Aggregate the loaded build times; `summed_bts` is reused by the plotting cell further down.
summed_bts = sum_grouping(build_times)
display(summed_bts)

## Average Build Time per Benchmark

In [None]:
def avg_grouping(df: pd.DataFrame) -> pd.DataFrame:
    """Average the numeric columns of ``df`` per benchmark, data type and histogram.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``BENCHMARK_NAME``, ``COLUMN_DATA_TYPE``,
        ``HISTOGRAM_NAME`` and ``COLUMN_ID``. ``df`` itself is not modified.

    Returns
    -------
    pd.DataFrame
        One row per (BENCHMARK_NAME, COLUMN_DATA_TYPE, HISTOGRAM_NAME) group,
        indexed by those three keys, holding the per-group means.
    """
    group_keys = ["BENCHMARK_NAME", "COLUMN_DATA_TYPE", "HISTOGRAM_NAME"]
    # COLUMN_ID is removed before aggregating so it does not show up as an
    # averaged column (presumably an identifier -- verify against the CSV schema).
    without_ids = df.drop(columns=["COLUMN_ID"])
    # numeric_only=True is explicit because pandas 2.0 changed the default to
    # False, which makes mean() raise TypeError on any remaining text column.
    return without_ids.groupby(group_keys).mean(numeric_only=True)

# Show the per-group averages computed by avg_grouping.
display(avg_grouping(build_times))

## Total Build Times

In [None]:
# Flatten the group keys of the summed table back into ordinary columns so the
# plotting code can filter on them and hand them to seaborn by name.
summed_bts_df = summed_bts.reset_index()


def _plot_build_times(bts, type_label):
    """Draw one bar chart of BUILD_TIME per HISTOGRAM_NAME for the rows in ``bts``.

    ``type_label`` is only used in the x-axis label, e.g. ``"floats"``.
    """
    # Theme, context and figure size are set in ONE call. Calling sns.set()
    # after sns.set_context('paper') re-applies seaborn's default "notebook"
    # context and silently discards the 'paper' setting.
    sns.set(context='paper', rc={'figure.figsize': (15, 5)})
    ax = sns.barplot(x='HISTOGRAM_NAME', y='BUILD_TIME', data=bts,
                     palette='Blues', edgecolor='w')
    # NOTE(review): the label states seconds; the unit of BUILD_TIME is not
    # visible in this notebook -- confirm against the benchmark output.
    ax.set(xlabel=f'Histogram Name ({type_label})', ylabel='Build Time in s')
    plt.show()


def visualize_benchmark(benchmark_name):
    """Plot the summed build times of one benchmark, one chart per data type.

    Reads the module-level ``summed_bts_df``, keeps the rows whose
    ``BENCHMARK_NAME`` equals ``benchmark_name`` and shows a bar chart for the
    "float" rows followed by one for the "int" rows. A data type without rows
    is skipped, so nothing is drawn for a benchmark name with no data.

    Parameters
    ----------
    benchmark_name : str
        Value to match against the ``BENCHMARK_NAME`` column, e.g. ``"tpch"``.
    """
    bts = summed_bts_df[summed_bts_df["BENCHMARK_NAME"] == benchmark_name]

    # (value in COLUMN_DATA_TYPE, wording used in the axis label)
    for data_type, type_label in (("float", "floats"), ("int", "ints")):
        bts_of_type = bts[bts["COLUMN_DATA_TYPE"] == data_type]
        if not bts_of_type.empty:
            _plot_build_times(bts_of_type, type_label)

### JOB Build Times

In [None]:
# Plot the summed build times of the rows whose BENCHMARK_NAME is "job".
visualize_benchmark("job")

### TPCH Build Times

In [None]:
# Plot the summed build times of the rows whose BENCHMARK_NAME is "tpch".
visualize_benchmark("tpch")

### TPCC Build Times

In [None]:
# Plot the summed build times of the rows whose BENCHMARK_NAME is "tpcc".
visualize_benchmark("tpcc")

### TPCDS Build Times

In [None]:
# Plot the summed build times of the rows whose BENCHMARK_NAME is "tpcds".
visualize_benchmark("tpcds")