# generate_error_table

## imports

In [1]:
import pandas as pd
import numpy as np

## load data

In [2]:
df0 = pd.read_csv('data/20230221/equaldistinctcount_20230221.csv')
df1 = pd.read_csv('data/20230221/equiheight_20230221.csv')
df2 = pd.read_csv('data/20230221/equiwidth_20230221.csv')
df3 = pd.read_csv('data/20230221/gdy_20230221.csv')
df4 = pd.read_csv('data/20230221/maxdiff_20230221.csv')
df5 = pd.read_csv('data/20230221/maxdiff_area_20230224.csv')
df = pd.concat([df0, df1, df2, df3, df4, df5], axis=0)
#df = df.fillna('TPC-DS')

## generate errors

In [3]:
def generate_errors(df):
    df = df.copy()  # Convenience
    
    df['absolute_error'] = np.absolute(df['estimated_output'] - df['real_output'])
    df['relative_error'] = df['absolute_error'] / df['real_output']
    df['x'] = df['estimated_output'] / df['real_output']
    df['1/x'] = 1 / df['x']
    df['q_error'] = df[['x','1/x']].max(axis=1)
    df['real_output_lower_bound'] = df['real_output']
    df['real_output_lower_bound'] = df['real_output_lower_bound'].clip(1)
    df['estimated_output_lower_bound'] = df['estimated_output']
    df['estimated_output_lower_bound'] = df['estimated_output_lower_bound'].clip(1)
    df['pseudo_x'] = df['estimated_output_lower_bound'] / df['real_output_lower_bound']
    df['pseudo_1/x'] = 1 / df['pseudo_x']
    df['pseudo_q_error'] = df[['pseudo_x','pseudo_1/x']].max(axis=1)
    df['mean_squared_error'] = df['absolute_error'] ** 2
    df.drop(['x', '1/x', 'real_output_lower_bound', 'estimated_output_lower_bound', 'pseudo_x', 'pseudo_1/x'], axis=1, inplace = True)

    return df

## result

### with_NULLs

In [4]:
df_with_errors = generate_errors(df)
df_with_errors

df_with_errors.groupby([#"operator_type",
                        "benchmark",
                        "histogram"]).agg(root_mean_squared_error=("mean_squared_error", lambda x: np.sqrt(np.mean(x))),
                                          mean_absolute_error=("absolute_error", np.mean),
                                          mean_relative_error=("relative_error", np.mean),
                                          mean_q_error=("q_error", np.mean),
                                          mean_pseudo_q_error=("pseudo_q_error", np.mean))

Unnamed: 0_level_0,Unnamed: 1_level_0,root_mean_squared_error,mean_absolute_error,mean_relative_error,mean_q_error,mean_pseudo_q_error
benchmark,histogram,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
JoinOrder,EqualDistinctCountHistogram,443698.9,116215.7,,,26587.658467
JoinOrder,EquiHeightHistogram,492314.8,118012.5,,,11669.086908
JoinOrder,EquiWidthHistogram,496165.0,118046.0,,,13069.593868
JoinOrder,GDYHistogram,495082.3,117310.8,,,12783.414794
JoinOrder,MaxDiffAreaHistogram,530054.7,127973.8,,,4670.038529
JoinOrder,MaxDiffFrequencyHistogram,505382.3,124123.2,,,4230.731778
TPC-DS,EqualDistinctCountHistogram,1412218.0,626585.4,,,20084.507124
TPC-DS,EquiHeightHistogram,1414673.0,630285.4,,,21770.434396
TPC-DS,EquiWidthHistogram,1475394.0,653617.7,,,15006.035533
TPC-DS,GDYHistogram,1405704.0,585854.8,44.320481,,49762.728413


### without_NULLs

In [5]:
size_before = len(df)
df = df.drop(df[df.real_output == 0].index)
print(f"Removed {len(df)-size_before} ({(size_before-len(df))/size_before:.2%}) rows because the 'real_output' cell was zero.")
df = df.drop(df[df.estimated_output == 0].index)
print(f"Removed {len(df)-size_before} ({(size_before-len(df))/size_before:.2%}) rows because the 'estimated_output' cell was zero.")

df_with_errors = generate_errors(df)

df_with_errors.groupby([#"operator_type",
                        "benchmark",
                        "histogram"]).agg(root_mean_squared_error=("mean_squared_error", lambda x: np.sqrt(np.mean(x))),
                                          mean_absolute_error=("absolute_error", np.mean),
                                          mean_relative_error=("relative_error", np.mean),
                                          mean_q_error=("q_error", np.mean),
                                          mean_pseudo_q_error=("pseudo_q_error", np.mean))

Removed -462 (5.39%) rows because the 'real_output' cell was zero.
Removed -2083 (24.31%) rows because the 'estimated_output' cell was zero.


Unnamed: 0_level_0,Unnamed: 1_level_0,root_mean_squared_error,mean_absolute_error,mean_relative_error,mean_q_error,mean_pseudo_q_error
benchmark,histogram,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
JoinOrder,EqualDistinctCountHistogram,494846.3,137772.3,438.844636,2451476.0,28265.144584
JoinOrder,EquiHeightHistogram,526863.3,128400.2,659.518982,843687.6,9959.464885
JoinOrder,EquiWidthHistogram,527725.3,130775.0,567.085534,269601.4,11437.045995
JoinOrder,GDYHistogram,523929.2,130003.7,401.986024,551486.7,11108.026489
JoinOrder,MaxDiffAreaHistogram,556411.8,135696.8,912.092186,6316.699,2618.663338
JoinOrder,MaxDiffFrequencyHistogram,531239.5,132689.9,648.78356,5674.955,2157.215473
TPC-DS,EqualDistinctCountHistogram,1426989.0,610947.5,66.348747,4168682000.0,17222.294263
TPC-DS,EquiHeightHistogram,1402221.0,626068.1,56.721853,1979620000.0,15821.815514
TPC-DS,EquiWidthHistogram,1482705.0,681255.6,22.892598,702867100.0,9466.658465
TPC-DS,GDYHistogram,1391202.0,576422.9,48.145364,3804304000.0,14102.476583


## compare_operators

In [15]:
grouped = df.groupby(["benchmark", "histogram", "query", "operator_type"]).size()

queries = df[["benchmark", "query"]].drop_duplicates()

# super inefficient, doesn't matter for now
op_counts = {}
for _, outer_benchmark, outer_query in queries.itertuples():
    for (_1, benchmark, histogram, query, operator_type, count) in grouped.reset_index().itertuples():
        if outer_benchmark == benchmark and outer_query == query:
            if (benchmark, query, operator_type) not in op_counts:
                op_counts[(benchmark, query, operator_type)] = count
                continue

            if op_counts[(benchmark, query, operator_type)] != count:
                print(f"WARNING: different value of {count} (previously {op_counts[(benchmark, query, operator_type)]}) for {benchmark}, query {query}, op {operator_type}")


