# generate_error_table

## imports

In [1]:
import pandas as pd
import numpy as np

## load data

In [8]:
df0 = pd.read_csv('data/benchmarks_20230122.csv')
df1 = pd.read_csv('data/benchmarks_20230126.csv')
df2 = pd.read_csv('data/equiwidth_20230209.csv')
df = pd.concat([df0, df1, df2], axis=0)
df = df.fillna('TPC-DS')

## generate errors

In [9]:
def generate_errors(df):
    df = df.copy()  # Convenience
    
    df['absolute_error'] = np.absolute(df['estimated_output'] - df['real_output'])
    df['relative_error'] = df['absolute_error'] / df['real_output']
    df['x'] = df['estimated_output'] / df['real_output']
    df['1/x'] = 1 / df['x']
    df['q_error'] = df[['x','1/x']].max(axis=1)
    df['real_output_lower_bound'] = df['real_output']
    df['real_output_lower_bound'] = df['real_output_lower_bound'].clip(1)
    df['estimated_output_lower_bound'] = df['estimated_output']
    df['estimated_output_lower_bound'] = df['estimated_output_lower_bound'].clip(1)
    df['pseudo_x'] = df['estimated_output_lower_bound'] / df['real_output_lower_bound']
    df['pseudo_1/x'] = 1 / df['pseudo_x']
    df['pseudo_q_error'] = df[['pseudo_x','pseudo_1/x']].max(axis=1)
    df['mean_squared_error'] = df['absolute_error'] ** 2
    df.drop(['x', '1/x', 'real_output_lower_bound', 'estimated_output_lower_bound', 'pseudo_x', 'pseudo_1/x'], axis=1, inplace = True)

    return df

## result

### with_NULLs

In [11]:
df_with_errors = generate_errors(df)
df_with_errors

df_with_errors.groupby([#"operator_type",
                        "benchmark",
                        "histogram"]).agg(root_mean_squared_error=("mean_squared_error", lambda x: np.sqrt(np.mean(x))),
                                          mean_absolute_error=("absolute_error", np.mean),
                                          mean_relative_error=("relative_error", np.mean),
                                          mean_q_error=("q_error", np.mean),
                                          mean_pseudo_q_error=("pseudo_q_error", np.mean))

Unnamed: 0_level_0,Unnamed: 1_level_0,root_mean_squared_error,mean_absolute_error,mean_relative_error,mean_q_error,mean_pseudo_q_error
benchmark,histogram,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
JoinOrder,EqualDistinctCountHistogram,443698.98312,116215.915151,,,26461.034006
JoinOrder,EquiHeightHistogram,492314.80092,118012.48678,,,11692.064
JoinOrder,EquiWidthHistogram,496165.013349,118046.000538,,,13092.666368
JoinOrder,MaxDiffFrequencyHistogram,498339.361284,121141.13295,,,11149.412992
TPC-DS,EqualDistinctCountHistogram,162708.557941,61653.716376,22.099158,,4657.808599
TPC-DS,EquiHeightHistogram,159938.496916,60536.332173,22.099502,,4899.630984
TPC-DS,EquiWidthHistogram,166009.014377,64070.25177,21.618539,,4175.518419
TPC-DS,MaxDiffFrequencyHistogram,162554.346247,61627.12781,22.076917,,4618.700114
TPC-H,EqualDistinctCountHistogram,9039.773392,2468.485335,,,1178.308454
TPC-H,EquiHeightHistogram,8745.382395,2185.476379,,,1177.894999


### without_NULLs

In [12]:
size_before = len(df)
df = df.drop(df[df.real_output == 0].index)
print(f"Removed {len(df)-size_before} ({(size_before-len(df))/size_before:.2%}) rows because the 'real_output' cell was zero.")
df = df.drop(df[df.estimated_output == 0].index)
print(f"Removed {len(df)-size_before} ({(size_before-len(df))/size_before:.2%}) rows because the 'estimated_output' cell was zero.")

df_with_errors = generate_errors(df)

df_with_errors.groupby([#"operator_type",
                        "benchmark",
                        "histogram"]).agg(root_mean_squared_error=("mean_squared_error", lambda x: np.sqrt(np.mean(x))),
                                          mean_absolute_error=("absolute_error", np.mean),
                                          mean_relative_error=("relative_error", np.mean),
                                          mean_q_error=("q_error", np.mean),
                                          mean_pseudo_q_error=("pseudo_q_error", np.mean))

Removed -248 (4.31%) rows because the 'real_output' cell was zero.
Removed -836 (14.52%) rows because the 'estimated_output' cell was zero.


Unnamed: 0_level_0,Unnamed: 1_level_0,root_mean_squared_error,mean_absolute_error,mean_relative_error,mean_q_error,mean_pseudo_q_error
benchmark,histogram,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
JoinOrder,EqualDistinctCountHistogram,487677.353661,135489.67552,418.578825,2687811.0,28021.583955
JoinOrder,EquiHeightHistogram,526581.363716,130868.741253,611.899674,431838.0,10664.095566
JoinOrder,EquiWidthHistogram,520019.381496,130165.147454,523.972262,476818.1,12354.624118
JoinOrder,MaxDiffFrequencyHistogram,464675.441015,120347.838956,572.079717,50509.56,10172.270333
TPC-DS,EqualDistinctCountHistogram,166481.168226,64074.571533,23.100213,281971200.0,4374.47835
TPC-DS,EquiHeightHistogram,166386.585608,64734.529561,23.851159,291543200.0,4478.932424
TPC-DS,EquiWidthHistogram,181287.694013,73882.754803,19.136653,37984.85,2455.904064
TPC-DS,MaxDiffFrequencyHistogram,165742.867396,63898.491599,22.916939,503843900.0,4617.988887
TPC-H,EqualDistinctCountHistogram,5868.921448,1437.437634,40.477231,714309.7,87.286465
TPC-H,EquiHeightHistogram,5362.748174,1162.720831,27.232247,1146574.0,74.506274


## compare_operators

In [6]:
grouped = df.groupby(["benchmark", "histogram", "query", "operator_type"]).size()

queries = df[["benchmark", "query"]].drop_duplicates()

# super inefficient, doesn't matter for now
op_counts = {}
for _, outer_benchmark, outer_query in queries.itertuples():
    for (_1, benchmark, histogram, query, operator_type, count) in grouped.reset_index().itertuples():
        if outer_benchmark == benchmark and outer_query == query:
            if (benchmark, query, operator_type) not in op_counts:
                op_counts[(benchmark, query, operator_type)] = count
                continue

            if op_counts[(benchmark, query, operator_type)] != count:
                print(f"WARNING: different value of {count} (previously {op_counts[(benchmark, query, operator_type)]}) for {benchmark}, query {query}, op {operator_type}")


