# generate_error_table

## imports

In [2]:
import pandas as pd
import numpy as np

## load data

In [3]:
df0 = pd.read_csv('data/benchmarks_20230122.csv')
df1 = pd.read_csv('data/benchmarks_20230126.csv')
df2 = pd.read_csv('data/equiwidth_20230209.csv')
df3 = pd.read_csv('data/gdy_20230216.csv')
df = pd.concat([df0, df1, df2, df3], axis=0)
df = df.fillna('TPC-DS')

## generate errors

In [4]:
def generate_errors(df):
    df = df.copy()  # Convenience
    
    df['absolute_error'] = np.absolute(df['estimated_output'] - df['real_output'])
    df['relative_error'] = df['absolute_error'] / df['real_output']
    df['x'] = df['estimated_output'] / df['real_output']
    df['1/x'] = 1 / df['x']
    df['q_error'] = df[['x','1/x']].max(axis=1)
    df['real_output_lower_bound'] = df['real_output']
    df['real_output_lower_bound'] = df['real_output_lower_bound'].clip(1)
    df['estimated_output_lower_bound'] = df['estimated_output']
    df['estimated_output_lower_bound'] = df['estimated_output_lower_bound'].clip(1)
    df['pseudo_x'] = df['estimated_output_lower_bound'] / df['real_output_lower_bound']
    df['pseudo_1/x'] = 1 / df['pseudo_x']
    df['pseudo_q_error'] = df[['pseudo_x','pseudo_1/x']].max(axis=1)
    df['mean_squared_error'] = df['absolute_error'] ** 2
    df.drop(['x', '1/x', 'real_output_lower_bound', 'estimated_output_lower_bound', 'pseudo_x', 'pseudo_1/x'], axis=1, inplace = True)

    return df

## result

### with_NULLs

In [5]:
df_with_errors = generate_errors(df)
df_with_errors

df_with_errors.groupby([#"operator_type",
                        "benchmark",
                        "histogram"]).agg(root_mean_squared_error=("mean_squared_error", lambda x: np.sqrt(np.mean(x))),
                                          mean_absolute_error=("absolute_error", np.mean),
                                          mean_relative_error=("relative_error", np.mean),
                                          mean_q_error=("q_error", np.mean),
                                          mean_pseudo_q_error=("pseudo_q_error", np.mean))

Unnamed: 0_level_0,Unnamed: 1_level_0,root_mean_squared_error,mean_absolute_error,mean_relative_error,mean_q_error,mean_pseudo_q_error
benchmark,histogram,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
JoinOrder,EqualDistinctCountHistogram,443699.0,116215.9,,,26461.034006
JoinOrder,EquiHeightHistogram,492314.8,118012.5,,,11692.064
JoinOrder,EquiWidthHistogram,496165.0,118046.0,,,13092.666368
JoinOrder,GDYHistogram,492175.1,118737.3,,,12278.477865
JoinOrder,MaxDiffFrequencyHistogram,498339.4,121141.1,,,11149.412992
TPC-DS,EqualDistinctCountHistogram,162708.6,61653.72,22.099158,,4657.808599
TPC-DS,EquiHeightHistogram,159938.5,60536.33,22.099502,,4899.630984
TPC-DS,EquiWidthHistogram,166009.0,64070.25,21.618539,,4175.518419
TPC-DS,GDYHistogram,1340737.0,558798.4,,,46628.471847
TPC-DS,MaxDiffFrequencyHistogram,162554.3,61627.13,22.076917,,4618.700114


### without_NULLs

In [6]:
size_before = len(df)
df = df.drop(df[df.real_output == 0].index)
print(f"Removed {len(df)-size_before} ({(size_before-len(df))/size_before:.2%}) rows because the 'real_output' cell was zero.")
df = df.drop(df[df.estimated_output == 0].index)
print(f"Removed {len(df)-size_before} ({(size_before-len(df))/size_before:.2%}) rows because the 'estimated_output' cell was zero.")

df_with_errors = generate_errors(df)

df_with_errors.groupby([#"operator_type",
                        "benchmark",
                        "histogram"]).agg(root_mean_squared_error=("mean_squared_error", lambda x: np.sqrt(np.mean(x))),
                                          mean_absolute_error=("absolute_error", np.mean),
                                          mean_relative_error=("relative_error", np.mean),
                                          mean_q_error=("q_error", np.mean),
                                          mean_pseudo_q_error=("pseudo_q_error", np.mean))

Removed -331 (4.61%) rows because the 'real_output' cell was zero.
Removed -1233 (17.18%) rows because the 'estimated_output' cell was zero.


Unnamed: 0_level_0,Unnamed: 1_level_0,root_mean_squared_error,mean_absolute_error,mean_relative_error,mean_q_error,mean_pseudo_q_error
benchmark,histogram,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
JoinOrder,EqualDistinctCountHistogram,469486.9,126977.2,432.409415,2711881.0,27346.490249
JoinOrder,EquiHeightHistogram,519198.9,127669.7,614.64244,433838.7,10712.631358
JoinOrder,EquiWidthHistogram,521159.1,130419.0,526.37442,271035.9,12356.022324
JoinOrder,GDYHistogram,512031.2,129128.0,367.184259,300025.5,11252.152216
JoinOrder,MaxDiffFrequencyHistogram,470429.2,119913.2,595.3494,52535.62,10546.295897
TPC-DS,EqualDistinctCountHistogram,166481.2,64074.57,23.100213,281971200.0,4374.47835
TPC-DS,EquiHeightHistogram,166386.6,64734.53,23.851159,291543200.0,4478.932424
TPC-DS,EquiWidthHistogram,191586.3,79725.32,21.343049,26491.8,2471.877064
TPC-DS,GDYHistogram,1364340.0,588286.0,65.195507,4128405000.0,25520.210441
TPC-DS,MaxDiffFrequencyHistogram,165742.9,63898.49,22.916939,503843900.0,4617.988887


## compare_operators

In [6]:
grouped = df.groupby(["benchmark", "histogram", "query", "operator_type"]).size()

queries = df[["benchmark", "query"]].drop_duplicates()

# super inefficient, doesn't matter for now
op_counts = {}
for _, outer_benchmark, outer_query in queries.itertuples():
    for (_1, benchmark, histogram, query, operator_type, count) in grouped.reset_index().itertuples():
        if outer_benchmark == benchmark and outer_query == query:
            if (benchmark, query, operator_type) not in op_counts:
                op_counts[(benchmark, query, operator_type)] = count
                continue

            if op_counts[(benchmark, query, operator_type)] != count:
                print(f"WARNING: different value of {count} (previously {op_counts[(benchmark, query, operator_type)]}) for {benchmark}, query {query}, op {operator_type}")


