# generate_error_table

## imports

In [8]:
import pandas as pd
import numpy as np

## load data

In [9]:
df0 = pd.read_csv('data/benchmarks_20230122.csv')
df1 = pd.read_csv('data/benchmarks_20230126.csv')
df2 = pd.read_csv('data/equiwidth.csv')
df1 = df1.fillna('TPC-DS')
df = pd.concat([df0, df1, df2], axis=0)

## generate errors

In [10]:
def generate_errors(df):
    df = df.copy()  # Convenience
    
    df['absolute_error'] = np.absolute(df['estimated_output'] - df['real_output'])
    df['relative_error'] = df['absolute_error'] / df['real_output']
    df['x'] = df['estimated_output'] / df['real_output']
    df['1/x'] = 1 / df['x']
    df['q_error'] = df[['x','1/x']].max(axis=1)
    df['real_output_lower_bound'] = df['real_output']
    df['real_output_lower_bound'] = df['real_output_lower_bound'].clip(1)
    df['estimated_output_lower_bound'] = df['estimated_output']
    df['estimated_output_lower_bound'] = df['estimated_output_lower_bound'].clip(1)
    df['pseudo_x'] = df['estimated_output_lower_bound'] / df['real_output_lower_bound']
    df['pseudo_1/x'] = 1 / df['pseudo_x']
    df['pseudo_q_error'] = df[['pseudo_x','pseudo_1/x']].max(axis=1)
    df['mean_squared_error'] = df['absolute_error'] ** 2
    df.drop(['x', '1/x', 'real_output_lower_bound', 'estimated_output_lower_bound', 'pseudo_x', 'pseudo_1/x'], axis=1, inplace = True)

    return df

## result

### with_NULLs

In [11]:
df_with_errors = generate_errors(df)
df_with_errors

df_with_errors.groupby(["operator_type",
                        "benchmark",
                        "histogram"]).agg(root_mean_squared_error=("mean_squared_error", lambda x: np.sqrt(np.mean(x))),
                                          mean_absolute_error=("absolute_error", np.mean),
                                          mean_relative_error=("relative_error", np.mean),
                                          mean_q_error=("q_error", np.mean),
                                          mean_pseudo_q_error=("pseudo_q_error", np.mean))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,root_mean_squared_error,mean_absolute_error,mean_relative_error,mean_q_error,mean_pseudo_q_error
operator_type,benchmark,histogram,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aggregate,TPC-DS,EqualDistinctCountHistogram,17989.0,17989.0,1635.363636,1636.363636,1636.363636
Aggregate,TPC-DS,EquiHeightHistogram,17989.0,17989.0,1635.363636,1636.363636,1636.363636
Aggregate,TPC-DS,MaxDiffFrequencyHistogram,17989.0,17989.0,1635.363636,1636.363636,1636.363636
Aggregate,TPC-H,EqualDistinctCountHistogram,45175.0,45175.0,3.011667,4.011667,4.011667
Aggregate,TPC-H,EquiHeightHistogram,45175.0,45175.0,3.011667,4.011667,4.011667
Aggregate,TPC-H,EquiWidthHistogram,45175.0,45175.0,3.011667,4.011667,4.011667
Aggregate,TPC-H,MaxDiffFrequencyHistogram,45175.0,45175.0,3.011667,4.011667,4.011667
JoinHashLeft,JoinOrder,EqualDistinctCountHistogram,559261.272399,162581.962918,,,40920.73491
JoinHashLeft,JoinOrder,EquiHeightHistogram,612721.251615,173276.985421,,,15323.767842
JoinHashLeft,JoinOrder,EquiWidthHistogram,616046.39898,170267.952291,,,16996.235882


### without_NULLs

In [12]:
size_before = len(df)
df = df.drop(df[df.real_output == 0].index)
print(f"Removed {len(df)-size_before} ({(size_before-len(df))/size_before:.2%}) rows because the 'real_output' cell was zero.")
df = df.drop(df[df.estimated_output == 0].index)
print(f"Removed {len(df)-size_before} ({(size_before-len(df))/size_before:.2%}) rows because the 'estimated_output' cell was zero.")

df_with_errors = generate_errors(df)

df_with_errors.groupby(["operator_type",
                        "benchmark",
                        "histogram"]).agg(root_mean_squared_error=("mean_squared_error", lambda x: np.sqrt(np.mean(x))),
                                          mean_absolute_error=("absolute_error", np.mean),
                                          mean_relative_error=("relative_error", np.mean),
                                          mean_q_error=("q_error", np.mean),
                                          mean_pseudo_q_error=("pseudo_q_error", np.mean))

Removed -228 (4.16%) rows because the 'real_output' cell was zero.
Removed -723 (13.20%) rows because the 'estimated_output' cell was zero.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,root_mean_squared_error,mean_absolute_error,mean_relative_error,mean_q_error,mean_pseudo_q_error
operator_type,benchmark,histogram,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aggregate,TPC-DS,EqualDistinctCountHistogram,17989.0,17989.0,1635.363636,1636.364,1636.363636
Aggregate,TPC-DS,EquiHeightHistogram,17989.0,17989.0,1635.363636,1636.364,1636.363636
Aggregate,TPC-DS,MaxDiffFrequencyHistogram,17989.0,17989.0,1635.363636,1636.364,1636.363636
Aggregate,TPC-H,EqualDistinctCountHistogram,45175.0,45175.0,3.011667,4.011667,4.011667
Aggregate,TPC-H,EquiHeightHistogram,45175.0,45175.0,3.011667,4.011667,4.011667
Aggregate,TPC-H,EquiWidthHistogram,45175.0,45175.0,3.011667,4.011667,4.011667
Aggregate,TPC-H,MaxDiffFrequencyHistogram,45175.0,45175.0,3.011667,4.011667,4.011667
JoinHashLeft,JoinOrder,EqualDistinctCountHistogram,604033.347385,184272.025162,649.196682,4702886.0,45327.061463
JoinHashLeft,JoinOrder,EquiHeightHistogram,660365.180598,197711.034349,976.213789,1286775.0,16840.319092
JoinHashLeft,JoinOrder,EquiWidthHistogram,668380.944022,195942.46284,745.444806,398384.3,18989.319958


## compare_operators

In [14]:
grouped = df.groupby(["benchmark", "histogram", "query", "operator_type"]).size()

queries = df[["benchmark", "query"]].drop_duplicates()

# super inefficient, doesn't matter for now
op_counts = {}
for _, outer_benchmark, outer_query in queries.itertuples():
    for (_1, benchmark, histogram, query, operator_type, count) in grouped.reset_index().itertuples():
        if outer_benchmark == benchmark and outer_query == query:
            if (benchmark, query, operator_type) not in op_counts:
                op_counts[(benchmark, query, operator_type)] = count
                continue

            if op_counts[(benchmark, query, operator_type)] != count:
                print(f"WARNING: different value of {count} (previously {op_counts[(benchmark, query, operator_type)]}) for {benchmark}, query {query}, op {operator_type}")


