# generate_error_table

## imports

In [50]:
import pandas as pd

## load data

In [51]:
df = pd.read_csv('../benchmarks_20230122.csv')

In [52]:
print(str(len(df.loc[df['real_output'] == 0])) + " Rows have been dropped because the 'real_output' cell was zero.")
df = df.drop(df[df.real_output == 0].index)
print(str(len(df.loc[df['estimated_output'] == 0])) + " Rows have been dropped because the 'estimated_output' cell was zero.")
df = df.drop(df[df.estimated_output == 0].index)

46 Rows have been dropped because the 'real_output' cell was zero.
73 Rows have been dropped because the 'estimated_output' cell was zero.


## generate errors

In [53]:
def generate_errors(df):
    df['absolute_error'] = df['estimated_output'] - df['real_output']
    df['relative_error'] = df['absolute_error'] / df['real_output']
    df['x'] = df['estimated_output'] / df['real_output']
    df['1/x'] = 1 / df['x']
    df['q_error'] = df[['x','1/x']].max(axis=1)
    df['real_output_lower_bound'] = df['real_output']
    df['real_output_lower_bound'] = df['real_output_lower_bound'].clip(1)
    df['estimated_output_lower_bound'] = df['estimated_output']
    df['estimated_output_lower_bound'] = df['estimated_output_lower_bound'].clip(1)
    df['pseudo_x'] = df['estimated_output_lower_bound'] / df['real_output_lower_bound']
    df['pseudo_1/x'] = 1 / df['pseudo_x']
    df['pseudo_q_error'] = df[['pseudo_x','pseudo_1/x']].max(axis=1)
    df.drop(['x', '1/x', 'real_output_lower_bound', 'estimated_output_lower_bound', 'pseudo_x', 'pseudo_1/x'], axis=1, inplace = True)

    
def generate_MSE(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['absolute_error'] ** 2
    return mse / df.size

def generate_mean_q_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['q_error']
    return mse / df.size

def generate_mean_pseudo_q_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['pseudo_q_error']
    return mse / df.size

def generate_mean_relative_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += abs(row['relative_error'])
    return mse / df.size

def generate_mean_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += abs(row['absolute_error'])
    return mse / df.size

def generate_error_table(input_df):
    dfs_by_hist = [x for _, x in input_df.groupby(input_df['histogram'])]

    df_list = []
    for x in dfs_by_hist:
        y = [x for _, x in x.groupby(x['benchmark'])]
        df_list.append(y)
    
    for i in range(len(df_list)):
        for j in range(len(df_list[i])):
            y = [x for _, x in df_list[i][j].groupby(df_list[i][j]['operator_type'])]
            df_list[i][j] = y
    
    df = pd.DataFrame(columns=['histogram','benchmark','operator_type','mean_squared_error','mean_q_error','mean_pseudo_q_error','mean_relative_error','mean_error'])
    z = 0
    for i in range(len(df_list)):
        for j in range(len(df_list[i])):
            for k in range(len(df_list[i][j])):
                generate_errors(df_list[i][j][k])
                
                histogram = df_list[i][j][k]['histogram'].iloc[0]
                benchmark = df_list[i][j][k]['benchmark'].iloc[0]
                operator = df_list[i][j][k]['operator_type'].iloc[0]
                mean_squared_error = generate_MSE(df_list[i][j][k])
                mean_q_error = generate_mean_q_error(df_list[i][j][k])
                mean_pseudo_q_error = generate_mean_pseudo_q_error(df_list[i][j][k])
                mean_relative_error = generate_mean_relative_error(df_list[i][j][k])
                mean_error = generate_mean_error(df_list[i][j][k])
                
                df.loc[z] = [histogram,benchmark,operator,mean_squared_error,mean_q_error,mean_pseudo_q_error,mean_relative_error,mean_error]
                z += 1
    return df

## result

In [56]:
generate_error_table(df)

Unnamed: 0,histogram,benchmark,operator_type,mean_squared_error,mean_q_error,mean_pseudo_q_error,mean_relative_error,mean_error
0,EqualDistinctCountHistogram,JoinOrder,JoinHashLeft,27104280000.0,322465.779035,3402.384755,44.17253,13954.634636
1,EqualDistinctCountHistogram,JoinOrder,JoinHashRight,2092143000.0,635959.80906,2056.563008,307.553593,6814.221285
2,EqualDistinctCountHistogram,JoinOrder,TableScan,2754386000.0,151.798365,151.795225,0.191414,3867.80787
3,EqualDistinctCountHistogram,TPC-H,Aggregate,156983100.0,0.30859,0.30859,0.231667,3475.0
4,EqualDistinctCountHistogram,TPC-H,JoinHashLeft,179692.2,84106.396008,5.503813,0.044997,57.928948
5,EqualDistinctCountHistogram,TPC-H,JoinHashRight,176286.5,47.439464,47.439464,47.362261,72.769839
6,EqualDistinctCountHistogram,TPC-H,TableScan,1295433.0,23273.501734,1.614192,0.027537,79.442187
7,EquiHeightHistogram,TPC-H,Aggregate,156983100.0,0.30859,0.30859,0.231667,3475.0
8,EquiHeightHistogram,TPC-H,JoinHashLeft,192303.9,144311.985179,5.585295,0.044174,58.178205
9,EquiHeightHistogram,TPC-H,JoinHashRight,77420.59,25.80354,25.80354,25.726255,47.167289
