# generate_error_table

## imports

In [2]:
import pandas as pd

## load data

In [14]:
df0 = pd.read_csv('data/benchmarks_20230122.csv')
df1 = pd.read_csv('data/benchmarks_20230126.csv')
df1 = df1.fillna('TPC-DS')
df = pd.concat([df0, df1], axis=0)

In [15]:
df

Unnamed: 0,benchmark,histogram,query,operator_count,operator_type,real_input,real_output,estimated_input,estimated_output
0,TPC-H,EquiHeightHistogram,TPC-H 01,0,TableScan,60175,59775,60175.0,60148.539062
1,TPC-H,EquiHeightHistogram,TPC-H 02,0,TableScan,2000,27,2000.0,41.111111
2,TPC-H,EquiHeightHistogram,TPC-H 02,1,JoinHashLeft,8000,36,8000.0,16.444445
3,TPC-H,EquiHeightHistogram,TPC-H 02,2,JoinHashLeft,100,12,100.0,20.000002
4,TPC-H,EquiHeightHistogram,TPC-H 02,3,JoinHashLeft,100,12,100.0,20.000002
...,...,...,...,...,...,...,...,...,...
2950,TPC-DS,EqualDistinctCountHistogram,97,2,TableScan,65535,366,65535.0,330.983582
2951,TPC-DS,EqualDistinctCountHistogram,99,0,JoinHashRight,6,286801,6.0,6403.269043
2952,TPC-DS,EqualDistinctCountHistogram,99,1,JoinHashRight,20,286984,20.0,6435.052734
2953,TPC-DS,EqualDistinctCountHistogram,99,2,JoinHashLeft,1441548,288032,1441548.0,6499.243164


In [16]:
print(str(len(df.loc[df['real_output'] == 0])) + " Rows have been dropped because the 'real_output' cell was zero.")
df = df.drop(df[df.real_output == 0].index)
print(str(len(df.loc[df['estimated_output'] == 0])) + " Rows have been dropped because the 'estimated_output' cell was zero.")
df = df.drop(df[df.estimated_output == 0].index)

81 Rows have been dropped because the 'real_output' cell was zero.
245 Rows have been dropped because the 'estimated_output' cell was zero.


## generate errors

In [17]:
def generate_errors(df):
    df['absolute_error'] = df['estimated_output'] - df['real_output']
    df['relative_error'] = df['absolute_error'] / df['real_output']
    df['x'] = df['estimated_output'] / df['real_output']
    df['1/x'] = 1 / df['x']
    df['q_error'] = df[['x','1/x']].max(axis=1)
    df['real_output_lower_bound'] = df['real_output']
    df['real_output_lower_bound'] = df['real_output_lower_bound'].clip(1)
    df['estimated_output_lower_bound'] = df['estimated_output']
    df['estimated_output_lower_bound'] = df['estimated_output_lower_bound'].clip(1)
    df['pseudo_x'] = df['estimated_output_lower_bound'] / df['real_output_lower_bound']
    df['pseudo_1/x'] = 1 / df['pseudo_x']
    df['pseudo_q_error'] = df[['pseudo_x','pseudo_1/x']].max(axis=1)
    df.drop(['x', '1/x', 'real_output_lower_bound', 'estimated_output_lower_bound', 'pseudo_x', 'pseudo_1/x'], axis=1, inplace = True)

    
def generate_MSE(df):
    mse = 0
    row_count = 0
    for index, row in df.iterrows():
        mse += row['absolute_error'] ** 2
        row_count += 1
    return mse / row_count

def generate_mean_q_error(df):
    mse = 0
    row_count = 0
    for index, row in df.iterrows():
        mse += row['q_error']
        row_count += 1
    return mse / row_count

def generate_mean_pseudo_q_error(df):
    mse = 0
    row_count = 0
    for index, row in df.iterrows():
        mse += row['pseudo_q_error']
        row_count += 1
    return mse / row_count

def generate_mean_relative_error(df):
    mse = 0
    row_count = 0
    for index, row in df.iterrows():
        mse += abs(row['relative_error'])
        row_count += 1
    return mse / row_count

def generate_mean_error(df):
    mse = 0
    row_count = 0
    for index, row in df.iterrows():
        mse += abs(row['absolute_error'])
        row_count += 1
    return mse / row_count

def generate_error_table(input_df):
    dfs_by_hist = [x for _, x in input_df.groupby(input_df['histogram'])]

    df_list = []
    for x in dfs_by_hist:
        y = [x for _, x in x.groupby(x['benchmark'])]
        df_list.append(y)
    
    for i in range(len(df_list)):
        for j in range(len(df_list[i])):
            y = [x for _, x in df_list[i][j].groupby(df_list[i][j]['operator_type'])]
            df_list[i][j] = y
    
    df = pd.DataFrame(columns=['histogram','benchmark','operator_type','mean_squared_error','mean_q_error','mean_pseudo_q_error','mean_relative_error','mean_error'])
    z = 0
    for i in range(len(df_list)):
        for j in range(len(df_list[i])):
            for k in range(len(df_list[i][j])):
                generate_errors(df_list[i][j][k])
                
                histogram = df_list[i][j][k]['histogram'].iloc[0]
                benchmark = df_list[i][j][k]['benchmark'].iloc[0]
                operator = df_list[i][j][k]['operator_type'].iloc[0]
                mean_squared_error = generate_MSE(df_list[i][j][k])
                mean_q_error = generate_mean_q_error(df_list[i][j][k])
                mean_pseudo_q_error = generate_mean_pseudo_q_error(df_list[i][j][k])
                mean_relative_error = generate_mean_relative_error(df_list[i][j][k])
                mean_error = generate_mean_error(df_list[i][j][k])
                
                df.loc[z] = [histogram,benchmark,operator,mean_squared_error,mean_q_error,mean_pseudo_q_error,mean_relative_error,mean_error]
                z += 1
    return df

## result

In [20]:
error_table = generate_error_table(df)

In [21]:
error_table.to_csv('error_table.zip', index=False)

In [22]:
error_table

Unnamed: 0,histogram,benchmark,operator_type,mean_squared_error,mean_q_error,mean_pseudo_q_error,mean_relative_error,mean_error
0,EqualDistinctCountHistogram,JoinOrder,JoinHashLeft,355957400000.0,4602399.0,44233.757563,633.385627,179794.160648
1,EqualDistinctCountHistogram,JoinOrder,JoinHashRight,29463580000.0,8956434.0,28963.162955,4331.366296,95714.541396
2,EqualDistinctCountHistogram,JoinOrder,TableScan,37503810000.0,2067.076,2067.033323,2.605403,52569.547552
3,EqualDistinctCountHistogram,TPC-DS,Aggregate,323604100.0,1636.364,1636.363636,1635.363636,17989.0
4,EqualDistinctCountHistogram,TPC-DS,JoinHashLeft,28496290000.0,608339200.0,9363.020742,34.061996,90790.890757
5,EqualDistinctCountHistogram,TPC-DS,JoinHashRight,110398600000.0,18339.74,205.2546,9.943153,166164.953563
6,EqualDistinctCountHistogram,TPC-DS,TableScan,4345338.0,3259.252,4.344585,0.093344,236.928597
7,EqualDistinctCountHistogram,TPC-H,Aggregate,2040781000.0,4.011667,4.011667,3.011667,45175.0
8,EqualDistinctCountHistogram,TPC-H,JoinHashLeft,2335999.0,1093383.0,71.54957,0.58496,753.076322
9,EqualDistinctCountHistogram,TPC-H,JoinHashRight,2291725.0,616.713,616.713036,615.709392,946.00791
