# generate_error_table

## imports

In [2]:
import pandas as pd

## load data

In [17]:
equaldistinctcount_df = pd.read_csv('data/tpch_s001_r1_equaldistinctcount.csv')
equiheight_df = pd.read_csv('data/tpch_s001_r1_equiheight.csv')
maxdiff_df = pd.read_csv('data/tpch_s001_r1_maxdifffr.csv')
equiwidth_df = pd.read_csv('data/tpch_s001_r1_equiwidth.csv')

In [42]:
equaldistinctcount_df = equaldistinctcount_df.drop(equaldistinctcount_df[equaldistinctcount_df.real_output == 0].index)
equiheight_df = equiheight_df.drop(equiheight_df[equiheight_df.real_output == 0].index)
maxdiff_df = maxdiff_df.drop(maxdiff_df[maxdiff_df.real_output == 0].index)
equiwidth_df = equiwidth_df.drop(equiwidth_df[equiwidth_df.real_output == 0].index)

## generate errors

In [43]:
def generate_errors(df):
    df['absolute_error'] = df['estimated_output'] - df['real_output']
    df['relative_error'] = df['absolute_error'] / df['real_output']
    df['x'] = df['estimated_output'] / df['real_output']
    df['1/x'] = 1 / df['x']
    df['q_error'] = df[['x','1/x']].max(axis=1)
    df.drop(['x', '1/x'], axis=1, inplace = True)

    
def generate_MSE(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['absolute_error'] ** 2
    return mse / df.size

def generate_mean_q_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['q_error']
    return mse / df.size

def generate_mean_relative_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += abs(row['relative_error'])
    return mse / df.size

def generate_mean_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += abs(row['absolute_error'])
    return mse / df.size

In [44]:
generate_errors(equaldistinctcount_df)
generate_errors(equiheight_df)
generate_errors(maxdiff_df)
generate_errors(equiwidth_df)

In [45]:
equiwidth_df

Unnamed: 0,name,operator_type,real_input,real_output,estimated_input,estimated_output,absolute_error,relative_error,q_error
0,TPC-H 016,TableScan,60175,59775,60175.0,60148.539062,373.539062,0.006249,1.006249
1,TPC-H 0216,TableScan,2000,27,2000.0,41.799999,14.799999,0.548148,1.548148
2,TPC-H 0217,JoinHashLeft,8000,36,8000.0,16.719999,-19.280001,-0.535556,2.153110
3,TPC-H 0218,JoinHashLeft,100,12,100.0,20.000000,8.000000,0.666667,1.666667
4,TPC-H 0219,JoinHashLeft,100,12,100.0,20.000000,8.000000,0.666667,1.666667
...,...,...,...,...,...,...,...,...,...
77,TPC-H 2119,JoinHashLeft,100,4,100.0,4.000000,0.000000,0.000000,1.000000
78,TPC-H 2121,JoinHashLeft,60175,2447,60175.0,2406.999756,-40.000244,-0.016347,1.016618
79,TPC-H 2124,TableScan,25,1,25.0,1.000000,0.000000,0.000000,1.000000
80,TPC-H 225,JoinHashRight,15000,59,15000.0,1500.000000,1441.000000,24.423729,25.423729


In [46]:
data = {'histrogram_type': ['equaldistinctcount', 'equiheight', 'maxdiff', 'equiwidth'],
        'mean_squared_error': [generate_MSE(equaldistinctcount_df), generate_MSE(equiheight_df), generate_MSE(maxdiff_df), generate_MSE(equiwidth_df)],
        'mean_q_error': [generate_mean_q_error(equaldistinctcount_df), generate_mean_q_error(equiheight_df), generate_mean_q_error(maxdiff_df), generate_mean_q_error(equiwidth_df)],
        'mean_absolute_error': [generate_mean_error(equaldistinctcount_df), generate_mean_error(equiheight_df), generate_mean_error(maxdiff_df), generate_mean_error(equiwidth_df)],
        'mean_relative_error': [generate_mean_relative_error(equaldistinctcount_df), generate_mean_relative_error(equiheight_df), generate_mean_relative_error(maxdiff_df), generate_mean_relative_error(equiwidth_df)],
        }

In [47]:
error_table = pd.DataFrame(data)

In [48]:
error_table

Unnamed: 0,histrogram_type,mean_squared_error,mean_q_error,mean_absolute_error,mean_relative_error
0,equaldistinctcount,3782863.0,78350.207227,160.280196,4.439981
1,equiheight,3112453.0,124088.076803,125.835588,2.947213
2,maxdiff,3220780.0,136710.414181,140.534353,3.633085
3,equiwidth,3778389.0,122497.543596,155.89028,2.910251
