# generate_error_table

## imports

In [1]:
import pandas as pd

## load data

In [6]:
df = pd.read_csv('data/test.csv')

In [7]:
print(str(len(df.loc[df['real_output'] == 0])) + " Rows have been dropped because the 'real_output' cell was zero.")
df = df.drop(df[df.real_output == 0].index)

4 Rows have been dropped because the 'real_output' cell was zero.


## generate errors

In [8]:
def generate_errors(df):
    df['absolute_error'] = df['estimated_output'] - df['real_output']
    df['relative_error'] = df['absolute_error'] / df['real_output']
    df['x'] = df['estimated_output'] / df['real_output']
    df['1/x'] = 1 / df['x']
    df['q_error'] = df[['x','1/x']].max(axis=1)
    df['real_output_lower_bound'] = df['real_output']
    df['real_output_lower_bound'] = df['real_output_lower_bound'].clip(1)
    df['estimated_output_lower_bound'] = df['estimated_output']
    df['estimated_output_lower_bound'] = df['estimated_output_lower_bound'].clip(1)
    df['pseudo_x'] = df['estimated_output_lower_bound'] / df['real_output_lower_bound']
    df['pseudo_1/x'] = 1 / df['pseudo_x']
    df['pseudo_q_error'] = df[['pseudo_x','pseudo_1/x']].max(axis=1)
    df.drop(['x', '1/x', 'real_output_lower_bound', 'estimated_output_lower_bound', 'pseudo_x', 'pseudo_1/x'], axis=1, inplace = True)

    
def generate_MSE(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['absolute_error'] ** 2
    return mse / df.size

def generate_mean_q_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['q_error']
    return mse / df.size

def generate_mean_pseudo_q_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['pseudo_q_error']
    return mse / df.size

def generate_mean_relative_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += abs(row['relative_error'])
    return mse / df.size

def generate_mean_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += abs(row['absolute_error'])
    return mse / df.size

In [10]:
generate_errors(df)

In [126]:
data = {'histrogram_type': ['equaldistinctcount', 'equiheight', 'maxdiff', 'equiwidth'],
        'mean_squared_error': [generate_MSE(equaldistinctcount_df), generate_MSE(equiheight_df), generate_MSE(maxdiff_df), generate_MSE(equiwidth_df)],
        'mean_q_error': [generate_mean_q_error(equaldistinctcount_df), generate_mean_q_error(equiheight_df), generate_mean_q_error(maxdiff_df), generate_mean_q_error(equiwidth_df)],
        'mean_pseudo_q_error': [generate_mean_pseudo_q_error(equaldistinctcount_df), generate_mean_pseudo_q_error(equiheight_df), generate_mean_pseudo_q_error(maxdiff_df), generate_mean_pseudo_q_error(equiwidth_df)],
        'mean_absolute_error': [generate_mean_error(equaldistinctcount_df), generate_mean_error(equiheight_df), generate_mean_error(maxdiff_df), generate_mean_error(equiwidth_df)],
        'mean_relative_error': [generate_mean_relative_error(equaldistinctcount_df), generate_mean_relative_error(equiheight_df), generate_mean_relative_error(maxdiff_df), generate_mean_relative_error(equiwidth_df)],
        }

In [127]:
error_table = pd.DataFrame(data)

In [128]:
error_table

Unnamed: 0,histrogram_type,mean_squared_error,mean_q_error,mean_pseudo_q_error,mean_absolute_error,mean_relative_error
0,equaldistinctcount,3404577.0,70515.186504,8.618197,144.252177,3.995983
1,equiheight,2801208.0,111679.269122,7.259702,113.252029,2.652492
2,maxdiff,2898702.0,123039.372763,7.87709,126.480917,3.269776
3,equiwidth,3400550.0,110247.789236,7.168596,140.301252,2.619226


In [12]:
df

Unnamed: 0,benchmark,histogram,query,operator_count,operator_type,real_input,real_output,estimated_input,estimated_output,absolute_error,relative_error,q_error,pseudo_q_error
0,TPC-H,EquiHeightHistogram,TPC-H 01,0,TableScan,60175,59775,60175.0,60148.539062,373.539062,0.006249,1.006249,1.006249
1,TPC-H,EquiHeightHistogram,TPC-H 02,0,TableScan,2000,27,2000.0,41.111111,14.111111,0.522634,1.522634,1.522634
2,TPC-H,EquiHeightHistogram,TPC-H 02,1,JoinHashLeft,8000,36,8000.0,16.444445,-19.555555,-0.543210,2.189189,2.189189
3,TPC-H,EquiHeightHistogram,TPC-H 02,2,JoinHashLeft,100,12,100.0,20.000002,8.000002,0.666667,1.666667,1.666667
4,TPC-H,EquiHeightHistogram,TPC-H 02,3,JoinHashLeft,100,12,100.0,20.000002,8.000002,0.666667,1.666667,1.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,JoinOrder,EquiHeightHistogram,1a,0,JoinHashLeft,2528312,105,2528312.0,659.288025,554.288025,5.278934,6.278934,6.278934
82,JoinOrder,EquiHeightHistogram,1a,1,JoinHashLeft,1380035,250,1380035.0,12212.704102,11962.704102,47.850816,48.850816,48.850816
83,JoinOrder,EquiHeightHistogram,1a,2,TableScan,113,1,113.0,1.000000,0.000000,0.000000,1.000000,1.000000
84,JoinOrder,EquiHeightHistogram,1a,3,JoinHashLeft,2609129,8309,2609129.0,29301.695312,20992.695312,2.526501,3.526501,3.526501


In [15]:
df1, df2 = [x for _, x in df.groupby(df['benchmark'])]

In [24]:
dfs = [x for _, x in df2.groupby(df['operator_type'])]

In [26]:
len(dfs)

4