# generate_error_table

## imports

In [82]:
import pandas as pd

## load data

In [83]:
df = pd.read_csv('data/test.csv')

In [84]:
print(str(len(df.loc[df['real_output'] == 0])) + " Rows have been dropped because the 'real_output' cell was zero.")
df = df.drop(df[df.real_output == 0].index)

4 Rows have been dropped because the 'real_output' cell was zero.


## generate errors

In [98]:
def generate_errors(df):
    df['absolute_error'] = df['estimated_output'] - df['real_output']
    df['relative_error'] = df['absolute_error'] / df['real_output']
    df['x'] = df['estimated_output'] / df['real_output']
    df['1/x'] = 1 / df['x']
    df['q_error'] = df[['x','1/x']].max(axis=1)
    df['real_output_lower_bound'] = df['real_output']
    df['real_output_lower_bound'] = df['real_output_lower_bound'].clip(1)
    df['estimated_output_lower_bound'] = df['estimated_output']
    df['estimated_output_lower_bound'] = df['estimated_output_lower_bound'].clip(1)
    df['pseudo_x'] = df['estimated_output_lower_bound'] / df['real_output_lower_bound']
    df['pseudo_1/x'] = 1 / df['pseudo_x']
    df['pseudo_q_error'] = df[['pseudo_x','pseudo_1/x']].max(axis=1)
    df.drop(['x', '1/x', 'real_output_lower_bound', 'estimated_output_lower_bound', 'pseudo_x', 'pseudo_1/x'], axis=1, inplace = True)

    
def generate_MSE(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['absolute_error'] ** 2
    return mse / df.size

def generate_mean_q_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['q_error']
    return mse / df.size

def generate_mean_pseudo_q_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += row['pseudo_q_error']
    return mse / df.size

def generate_mean_relative_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += abs(row['relative_error'])
    return mse / df.size

def generate_mean_error(df):
    mse = 0
    for index, row in df.iterrows():
        mse += abs(row['absolute_error'])
    return mse / df.size

def generate_error_table(dfs):
    df = pd.DataFrame(columns=['histogram','benchmark','operator_type','mean_squared_error','mean_q_error','mean_pseudo_q_error','mean_relative_error','mean_error'])
    z = 0
    for i in range(len(dfs)):
        for j in range(len(dfs[i])):
            for k in range(len(dfs[i][j])):
                generate_errors(dfs[i][j][k])
                
                histogram = dfs[i][j][k]['histogram'].iloc[0]
                benchmark = dfs[i][j][k]['benchmark'].iloc[0]
                operator = dfs[i][j][k]['operator_type'].iloc[0]
                mean_squared_error = generate_MSE(dfs[i][j][k])
                mean_q_error = generate_mean_q_error(dfs[i][j][k])
                mean_pseudo_q_error = generate_mean_pseudo_q_error(dfs[i][j][k])
                mean_relative_error = generate_mean_relative_error(dfs[i][j][k])
                mean_error = generate_mean_error(dfs[i][j][k])
                
                df.loc[z] = [histogram,benchmark,operator,mean_squared_error,mean_q_error,mean_pseudo_q_error,mean_relative_error,mean_error]
                z += 1
    return df

In [86]:
dfs_by_hist = [x for _, x in df.groupby(df['histogram'])]

In [87]:
len(dfs_by_hist)

2

In [88]:
df_list = []
for x in dfs_by_hist:
    y = [x for _, x in x.groupby(df['benchmark'])]
    df_list.append(y)

In [89]:
df_list[0][0]

Unnamed: 0,benchmark,histogram,query,operator_count,operator_type,real_input,real_output,estimated_input,estimated_output
81,JoinOrder,EquiHeightHistogram,1a,0,JoinHashLeft,2528312,105,2528312.0,659.288025
82,JoinOrder,EquiHeightHistogram,1a,1,JoinHashLeft,1380035,250,1380035.0,12212.704102
83,JoinOrder,EquiHeightHistogram,1a,2,TableScan,113,1,113.0,1.0
84,JoinOrder,EquiHeightHistogram,1a,3,JoinHashLeft,2609129,8309,2609129.0,29301.695312
85,JoinOrder,EquiHeightHistogram,1a,4,TableScan,4,1,4.0,1.0


In [90]:
for i in range(len(df_list)):
    for j in range(len(df_list[i])):
        y = [x for _, x in df_list[i][j].groupby(df['operator_type'])]
        df_list[i][j] = y

In [91]:
df_list[0][0][0]

Unnamed: 0,benchmark,histogram,query,operator_count,operator_type,real_input,real_output,estimated_input,estimated_output
81,JoinOrder,EquiHeightHistogram,1a,0,JoinHashLeft,2528312,105,2528312.0,659.288025
82,JoinOrder,EquiHeightHistogram,1a,1,JoinHashLeft,1380035,250,1380035.0,12212.704102
84,JoinOrder,EquiHeightHistogram,1a,3,JoinHashLeft,2609129,8309,2609129.0,29301.695312


In [99]:

dffff = generate_error_table(df_list)

In [100]:
dffff

Unnamed: 0,histogram,benchmark,operator_type,mean_squared_error,mean_q_error,mean_pseudo_q_error,mean_relative_error,mean_error
0,EquiHeightHistogram,JoinOrder,JoinHashLeft,14977100.0,1.504006,1.504006,1.427083,859.222755
1,EquiHeightHistogram,JoinOrder,TableScan,0.0,0.076923,0.076923,0.0,0.0
2,EquiHeightHistogram,TPC-H,Aggregate,156983100.0,0.30859,0.30859,0.231667,3475.0
3,EquiHeightHistogram,TPC-H,JoinHashLeft,197111.5,147919.782885,5.723004,0.045279,59.63266
4,EquiHeightHistogram,TPC-H,JoinHashRight,77420.59,25.80354,25.80354,25.726255,47.167289
5,EquiHeightHistogram,TPC-H,TableScan,20192.76,24065.603279,1.581752,0.02444,19.472229
6,EquiWidthHistogram,TPC-H,JoinHashLeft,0.0,0.076923,0.076923,0.0,0.0
