In [1]:
import os;
import json;
import builtins
import duckdb
import pandas as pd;
import matplotlib.pyplot as plt
import numpy as np

def get_dataset_from_testcase(run):
    """Return ``run`` with its last two characters removed.

    The loader in this notebook passes a test-case directory name and stores
    the result as the dataset label (presumably the dropped characters are a
    run suffix -- confirm against the directory naming scheme).
    """
    trailing_chars = 2
    return run[:-trailing_chars]

# Root directory holding one sub-directory per test case.
# NOTE(review): `dir` shadows the builtin of the same name and the path is an
# absolute, machine-specific location; both are kept unchanged here so the
# cell keeps its existing names and behaviour.
dir = '/home/chesetti/Repos/learned_merge_cleanup/sponge/join_all'

# Collect every per-index stats record found in the run result files.
index_stats = []
for test_case in os.listdir(dir):
    if test_case == 'build':
        # 'build' is skipped explicitly; it is not treated as a test case.
        continue
    run_dir = os.path.join(dir, test_case, 'input_results', 'run')
    for run_name in os.listdir(run_dir):
        # Use a context manager so each result file is closed right after it
        # is parsed instead of leaking one open handle per run.
        with builtins.open(os.path.join(run_dir, run_name)) as json_file:
            run = json.load(json_file)
        if 'index_stats' not in run['result']:
            continue
        for run_index_stats in run['result']['index_stats']:
            # Tag each record with the dataset derived from the directory name.
            run_index_stats['dataset'] = get_dataset_from_testcase(test_case)
            index_stats.append(run_index_stats)

# Flatten the list of dict records into a DataFrame and show its columns.
idf = pd.json_normalize(index_stats)
print(idf.columns)

Index(['index_build_duration', 'index_load_duration', 'index_name',
       'index_size', 'dataset'],
      dtype='object')


In [2]:
# Lookup table from index/join name to its epsilon value (0 where the name
# carries no numeric suffix). Built from ordered pairs so insertion order
# matches the original listing.
epsilon = dict([
    ("pgm256", 256),
    ("pgm1024", 1024),
    ("pgm2048", 2048),
    ("flatpgm256", 256),
    ("flatpgm1024", 1024),
    ("flatpgm2048", 2048),
    ("sampledpgm256", 256),
    ("sampledpgm1024", 1024),
    ("sampledpgm2048", 2048),
    ("flatpgm4096", 4096),
    ("flatpgm8192", 8192),
    ("hashjoin", 0),
    ("btree256", 256),
    ("btree1024", 1024),
    ("btree2048", 2048),
    ("sj", 0),
    ("sj2", 0),
    ("standard_merge", 0),
])
def indexType(x):
    """Map an index name to its family label.

    Returns "SAMPLED_PGM", "FLAT_PGM", "PGM" or "BTREE" depending on which
    marker substring occurs in ``x``, and "NA" when none does.
    """
    # Checked in order: the more specific "sampledpgm"/"flatpgm" markers must
    # be tested before the bare "pgm" substring they both contain.
    families = (
        ("sampledpgm", "SAMPLED_PGM"),
        ("flatpgm", "FLAT_PGM"),
        ("pgm", "PGM"),
        ("btree", "BTREE"),
    )
    for marker, label in families:
        if marker in x:
            return label
    return "NA"
def indexEpsilon(x):
    """Return the epsilon value encoded in an index name.

    The name is searched for the substrings "256", "1024", "2048", "4096"
    and "8192" in that order, and the first match is returned as an int.
    Names that contain none of them (e.g. "hashjoin", "sj") yield 0.

    Args:
        x: index name string such as "pgm256" or "flatpgm8192".

    Returns:
        The matching epsilon as an int, or 0 if no known value is present.
    """
    # Each value maps to itself; previously "8192" was reported as 4096,
    # which merged the 8192 configuration into the 4096 bucket.
    for candidate in (256, 1024, 2048, 4096, 8192):
        if str(candidate) in x:
            return candidate
    return 0
# Derive the family label and epsilon columns from each row's index name.
index_names = idf['index_name']
idf['index_type'] = index_names.map(indexType)
idf['epsilon'] = index_names.map(indexEpsilon)

┌──────────────────────┬─────────────────────┬────────────────────┬────────────┬─────────┬─────────────┬─────────┐
│ index_build_duration │ index_load_duration │     index_name     │ index_size │ dataset │ index_type  │ epsilon │
│        int64         │        int64        │      varchar       │   int64    │ varchar │   varchar   │  int64  │
├──────────────────────┼─────────────────────┼────────────────────┼────────────┼─────────┼─────────────┼─────────┤
│             78414342 │          2007137411 │ sampledpgm256      │    3189976 │ fb      │ SAMPLED_PGM │     256 │
│             68418945 │          2073866882 │ sampledpgm1024     │     734320 │ fb      │ SAMPLED_PGM │    1024 │
│             60661553 │          2012004054 │ sampledpgm2048     │     280336 │ fb      │ SAMPLED_PGM │    2048 │
│             52541846 │          1961324404 │ sampledpgm4096     │      91328 │ fb      │ SAMPLED_PGM │    4096 │
│           7174367285 │          3812320714 │ pgm256             │    4143880 │

In [3]:
def plot_index_memory(dataset, ax):
    """Draw a grouped bar chart of index size per epsilon for one dataset.

    Queries the notebook-level ``idf`` frame through DuckDB, pivots it to one
    row per epsilon and one column per index type, and draws one bar series
    per index type on ``ax``.

    Args:
        dataset: value matched against the ``dataset`` column of ``idf``.
            It is interpolated directly into the SQL text, so only pass
            trusted dataset names.
        ax: matplotlib Axes to draw on.
    """
    rows = duckdb.sql(f"SELECT index_type, epsilon, index_size FROM idf WHERE dataset='{dataset}'").df()
    # dropna() keeps only the epsilon values for which every index type has a size.
    rows = rows.pivot(index='epsilon', values='index_size', columns='index_type').dropna()
    x = np.arange(len(rows.index))
    width = 0.15  # the width of the bars
    for position, column in enumerate(rows.columns):
        rects = ax.bar(x + width * position, rows[column], width, label=column)
        ax.bar_label(rects, padding=3)
    # Bars sit at x, x + width, ..., x + (n - 1) * width, so the middle of each
    # group is x + width * (n - 1) / 2. The previous fixed `x + width` was only
    # centred when exactly three index types were plotted.
    group_centre = width * (len(rows.columns) - 1) / 2
    ax.set_xticks(x + group_centre, rows.index.astype(str))
    ax.set_ylabel('Size(B)')
    ax.set_xlabel('Epsilon')
    ax.legend(loc='upper right', ncols=3)
    ax.set_title('Index Size')

def plot_index_build(dataset, ax):
    """Draw a grouped bar chart of index build duration per epsilon.

    Queries the notebook-level ``idf`` frame through DuckDB, pivots it to one
    row per epsilon and one column per index type, and draws one bar series
    per index type on ``ax``.

    Args:
        dataset: value matched against the ``dataset`` column of ``idf``.
            It is interpolated directly into the SQL text, so only pass
            trusted dataset names.
        ax: matplotlib Axes to draw on.
    """
    rows = duckdb.sql(f"SELECT index_type, epsilon, index_build_duration FROM idf WHERE dataset='{dataset}'").df()
    # dropna() keeps only the epsilon values for which every index type has a duration.
    rows = rows.pivot(index='epsilon', values='index_build_duration', columns='index_type').dropna()
    x = np.arange(len(rows.index))
    width = 0.25  # the width of the bars
    for position, column in enumerate(rows.columns):
        rects = ax.bar(x + width * position, rows[column], width, label=column)
        ax.bar_label(rects, padding=3)
    # Centre each tick under its bar group: bars sit at x + i * width for
    # i in 0..n-1, so the middle is x + width * (n - 1) / 2 (equal to the old
    # `x + width` only when n == 3).
    group_centre = width * (len(rows.columns) - 1) / 2
    ax.set_xticks(x + group_centre, rows.index.astype(str))
    ax.set_ylabel('Duration(ns)')
    ax.set_xlabel('Epsilon')
    # A single legend call is enough; the earlier duplicate was redundant.
    ax.legend(loc='upper right', ncols=3)
    ax.set_title('Index Build Duration')
    
def plot_for_dataset(dataset):
    """Show the index-size and build-duration charts for ``dataset`` side by side."""
    fig, axes = plt.subplots(1, 2, figsize=(20, 6))
    size_ax, build_ax = axes
    # Left panel: index size; right panel: build duration.
    plot_index_memory(dataset, size_ax)
    plot_index_build(dataset, build_ax)
    fig.suptitle(f'dataset={dataset}', fontsize=16)
    plt.show()

In [None]:
def _latex_table(frame, columns):
    """Render ``frame[columns]`` as a LaTeX table with shortened labels.

    Long labels are abbreviated first, then every remaining underscore is
    escaped for LaTeX. The replacement is written as '\\_' (an explicit
    backslash) rather than the invalid escape sequence '\_', which produces
    the same text but triggers a warning on recent Python versions.
    """
    return (
        frame[columns]
        .to_latex(float_format='%.3f')
        .replace('SAMPLED_PGM', 'PGM(S)')
        .replace('uniform_dense', 'udense')
        .replace('uniform_sparse', 'usparse')
        .replace('_', '\\_')
    )


def _scale_and_print(durations, sizes, datasets, indexes):
    """Rescale the selected (dataset, index) columns in place and print both tables.

    Durations are divided by 1e9 and sizes by 1024 * 1024 (ns -> s and
    B -> MiB, going by the axis labels used for these metrics elsewhere in
    this notebook). The size table is printed first, then the duration table.

    Returns:
        The list of (dataset, index) column keys that were processed.
    """
    selected = [(dataset, index) for dataset in datasets for index in indexes]
    for column in selected:
        durations[column] = durations[column] / 1e9
        sizes[column] = sizes[column] / (1024 * 1024)
    print(_latex_table(sizes, selected))
    print(_latex_table(durations, selected))
    return selected


# Average build duration per (index_type, epsilon, dataset), then pivot to one
# row per epsilon with (dataset, index_type) columns.
build_duration = duckdb.sql("SELECT index_type, epsilon, dataset, AVG(index_build_duration) as v FROM idf WHERE epsilon <= 4096 AND epsilon > 0 AND epsilon!=1024 GROUP BY index_type, epsilon, dataset").df()
display(build_duration)
build_duration = build_duration.pivot(index='epsilon', values='v', columns=['dataset', 'index_type'])

# Same aggregation and pivot for the index size.
index_size = duckdb.sql("SELECT index_type, AVG(index_size) as v, epsilon, dataset FROM idf WHERE epsilon <= 4096 AND epsilon > 0 AND epsilon!=1024 GROUP BY index_type, epsilon, dataset").df()
index_size = index_size.pivot(index='epsilon', values='v', columns=['dataset', 'index_type'])

indexes = ['BTREE', 'PGM', 'SAMPLED_PGM']

# First table pair: fb / osm / wiki / books.
datasets = ['fb', 'osm', 'wiki', 'books']
columns = _scale_and_print(build_duration, index_size, datasets, indexes)

# Second table pair: uniform / normal / lognormal datasets.
datasets = ['uniform_dense', 'uniform_sparse', 'normal', 'lognormal']
columns = _scale_and_print(build_duration, index_size, datasets, indexes)

Unnamed: 0,index_type,epsilon,dataset,v
0,BTREE,2048,fb,3.020990e+05
1,SAMPLED_PGM,4096,books,1.492173e+08
2,BTREE,256,osm,1.210570e+07
3,SAMPLED_PGM,256,uniform_dense,6.589758e+07
4,FLAT_PGM,256,uniform_dense,4.347776e+09
...,...,...,...,...
91,PGM,2048,uniform_dense,8.469572e+09
92,PGM,4096,lognormal,8.054811e+09
93,PGM,4096,uniform_sparse,3.975624e+09
94,SAMPLED_PGM,4096,normal,6.023696e+07


\begin{tabular}{lrrrrrrrrrrrr}
\toprule
dataset & \multicolumn{3}{r}{fb} & \multicolumn{3}{r}{osm} & \multicolumn{3}{r}{wiki} & \multicolumn{3}{r}{books} \\
index\_type & BTREE & PGM & PGM(S) & BTREE & PGM & PGM(S) & BTREE & PGM & PGM(S) & BTREE & PGM & PGM(S) \\
epsilon &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
256 & 12.043 & 3.952 & 3.042 & 48.159 & 10.484 & 8.513 & 5.450 & 0.112 & 0.107 & 48.159 & 4.101 & 2.994 \\
2048 & 1.513 & 0.283 & 0.267 & 6.023 & 1.286 & 1.239 & 0.684 & 0.025 & 0.025 & 6.023 & 0.091 & 0.087 \\
4096 & 0.754 & 0.090 & 0.087 & 3.014 & 0.650 & 0.636 & 0.346 & 0.006 & 0.006 & 3.014 & 0.023 & 0.023 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
dataset & \multicolumn{3}{r}{fb} & \multicolumn{3}{r}{osm} & \multicolumn{3}{r}{wiki} & \multicolumn{3}{r}{books} \\
index\_type & BTREE & PGM & PGM(S) & BTREE & PGM & PGM(S) & BTREE & PGM & PGM(S) & BTREE & PGM & PGM(S) \\
epsilon &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
256 & 0.002 & 7.17