In [20]:
import os;
import json;
import builtins
import duckdb
import pandas as pd;
import matplotlib.pyplot as plt


def get_dataset_from_testcase(run):
    run = index.find("_")
    return run[0:run-1]

def get_dataset_size(dataset):
    if dataset == "string_keys":
        return 100_000_000
    if dataset == "osm" or dataset == "books":
        return 800_000_000
    else:
        return 200_000_000

runs = []
dir = '/home/chesetti/Repos/learned_merge_cleanup/sponge/merge_all'
for test_case in os.listdir(dir):
    if test_case == 'build':
        continue
    for run_name in os.listdir(os.path.join(dir, test_case, 'outputs', 'results' )):
        if run_name == 'run':
            continue
        json_file = builtins.open(os.path.join(dir, test_case, 'outputs', 'results', run_name))
        run = (json.load(json_file))
        run['dataset'] = get_dataset_from_testcase(test_case)
        run['dataset_size'] = get_dataset_size(run['dataset'])
        runs.append(run)

df = pd.json_normalize(runs)
print(df.columns)

Index(['command', 'dataset', 'dataset_size', 'result.checksum',
       'result.duration_ns', 'result.duration_sec', 'result.inner_disk_fetch',
       'result.inner_disk_fetch_size', 'result.inner_total_bytes_fetched',
       'result.num_output_keys', 'result.outer_disk_fetch',
       'result.outer_disk_fetch_size', 'result.outer_total_bytes_fetched',
       'spec.algo', 'spec.algo_name', 'spec.check_checksum', 'spec.common_key',
       'spec.inner_table', 'spec.key_size', 'spec.key_type',
       'spec.load_sstable_in_mem', 'spec.name', 'spec.num_threads',
       'spec.outer_table', 'spec.result_path', 'spec.value_size',
       'spec.write_result_to_disk', 'spec.index.epsilon',
       'spec.index.leaf_size_in_pages', 'spec.index.search',
       'spec.index.type'],
      dtype='object')


In [21]:
def get_index_type(index):
    if "btree" in index:
        return "BTREE"
    if "pgm" in index:
        return "PGM"
    return "NA"

def get_index_variant(index):
    if index == "sort_join" or index == "hash_join":
        return "NA"
    pos = index.find("_")
    return index[pos+1:-1]

df["threads"] = df["spec.num_threads"]
df["epsilon"] = df["spec.index.epsilon"]
df["duration_sec"] = df["result.duration_ns"] / (1000000000)
df["ratio"] = df["spec.common_key"]
df["thput"] = (df["result.num_output_keys"] / df["ratio"]) / (df["result.duration_ns"] / (1000000000))
df["algo"] = df["spec.algo_name"]
df["join_algo"] = df["spec.algo"]
df["index_type"] = df["spec.algo_name"].map(lambda x: get_index_type(x))
df["index_variant"] = df["spec.algo_name"].map(lambda x: get_index_variant(x))
df["inner_bytes_fetched"] = df["result.inner_total_bytes_fetched"]
display(df["algo"].unique())
display(df["dataset"].unique())
display(df["index_type"].unique())


array(['standard_merge', 'learned_merge_btree256',
       'learned_merge_sampledflatpgm256'], dtype=object)

array(['fb', 'fb_', 'books', 'osm', 'wiki'], dtype=object)

array(['NA', 'BTREE', 'PGM'], dtype=object)

In [16]:
# SingleThread, 2 way merge vs BTree
def plot_dataset_join_duration(dataset):
    rows = duckdb.sql(
        "SELECT ratio, threads, algo, MEDIAN(duration_sec) as d, MEDIAN(thput) as t FROM df " 
        "WHERE" 
        "   (algo='standard_merge' OR algo='learned_merge_sampledflatpgm256' OR algo='learned_merge_btree256')"
        "   AND threads=1"
        f"   AND dataset='{dataset}'"
        "GROUP BY dataset, ratio, threads, algo"
    ).df()
    result = (rows.pivot(index='ratio', values='d', columns=['algo']))
    ## Save to CSV
    os.makedirs('single_thread_csv_ssd_merge_latency/', exist_ok=True)
    result.to_csv(f'single_thread_csv_ssd_merge_latency/{dataset}.csv')
    return result

real_datasets = ['fb', 'wiki', 'osm', 'books']
synth_datasets = ['uniform_dense', 'uniform_sparse', 'normal', 'lognormal', 'string_keys']

for dataset in real_datasets:
    display(dataset)
    display(plot_dataset_join_duration(dataset))

for dataset in synth_datasets:
    display(dataset)
    display(plot_dataset_join_duration(dataset))

'fb'

algo,learned_merge_btree256,learned_merge_sampledflatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,54.389202,24.055351,12.942572
10,8.753012,5.828777,7.435069
100,3.840491,3.378283,6.735534
1000,3.265584,3.189829,6.647478


'wiki'

algo,learned_merge_btree256,learned_merge_sampledflatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,23.827476,10.851329,6.132952
10,6.003269,3.479737,3.990581
100,1.827955,1.498334,3.289609
1000,1.658005,1.249172,3.291058


'osm'

algo,learned_merge_sampledflatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1
1,96.327081,
10,,28.168213
100,12.550683,
1000,,25.435507


'books'

algo,learned_merge_btree256,learned_merge_sampledflatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,,99.355489,
10,35.301941,,28.60293
100,14.993839,12.290061,
1000,11.380678,11.262972,


'uniform_dense'

algo
ratio


'uniform_sparse'

algo
ratio


'normal'

algo
ratio


'lognormal'

algo
ratio


'string_keys'

algo
ratio


In [22]:
def plot_epsilon_vs_duration_200M(ratio):
    rows = duckdb.sql(
        " SELECT threads, ratio, epsilon, algo, AVG(duration_sec) as v FROM df"
        " WHERE (algo='learned_merge_sampledflatpgm256' OR algo='standard_merge' OR algo='learned_merge_btree256')"
        f" AND ratio={ratio} AND dataset='fb_' AND dataset_size=200000000 AND threads=16"
        " GROUP BY ratio, threads, epsilon, algo"
        " ORDER BY ratio, threads, epsilon"
    ).df()
    os.makedirs('multithread_merge', exist_ok=True)
    rows.pivot(index='threads', values='v', columns=['algo']).to_csv(f'multithread_merge/ratio_{ratio}.txt')
    return rows.pivot(index='threads', values='v', columns=['algo'])

display(plot_epsilon_vs_duration_200M(1))
display(plot_epsilon_vs_duration_200M(10))
display(plot_epsilon_vs_duration_200M(100))
display(plot_epsilon_vs_duration_200M(1000))

algo,learned_merge_btree256,learned_merge_sampledflatpgm256,standard_merge
threads,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16,5.338047,5.479818,5.459968


algo,learned_merge_btree256,learned_merge_sampledflatpgm256,standard_merge
threads,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16,2.955827,3.280966,2.943177


algo,learned_merge_btree256,learned_merge_sampledflatpgm256,standard_merge
threads,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16,2.876583,2.776465,2.810304


algo,learned_merge_btree256,learned_merge_sampledflatpgm256,standard_merge
threads,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16,2.738573,2.775918,3.002019


In [16]:
def plot_dataset_relative():
    rows = duckdb.sql(
        "SELECT ratio, algo, AVG(duration_sec) as d FROM df " 
        "WHERE" 
        "  threads=1 AND dataset!='string_keys'"
        "GROUP BY ratio, threads, algo"
    ).df()
    result = rows.pivot(index='ratio', values='d', columns=['algo'])
    return result
result = plot_dataset_relative()

columns = ['standard_merge', 'learned_merge_btree256', 'learned_merge_flatpgm256']
for column in columns:
    result[column+'_rel'] = result['standard_merge']/result[column]

print(result[['standard_merge_rel', 'learned_merge_btree256_rel', 'learned_merge_flatpgm256_rel']].to_latex(float_format='%.2lf'))

\begin{tabular}{lrrr}
\toprule
algo & standard_merge_rel & learned_merge_btree256_rel & learned_merge_flatpgm256_rel \\
ratio &  &  &  \\
\midrule
1 & 1.00 & 0.29 & 0.55 \\
10 & 1.00 & 0.94 & 1.27 \\
100 & 1.00 & 1.73 & 1.86 \\
1000 & 1.00 & 2.02 & 2.01 \\
\bottomrule
\end{tabular}



In [17]:
def plot_dataset_relative_16():
    rows = duckdb.sql(
        "SELECT ratio, algo, AVG(duration_sec) as d FROM df " 
        "WHERE" 
        "  threads=16 AND dataset!='string_keys'"
        "GROUP BY ratio, threads, algo"
    ).df()
    result = rows.pivot(index='ratio', values='d', columns=['algo'])
    return result
result = plot_dataset_relative_16()

columns = ['standard_merge', 'learned_merge_btree256', 'learned_merge_flatpgm256']
for column in columns:
    result[column+'_rel'] = result['standard_merge']/result[column]

print(result[['standard_merge_rel', 'learned_merge_btree256_rel', 'learned_merge_flatpgm256_rel']].to_latex(float_format='%.2lf'))

\begin{tabular}{lrrr}
\toprule
algo & standard_merge_rel & learned_merge_btree256_rel & learned_merge_flatpgm256_rel \\
ratio &  &  &  \\
\midrule
1 & 1.00 & 0.98 & 1.02 \\
10 & 1.00 & 1.00 & 0.99 \\
100 & 1.00 & 0.98 & 0.99 \\
1000 & 1.00 & 1.01 & 0.99 \\
\bottomrule
\end{tabular}

