In [1]:
import os;
import json;
import builtins
import duckdb
import pandas as pd;
import matplotlib.pyplot as plt


def get_dataset_from_testcase(run):
    return run[0:-2]

def get_dataset_size(dataset):
    if dataset == "string_keys":
        return 100_000_000
    if dataset == "osm" or dataset == "books":
        return 800_000_000
    else:
        return 200_000_000

runs = []
dir = '/home/chesetti/Repos/learned_merge_cleanup/sponge/merge_all'
for test_case in os.listdir(dir):
    if test_case == 'build':
        continue
    for run_name in os.listdir(os.path.join(dir, test_case, 'outputs', 'results', 'run')):
        json_file = builtins.open(os.path.join(dir, test_case, 'outputs', 'results', 'run', run_name))
        run = (json.load(json_file))
        run['dataset'] = get_dataset_from_testcase(test_case)
        run['dataset_size'] = get_dataset_size(run['dataset'])
        runs.append(run)

df = pd.json_normalize(runs)
print(df.columns)

Index(['command', 'dataset', 'dataset_size', 'result.checksum',
       'result.duration_ns', 'result.duration_sec', 'result.inner_disk_fetch',
       'result.inner_disk_fetch_size', 'result.inner_total_bytes_fetched',
       'result.num_output_keys', 'result.outer_disk_fetch',
       'result.outer_disk_fetch_size', 'result.outer_total_bytes_fetched',
       'spec.algo', 'spec.algo_name', 'spec.check_checksum', 'spec.common_key',
       'spec.index.epsilon', 'spec.index.leaf_size_in_pages',
       'spec.index.search', 'spec.index.type', 'spec.inner_table',
       'spec.key_size', 'spec.key_type', 'spec.load_sstable_in_mem',
       'spec.name', 'spec.num_threads', 'spec.outer_table', 'spec.result_path',
       'spec.value_size', 'spec.write_result_to_disk'],
      dtype='object')


In [2]:
def get_index_type(index):
    if "btree" in index:
        return "BTREE"
    if "pgm" in index:
        return "PGM"
    return "NA"

def get_index_variant(index):
    if index == "sort_join" or index == "hash_join":
        return "NA"
    pos = index.find("_")
    return index[pos+1:-1]

df["threads"] = df["spec.num_threads"]
df["epsilon"] = df["spec.index.epsilon"]
df["duration_sec"] = df["result.duration_ns"] / (1000000000)
df["ratio"] = df["spec.common_key"]
df["thput"] = (df["result.num_output_keys"] / df["ratio"]) / (df["result.duration_ns"] / (1000000000))
df["algo"] = df["spec.algo_name"]
df["join_algo"] = df["spec.algo"]
df["index_type"] = df["spec.algo_name"].map(lambda x: get_index_type(x))
df["index_variant"] = df["spec.algo_name"].map(lambda x: get_index_variant(x))
df["inner_bytes_fetched"] = df["result.inner_total_bytes_fetched"]
display(df["algo"].unique())
display(df["dataset"].unique())
display(df["index_type"].unique())


array(['learned_merge_btree1024', 'standard_merge',
       'learned_merge_flatpgm1024', 'learned_merge_flatpgm4096',
       'learned_merge_btree256', 'learned_merge_flatpgm256',
       'learned_merge_btree4096'], dtype=object)

array(['fb', 'fb_', 'uniform_sparse', 'books', 'normal', 'wiki', 'osm',
       'osm_', 'lognormal', 'uniform_dense', 'uniform_dense_',
       'lognormal_', 'books_', 'wiki_', 'normal_', 'uniform_sparse_'],
      dtype=object)

array(['BTREE', 'NA', 'PGM'], dtype=object)

In [3]:
# SingleThread, 2 way merge vs BTree
def plot_dataset_join_duration(dataset):
    rows = duckdb.sql(
        "SELECT ratio, threads, algo, MEDIAN(duration_sec) as d, MEDIAN(thput) as t FROM df " 
        "WHERE" 
        "   (algo='standard_merge' OR algo='learned_merge_flatpgm256' OR algo='learned_merge_btree256')"
        "   AND threads=1"
        f"   AND dataset='{dataset}'"
        "GROUP BY dataset, ratio, threads, algo"
    ).df()
    result = (rows.pivot(index='ratio', values='d', columns=['algo']))
    ## Save to CSV
    os.makedirs('single_thread_csv_ssd_latency/', exist_ok=True)
    result.to_csv(f'single_thread_csv_ssd_latency/{dataset}.csv')
    return result

real_datasets = ['fb', 'wiki', 'osm', 'books']
synth_datasets = ['uniform_dense', 'uniform_sparse', 'normal', 'lognormal', 'string_keys']

for dataset in real_datasets:
    display(dataset)
    display(plot_dataset_join_duration(dataset))

for dataset in synth_datasets:
    display(dataset)
    display(plot_dataset_join_duration(dataset))

'fb'

algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,46.251985,24.184499,13.079334
10,7.664472,5.729398,7.550282
100,3.82207,3.723181,6.605485
1000,2.921363,3.236495,6.817958


'wiki'

algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,20.07254,10.840252,6.206496
10,5.396248,3.729804,4.198998
100,1.951792,1.693054,2.996075
1000,1.471158,1.415024,2.917552


'osm'

algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,201.300594,105.236068,58.178117
10,36.017936,26.413891,33.180606
100,16.920377,15.316816,29.325865
1000,14.398344,14.280192,28.65396


'books'

algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,200.859988,105.302105,57.213536
10,35.251808,26.043406,32.762532
100,16.714618,16.012587,29.002509
1000,14.475697,14.340541,28.855999


'uniform_dense'

algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,45.52029,24.064289,13.420485
10,7.506567,5.652976,7.5533
100,4.071882,3.82784,7.084772
1000,3.250793,3.247757,6.330465


'uniform_sparse'

algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,47.79799,25.928652,14.779435
10,8.781433,6.519261,8.334127
100,4.549787,3.988889,7.131252
1000,3.688781,4.095492,7.345225


'normal'

algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,46.056971,24.018945,13.084747
10,7.831951,5.952748,7.66204
100,3.930232,3.32748,7.076973
1000,2.980957,3.15391,6.449813


'lognormal'

algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,45.290013,22.423782,13.451228
10,7.816759,6.002131,7.573997
100,3.572057,3.479897,6.680839
1000,3.509546,3.09103,6.522584


'string_keys'

algo
ratio


In [4]:
def plot_epsilon_vs_duration_200M(ratio):
    rows = duckdb.sql(
        " SELECT threads, ratio, epsilon, algo, AVG(duration_sec) as v FROM df"
        " WHERE (algo='learned_merge_flatpgm256' OR algo='standard_merge' OR algo='learned_merge_btree256')"
        f" AND ratio={ratio} AND dataset_size=200000000"
        " GROUP BY ratio, threads, epsilon, algo"
        " ORDER BY ratio, threads, epsilon"
    ).df()
    os.makedirs('multithread_merge', exist_ok=True)
    rows.pivot(index='threads', values='v', columns=['algo']).to_csv(f'multithread_merge/ratio_{ratio}.txt')
    return rows.pivot(index='threads', values='v', columns=['algo'])

display(plot_epsilon_vs_duration_200M(1))
display(plot_epsilon_vs_duration_200M(10))
display(plot_epsilon_vs_duration_200M(100))
display(plot_epsilon_vs_duration_200M(1000))

algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
threads,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,41.856756,21.978104,12.305231
2,21.356617,11.470001,6.532667
4,11.287722,6.240817,4.886909
8,6.593589,4.992341,4.811599
16,9.22988,8.849375,9.030358


algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
threads,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,7.551617,5.633329,7.13107
2,4.294296,3.385978,3.966801
4,2.909259,3.02849,2.975099
8,2.878941,2.870211,2.96231
16,4.965808,5.003276,4.967724


algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
threads,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.603768,3.359369,6.242631
2,2.41601,2.651447,3.657473
4,2.726481,2.73278,2.649911
8,2.762933,2.744795,2.643101
16,4.723023,4.67871,4.631823


algo,learned_merge_btree256,learned_merge_flatpgm256,standard_merge
threads,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.990302,2.996005,6.129352
2,2.553492,2.468615,3.553896
4,2.741263,2.792973,2.792146
8,2.811802,2.653412,2.66817
16,4.548075,4.62919,4.579165


In [5]:
def plot_dataset_relative():
    rows = duckdb.sql(
        "SELECT ratio, algo, AVG(duration_sec) as d FROM df " 
        "WHERE" 
        "  threads=1 AND dataset!='string_keys'"
        "GROUP BY ratio, threads, algo"
    ).df()
    display(rows)
    result = rows.pivot(index='ratio', values='d', columns=['algo'])
    return result
result = plot_dataset_relative()

columns = ['hash_join', 'inlj_btree256', 'lsj_flatpgm256', 'sort_join']
for column in columns:
    result[column+'_rel'] = result['sort_join']/result[column]

print(result[['sort_join_rel', 'hash_join_rel', 'inlj_btree256_rel', 'lsj_flatpgm256_rel']].to_latex(float_format='%.2lf'))


Unnamed: 0,ratio,algo,d
0,10,learned_merge_flatpgm4096,10.799789
1,1,learned_merge_btree4096,76.984251
2,100,learned_merge_btree256,6.94959
3,1000,learned_merge_btree256,5.836216
4,100,learned_merge_btree4096,6.794882
5,1000,learned_merge_btree4096,5.910041
6,10,standard_merge,13.62804
7,1,learned_merge_btree256,81.594943
8,1,learned_merge_btree1024,79.235508
9,10,learned_merge_flatpgm1024,10.741429


KeyError: 'sort_join'