In [1]:
import os;
import json;
import builtins
import duckdb
import pandas as pd;
import matplotlib.pyplot as plt


def get_dataset_from_testcase(run):
    return run[0:-2]

def get_dataset_size(dataset):
    if dataset == "osm" or dataset == "books":
        return 800_000_000
    else:
        return 200_000_000

dir = '/home/chesetti/Repos/KVector_Merge/sponge/join_all'
runs = []
for test_case in os.listdir(dir):
    if test_case == 'build':
        continue
    for run_name in os.listdir(os.path.join(dir, test_case, 'outputs', 'results', 'run')):
        json_file = builtins.open(os.path.join(dir, test_case, 'outputs', 'results', 'run', run_name))
        run = (json.load(json_file))
        run['dataset'] = get_dataset_from_testcase(test_case)
        run['dataset_size'] = get_dataset_size(run['dataset'])
        runs.append(run)
df = pd.json_normalize(runs)
print(df.columns)

Index(['command', 'dataset', 'dataset_size', 'result.checksum',
       'result.duration_ns', 'result.duration_sec', 'result.inner_disk_fetch',
       'result.inner_disk_fetch_size', 'result.inner_total_bytes_fetched',
       'result.num_output_keys', 'result.outer_disk_fetch',
       'result.outer_disk_fetch_size', 'result.outer_total_bytes_fetched',
       'spec.algo', 'spec.algo_name', 'spec.check_checksum', 'spec.common_key',
       'spec.index.epsilon', 'spec.index.leaf_size_in_pages',
       'spec.index.search', 'spec.index.type', 'spec.inner_table',
       'spec.key_size', 'spec.key_type', 'spec.load_sstable_in_mem',
       'spec.name', 'spec.num_threads', 'spec.outer_table', 'spec.result_path',
       'spec.value_size', 'spec.write_result_to_disk'],
      dtype='object')


In [2]:
def get_index_type(index):
    if "btree" in index:
        return "BTREE"
    if "pgm" in index:
        return "PGM"
    return "NA"

def get_index_variant(index):
    if index == "sort_join" or index == "hash_join":
        return "NA"
    pos = index.find("_")
    return index[pos+1:-1]

df["threads"] = df["spec.num_threads"]
df["epsilon"] = df["spec.index.epsilon"]
df["duration_sec"] = df["result.duration_ns"] / (1000000000)
df["ratio"] = df["spec.common_key"]
df["thput"] = (df["result.num_output_keys"] / df["ratio"]) / (df["result.duration_ns"] / (1000000000))
df["algo"] = df["spec.algo_name"]
df["join_algo"] = df["spec.algo"]
df["index_type"] = df["spec.algo_name"].map(lambda x: get_index_type(x))
df["index_variant"] = df["spec.algo_name"].map(lambda x: get_index_variant(x))
df["inner_bytes_fetched"] = df["result.inner_total_bytes_fetched"]
display(df["algo"].unique())
display(df["dataset"].unique())
display(df["index_type"].unique())


array(['inlj_btree1024', 'lsj_pgm4096', 'lsj_sampledflatpgm4096',
       'hash_join', 'lsj_sampledflatpgm1024', 'inlj_flatpgm256',
       'lsj_btree4096', 'sort_join', 'inlj_sampledflatpgm4096',
       'inlj_btree4096', 'lsj_flatpgm4096', 'lsj_btree1024',
       'inlj_pgm256', 'lsj_btree256', 'lsj_flatpgm1024', 'inlj_pgm4096',
       'inlj_sampledflatpgm1024', 'lsj_pgm256', 'inlj_sampledflatpgm256',
       'lsj_pgm1024', 'lsj_sampledflatpgm256', 'inlj_flatpgm4096',
       'lsj_flatpgm256', 'inlj_flatpgm1024', 'inlj_pgm1024',
       'inlj_btree256'], dtype=object)

array(['wiki', 'uniform_dense', 'normal', 'uniform_sparse', 'books',
       'osm', 'lognormal', 'fb'], dtype=object)

array(['BTREE', 'PGM', 'NA'], dtype=object)

In [3]:
# SingleThread, HJ vs SJ vs INLJ(BTree256) vs INLJ(PGM256) vs LS(BTree256) vs LS(PGM256)
def plot_dataset_join_duration(dataset):
    rows = duckdb.sql(
        "SELECT ratio, threads, algo, MEDIAN(duration_sec) as d, MEDIAN(thput) as t FROM df " 
        "WHERE" 
        "   (algo='hash_join' OR algo='sort_join' OR algo='inlj_btree256' OR algo='inlj_pgm256' OR "
        "   algo='lsj_btree256' OR algo='lsj_flatpgm256')" 
        "   AND threads=1"
        f"   AND dataset='{dataset}'"
        "GROUP BY dataset, ratio, threads, algo"
    ).df()
    return(rows.pivot(index='ratio', values='d', columns=['algo']))

real_datasets = ['fb', 'wiki', 'osm', 'books']
synth_datasets = ['uniform_dense', 'uniform_sparse', 'normal', 'lognormal']

for dataset in real_datasets:
    display(dataset)
    display(plot_dataset_join_duration(dataset))

for dataset in synth_datasets:
    display(dataset)
    display(plot_dataset_join_duration(dataset))

'fb'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,79.785572,53.723555,33.464064,52.649181,26.987002,25.162671
10,53.982077,5.690409,4.085079,5.636519,3.016305,7.079598
100,44.572595,1.196704,0.943092,1.200733,0.740194,5.148352
1000,20.136223,0.225369,0.225207,0.227195,0.21885,4.931778


'wiki'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,32.578296,23.270138,14.453964,22.860397,12.128019,11.32456
10,24.205699,4.822314,3.162925,4.74616,2.52957,4.194891
100,20.306184,0.869956,0.698924,0.867694,0.503725,2.513819
1000,8.994534,0.208226,0.180925,0.208636,0.173009,2.319318


'osm'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,,307.938682,239.477801,306.041844,196.079051,191.227261
10,270.794,26.320338,19.397566,25.40624,13.395372,31.216253
100,195.984129,5.559968,4.4995,5.956839,3.412531,22.119204
1000,149.10854,72.161864,1.052071,1.248693,60.339386,20.924745


'books'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,,312.395641,230.627835,307.033923,197.721628,196.546935
10,270.211066,25.979196,18.239296,25.582048,13.961933,31.37767
100,196.483455,5.725865,4.33042,5.772816,3.765012,29.208963
1000,148.763578,60.483649,1.272723,1.110183,1.118547,21.187958


'uniform_dense'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,83.193261,52.841626,28.012667,51.890777,29.925751,27.256534
10,54.03266,5.719059,3.77765,5.714833,3.049141,7.733041
100,44.483307,1.233695,0.880521,1.373423,0.820292,5.151877
1000,19.479347,0.237484,0.21975,0.237422,0.22068,4.977187


'uniform_sparse'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,79.824059,52.790627,31.543096,51.743549,26.76867,24.984327
10,53.865636,5.70505,3.956811,5.632568,3.010531,7.082598
100,44.547378,1.230551,0.93714,1.231746,0.756002,5.154212
1000,22.689613,0.236518,0.231072,0.237742,0.22638,4.946246


'normal'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,80.190714,53.444584,31.871422,52.369877,26.820202,25.023829
10,53.703619,5.877991,4.001172,5.775892,3.066949,7.252469
100,44.970908,1.265768,0.947298,1.259856,0.766837,5.29607
1000,22.897787,0.240616,0.234836,0.240155,0.230603,5.102566


'lognormal'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,78.720935,52.932648,33.314757,51.819811,25.66869,24.880721
10,53.873789,5.701944,4.224656,5.654821,2.961539,7.065209
100,44.420552,1.213984,1.024411,1.21458,0.755119,5.132633
1000,19.661215,0.231155,0.23793,0.23068,0.228448,4.928607


In [4]:
# Effect of Epsilon on INLJ (BTree and PGM)
def plot_epsilon_inlj(dataset):
    rows = duckdb.sql(
        "SELECT ratio, algo, join_algo, epsilon, MEDIAN(duration_sec) as d, MEDIAN(thput) as t FROM df " 
        "WHERE" 
        "   join_algo = 'inlj'"
        "   AND threads=1"
        f"   AND dataset='{dataset}'"
        "GROUP BY dataset, ratio, algo, join_algo, epsilon"
    ).df()
    return rows.pivot(index='ratio', values='d', columns=['algo', 'epsilon'])

for dataset in real_datasets:
    display(dataset)
    display(plot_epsilon_inlj(dataset))

for dataset in synth_datasets:
    display(dataset)
    display(plot_epsilon_inlj(dataset))

'fb'

algo,inlj_sampledflatpgm4096,inlj_btree256,inlj_pgm256,inlj_btree1024,inlj_btree4096,inlj_sampledflatpgm1024,inlj_pgm1024,inlj_flatpgm4096,inlj_sampledflatpgm256,inlj_flatpgm1024,inlj_flatpgm256,inlj_pgm4096
epsilon,4096.0,256.0,256.0,1024.0,4096.0,1024.0,1024.0,4096.0,256.0,1024.0,256.0,4096.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,34.713387,53.723555,33.464064,54.588636,52.968209,35.019604,33.376552,33.575393,38.25217,32.459946,30.652944,36.123307
10,4.218466,5.690409,4.085079,5.497925,5.307913,4.537026,3.915549,3.931653,5.138802,3.819236,3.843559,4.183998
100,1.086608,1.196704,0.943092,0.909161,0.809161,1.176709,0.970635,0.995085,1.276552,0.949179,0.921725,1.022775
1000,0.40138,0.225369,0.225207,0.286764,0.307143,0.300688,0.277599,0.384095,0.252409,0.274128,0.223829,0.384734


'wiki'

algo,inlj_btree4096,inlj_sampledflatpgm1024,inlj_flatpgm4096,inlj_sampledflatpgm4096,inlj_btree1024,inlj_pgm4096,inlj_flatpgm256,inlj_pgm256,inlj_pgm1024,inlj_flatpgm1024,inlj_btree256,inlj_sampledflatpgm256
epsilon,4096.0,1024.0,4096.0,4096.0,1024.0,4096.0,256.0,256.0,1024.0,1024.0,256.0,256.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,21.523201,15.109639,14.929495,15.455615,22.631037,15.82733,13.696504,14.453964,14.999373,14.4822,23.270138,15.39042
10,4.258731,3.531694,3.234768,3.422134,4.524638,3.424825,3.041385,3.162925,3.296488,3.120329,4.822314,3.716301
100,0.615451,0.91382,0.716621,0.814383,0.689098,0.73653,0.655918,0.698924,0.713232,0.706915,0.869956,0.978394
1000,0.181401,0.246377,0.274723,0.302543,0.207184,0.278255,0.181253,0.180925,0.224906,0.216769,0.208226,0.216119


'osm'

algo,inlj_btree1024,inlj_pgm4096,inlj_pgm1024,inlj_sampledflatpgm1024,inlj_sampledflatpgm4096,inlj_flatpgm256,inlj_btree4096,inlj_sampledflatpgm256,inlj_flatpgm1024,inlj_flatpgm4096,inlj_pgm256,inlj_btree256
epsilon,1024.0,4096.0,1024.0,1024.0,4096.0,256.0,4096.0,256.0,1024.0,4096.0,256.0,256.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,301.937752,242.854643,241.508943,231.109968,231.867855,226.309682,301.00892,251.150047,224.577256,221.623865,239.477801,307.938682
10,24.599326,19.918597,19.830247,21.439189,19.602522,17.324695,24.992381,25.68864,17.279345,18.116734,19.397566,26.320338
100,4.083472,4.875353,5.114307,5.501635,5.785165,4.263373,19.40273,6.106386,5.062979,4.713111,4.4995,5.559968
1000,1.310681,22.111946,1.292627,1.389794,1.883281,1.026575,1.439786,1.177019,1.288735,1.785288,1.052071,72.161864


'books'

algo,inlj_flatpgm256,inlj_sampledflatpgm4096,inlj_pgm1024,inlj_pgm256,inlj_btree256,inlj_flatpgm4096,inlj_pgm4096,inlj_btree4096,inlj_sampledflatpgm1024,inlj_sampledflatpgm256,inlj_btree1024,inlj_flatpgm1024
epsilon,256.0,4096.0,1024.0,256.0,256.0,4096.0,4096.0,4096.0,1024.0,256.0,1024.0,1024.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,217.702608,216.384412,226.011519,230.627835,312.395641,219.54483,225.895627,316.68755,226.755863,230.982651,306.759161,213.922545
10,17.248855,25.029694,20.935937,18.239296,25.979196,17.559111,28.417911,24.994639,18.985725,23.885226,24.143906,17.062209
100,24.345587,8.219648,5.974149,4.33042,5.725865,4.751887,5.016366,3.688105,5.333968,6.178544,19.206605,4.423572
1000,1.034404,1.85414,1.249759,1.272723,60.483649,1.734792,17.27924,1.474335,1.366381,1.192589,1.322034,29.937509


'uniform_dense'

algo,inlj_flatpgm1024,inlj_pgm1024,inlj_sampledflatpgm4096,inlj_pgm4096,inlj_flatpgm4096,inlj_sampledflatpgm256,inlj_btree256,inlj_sampledflatpgm1024,inlj_flatpgm256,inlj_btree4096,inlj_btree1024,inlj_pgm256
epsilon,1024.0,1024.0,4096.0,4096.0,4096.0,256.0,256.0,1024.0,256.0,4096.0,1024.0,256.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,29.429158,31.867798,30.874094,30.713331,31.162976,32.092898,52.841626,29.740423,28.443459,52.278103,53.701065,28.012667
10,3.720386,3.66376,3.545851,3.731835,3.499872,4.605629,5.719059,3.558415,3.491992,5.371596,5.51741,3.77765
100,0.864033,0.795375,0.889533,0.779522,0.797951,1.248739,1.233695,0.926119,0.866013,0.864252,1.021391,0.880521
1000,0.260287,0.256217,0.354851,0.318248,0.335915,0.247309,0.237484,0.302895,0.240102,0.336949,0.328518,0.21975


'uniform_sparse'

algo,inlj_sampledflatpgm1024,inlj_flatpgm256,inlj_pgm256,inlj_pgm4096,inlj_flatpgm4096,inlj_btree256,inlj_btree4096,inlj_sampledflatpgm4096,inlj_flatpgm1024,inlj_btree1024,inlj_sampledflatpgm256,inlj_pgm1024
epsilon,1024.0,256.0,256.0,4096.0,4096.0,256.0,4096.0,4096.0,1024.0,1024.0,256.0,1024.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,31.503087,29.983271,31.543096,32.611604,31.841783,52.790627,52.300837,32.114671,31.027543,53.698649,33.020995,31.160718
10,4.152439,3.776262,3.956811,3.85645,3.804095,5.70505,5.306251,3.894787,3.747043,5.494754,4.967536,3.766312
100,1.189227,0.917394,0.93714,0.992095,0.990347,1.230551,0.826849,1.081915,0.980722,0.924741,1.310415,0.984568
1000,0.312569,0.228666,0.231072,0.397535,0.400153,0.236518,0.323396,0.421276,0.290428,0.301985,0.256023,0.289575


'normal'

algo,inlj_btree1024,inlj_flatpgm256,inlj_btree4096,inlj_flatpgm4096,inlj_pgm256,inlj_sampledflatpgm256,inlj_flatpgm1024,inlj_pgm4096,inlj_sampledflatpgm4096,inlj_sampledflatpgm1024,inlj_pgm1024,inlj_btree256
epsilon,1024.0,256.0,4096.0,4096.0,256.0,256.0,1024.0,4096.0,4096.0,1024.0,1024.0,256.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,54.54127,29.944082,52.923079,33.292856,31.871422,31.614064,31.539024,35.093939,33.668926,32.114858,33.541749,53.444584
10,5.540704,3.806,5.43151,4.308542,4.001172,4.970013,3.927536,4.475421,5.003237,4.645596,4.135038,5.877991
100,0.937591,0.925729,0.835695,1.214645,0.947298,1.329774,1.019723,1.22252,1.478316,1.318857,1.045134,1.265768
1000,0.30414,0.231377,0.329501,0.424541,0.234836,0.263512,0.298296,0.431134,0.455504,0.32316,0.299738,0.240616


'lognormal'

algo,inlj_sampledflatpgm1024,inlj_flatpgm1024,inlj_flatpgm4096,inlj_sampledflatpgm4096,inlj_btree4096,inlj_pgm256,inlj_pgm1024,inlj_pgm4096,inlj_btree1024,inlj_sampledflatpgm256,inlj_flatpgm256,inlj_btree256
epsilon,1024.0,1024.0,4096.0,4096.0,4096.0,256.0,1024.0,4096.0,1024.0,256.0,256.0,256.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,31.939601,31.595328,33.705478,34.064562,52.155415,33.314757,33.354719,35.825353,53.684522,31.834472,30.13324,52.932648
10,4.967971,4.06829,4.624012,5.560521,5.347522,4.224656,4.261129,4.790636,5.50503,5.15361,3.900504,5.701944
100,1.488593,1.140116,1.437676,1.786009,0.812216,1.024411,1.158466,1.469182,0.911732,1.421147,0.979978,1.213984
1000,0.332593,0.303786,0.470993,0.50125,0.314207,0.23793,0.30692,0.50419,0.303248,0.261101,0.231444,0.231155


In [5]:
# Effect of Epsilon on INLJ (BTree and PGM)
def plot_epsilon_lsj(dataset):
    rows = duckdb.sql(
        "SELECT ratio, algo, join_algo, epsilon, MEDIAN(duration_sec) as d, MEDIAN(thput) as t FROM df " 
        "WHERE" 
        "   join_algo = 'lsj'"
        "   AND threads=1"
        f"   AND dataset='{dataset}'"
        "GROUP BY dataset, ratio, algo, join_algo, epsilon"
    ).df()
    return rows.pivot(index='ratio', values='d', columns=['algo', 'epsilon'])

for dataset in real_datasets:
    display(dataset)
    display(plot_epsilon_lsj(dataset))

for dataset in synth_datasets:
    display(dataset)
    display(plot_epsilon_lsj(dataset))

'fb'

algo,lsj_btree1024,lsj_flatpgm1024,lsj_sampledflatpgm4096,lsj_pgm1024,lsj_pgm4096,lsj_btree4096,lsj_btree256,lsj_flatpgm4096,lsj_sampledflatpgm1024,lsj_pgm256,lsj_sampledflatpgm256,lsj_flatpgm256
epsilon,1024.0,1024.0,4096.0,1024.0,4096.0,4096.0,256.0,4096.0,1024.0,256.0,256.0,256.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,52.765188,28.474029,30.610882,31.524737,35.21845,51.813857,52.649181,29.859236,28.824079,31.933872,27.985456,26.987002
10,5.427577,3.010683,3.18301,3.388625,3.654325,5.227907,5.636519,3.153408,3.054856,3.532868,3.105649,3.016305
100,0.901017,0.613775,0.574284,0.652138,0.626493,0.811296,1.200733,0.610235,0.621495,0.798612,0.770686,0.740194
1000,0.288444,0.253435,0.273259,0.260004,0.277396,0.315796,0.227195,0.27555,0.268029,0.226063,0.242841,0.21885


'wiki'

algo,lsj_pgm4096,lsj_flatpgm4096,lsj_pgm256,lsj_sampledflatpgm1024,lsj_btree256,lsj_flatpgm1024,lsj_pgm1024,lsj_sampledflatpgm4096,lsj_sampledflatpgm256,lsj_btree4096,lsj_btree1024,lsj_flatpgm256
epsilon,4096.0,4096.0,256.0,1024.0,256.0,1024.0,1024.0,4096.0,256.0,4096.0,1024.0,256.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,15.187852,13.65448,13.852011,12.800034,22.860397,12.547229,14.158023,13.754466,12.719507,21.121723,22.511216,12.128019
10,3.071957,2.726041,2.818926,2.615648,4.74616,2.600654,2.864973,2.755398,2.579863,4.226802,4.485158,2.52957
100,0.479688,0.444253,0.538358,0.460861,0.867694,0.451815,0.488916,0.446688,0.523436,0.612697,0.676807,0.503725
1000,0.163821,0.160087,0.176829,0.178203,0.208636,0.173633,0.176845,0.168344,0.195386,0.181598,0.206332,0.173009


'osm'

algo,lsj_sampledflatpgm1024,lsj_flatpgm1024,lsj_pgm1024,lsj_sampledflatpgm256,lsj_btree4096,lsj_flatpgm256,lsj_flatpgm4096,lsj_pgm256,lsj_btree1024,lsj_pgm4096,lsj_sampledflatpgm4096,lsj_btree256
epsilon,1024.0,1024.0,1024.0,256.0,4096.0,256.0,4096.0,256.0,1024.0,4096.0,4096.0,256.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,193.657458,201.837106,229.218297,201.556823,314.681367,196.079051,205.18502,218.200621,298.912165,231.691579,209.825529,306.041844
10,13.885208,13.49705,19.825505,17.779542,24.391867,13.395372,13.961594,18.631684,33.022576,17.478215,25.592661,25.40624
100,2.817509,2.811542,3.172866,3.805073,3.657893,3.412531,3.614236,4.017927,4.074608,3.270632,20.674718,5.956839
1000,1.251469,1.184113,1.19136,1.118663,1.395077,60.339386,1.260188,1.038566,11.360867,1.298017,13.931449,1.248693


'books'

algo,lsj_pgm256,lsj_sampledflatpgm4096,lsj_flatpgm1024,lsj_btree256,lsj_flatpgm4096,lsj_pgm4096,lsj_pgm1024,lsj_btree1024,lsj_sampledflatpgm1024,lsj_sampledflatpgm256,lsj_flatpgm256,lsj_btree4096
epsilon,256.0,4096.0,1024.0,256.0,4096.0,4096.0,1024.0,1024.0,1024.0,256.0,256.0,4096.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,222.958457,205.656734,202.035074,307.033923,211.19048,222.171079,219.136054,303.311307,201.974467,200.034339,197.721628,305.646373
10,16.946312,14.274216,13.859648,25.582048,14.377494,15.390217,29.666786,24.279749,13.579579,13.982104,13.961933,24.202154
100,4.185684,2.629854,24.789946,5.772816,3.048945,3.014945,18.495725,4.105892,19.933536,3.559659,3.765012,3.711576
1000,1.025128,1.248045,1.181989,1.110183,1.33844,1.256449,1.530666,32.219553,28.354237,1.103729,1.118547,1.664497


'uniform_dense'

algo,lsj_btree4096,lsj_flatpgm1024,lsj_btree256,lsj_sampledflatpgm4096,lsj_pgm256,lsj_pgm1024,lsj_flatpgm256,lsj_sampledflatpgm256,lsj_btree1024,lsj_flatpgm4096,lsj_sampledflatpgm1024,lsj_pgm4096
epsilon,4096.0,1024.0,256.0,4096.0,256.0,1024.0,256.0,256.0,1024.0,4096.0,1024.0,4096.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,51.374875,28.586893,51.890777,32.816717,26.974921,30.963412,29.925751,27.616325,56.768697,30.356971,28.703653,33.23271
10,5.250702,3.350924,5.714833,3.233103,3.035404,3.409199,3.049141,3.414798,5.791681,3.54867,3.372915,3.529055
100,0.814608,0.698063,1.373423,0.682954,0.732892,0.687736,0.820292,0.864375,0.91359,0.634225,0.641027,0.672398
1000,0.336563,0.300067,0.237422,0.29865,0.238841,0.254523,0.22068,0.264497,0.305108,0.278874,0.269474,0.280808


'uniform_sparse'

algo,lsj_flatpgm4096,lsj_sampledflatpgm4096,lsj_pgm1024,lsj_flatpgm256,lsj_btree256,lsj_sampledflatpgm1024,lsj_btree4096,lsj_pgm4096,lsj_sampledflatpgm256,lsj_btree1024,lsj_flatpgm1024,lsj_pgm256
epsilon,4096.0,4096.0,1024.0,256.0,256.0,1024.0,4096.0,4096.0,256.0,1024.0,1024.0,256.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,29.883543,30.165467,30.189751,26.76867,51.743549,28.602864,51.374448,31.689784,27.468063,52.749507,28.34864,30.333008
10,3.164637,3.234927,3.223437,3.010531,5.632568,3.101642,5.252183,3.323338,3.103629,5.417373,3.043242,3.37943
100,0.592688,0.6131,0.651727,0.756002,1.231746,0.648281,0.816544,0.611171,0.788451,0.914489,0.639914,0.795113
1000,0.294354,0.300379,0.270081,0.22638,0.237742,0.286976,0.322849,0.290296,0.254252,0.300796,0.271,0.231511


'normal'

algo,lsj_btree256,lsj_btree4096,lsj_pgm256,lsj_flatpgm1024,lsj_sampledflatpgm1024,lsj_pgm4096,lsj_flatpgm4096,lsj_flatpgm256,lsj_pgm1024,lsj_sampledflatpgm4096,lsj_sampledflatpgm256,lsj_btree1024
epsilon,256.0,4096.0,256.0,1024.0,1024.0,4096.0,4096.0,256.0,1024.0,4096.0,256.0,1024.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,52.369877,52.070128,30.336636,28.198332,28.875777,33.094355,29.770474,26.820202,31.40962,29.927734,27.938565,52.972112
10,5.775892,5.288045,3.41559,3.026056,3.147763,3.510877,3.127781,3.066949,3.353272,3.189761,3.168278,5.472135
100,1.259856,0.859088,0.793542,0.65112,0.643274,0.662887,0.601905,0.766837,0.686029,0.609589,0.809134,0.917467
1000,0.240155,0.329414,0.23295,0.271792,0.29048,0.296629,0.289028,0.230603,0.277214,0.293955,0.257686,0.303025


'lognormal'

algo,lsj_btree4096,lsj_sampledflatpgm1024,lsj_flatpgm4096,lsj_sampledflatpgm256,lsj_flatpgm256,lsj_btree1024,lsj_sampledflatpgm4096,lsj_btree256,lsj_pgm256,lsj_pgm4096,lsj_flatpgm1024,lsj_pgm1024
epsilon,4096.0,1024.0,4096.0,256.0,256.0,1024.0,4096.0,256.0,256.0,4096.0,1024.0,1024.0
ratio,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,51.340584,28.17098,28.495432,27.486908,25.66869,52.66664,29.427815,51.819811,30.897326,32.16644,27.017295,30.556263
10,5.242098,3.066378,3.053803,3.102597,2.961539,5.396317,3.12454,5.654821,3.471716,3.398305,2.959799,3.31687
100,0.803116,0.638884,0.57976,0.789848,0.755119,0.905219,0.585797,1.21458,0.814019,0.614113,0.623597,0.657125
1000,0.311328,0.287632,0.284259,0.257202,0.228448,0.295275,0.284786,0.23068,0.233523,0.289814,0.274921,0.278461


In [6]:
# SingleThread, HJ vs SJ vs INLJ(BTree256) vs INLJ(PGM256) vs LS(BTree256) vs LS(PGM256)
def plot_dataset_join_duration(dataset):
    rows = duckdb.sql(
        "SELECT ratio, threads, algo, MEDIAN(duration_sec) as d, MEDIAN(thput) as t FROM df " 
        "WHERE" 
        "   (algo='hash_join' OR algo='sort_join' OR algo='inlj_btree256' OR algo='inlj_pgm256' OR "
        "   algo='lsj_btree256' OR algo='lsj_flatpgm256')" 
        "   AND threads=1"
        f"   AND dataset='{dataset}'"
        "GROUP BY dataset, ratio, threads, algo"
    ).df()
    result = (rows.pivot(index='ratio', values='d', columns=['algo']))
    ## Save to CSV
    os.makedirs('single_thread_csv_hdd_latency/', exist_ok=True)
    result.to_csv(f'single_thread_csv_hdd_latency/{dataset}.csv')
    return result

real_datasets = ['fb', 'wiki', 'osm', 'books']
synth_datasets = ['uniform_dense', 'uniform_sparse', 'normal', 'lognormal']

for dataset in real_datasets:
    display(dataset)
    display(plot_dataset_join_duration(dataset))
for dataset in synth_datasets:
    display(dataset)
    display(plot_dataset_join_duration(dataset))


'fb'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,79.785572,53.723555,33.464064,52.649181,26.987002,25.162671
10,53.982077,5.690409,4.085079,5.636519,3.016305,7.079598
100,44.572595,1.196704,0.943092,1.200733,0.740194,5.148352
1000,20.136223,0.225369,0.225207,0.227195,0.21885,4.931778


'wiki'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,32.578296,23.270138,14.453964,22.860397,12.128019,11.32456
10,24.205699,4.822314,3.162925,4.74616,2.52957,4.194891
100,20.306184,0.869956,0.698924,0.867694,0.503725,2.513819
1000,8.994534,0.208226,0.180925,0.208636,0.173009,2.319318


'osm'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,,307.938682,239.477801,306.041844,196.079051,191.227261
10,270.794,26.320338,19.397566,25.40624,13.395372,31.216253
100,195.984129,5.559968,4.4995,5.956839,3.412531,22.119204
1000,149.10854,72.161864,1.052071,1.248693,60.339386,20.924745


'books'

algo,hash_join,inlj_btree256,inlj_pgm256,lsj_btree256,lsj_flatpgm256,sort_join
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,,312.395641,230.627835,307.033923,197.721628,196.546935
10,270.211066,25.979196,18.239296,25.582048,13.961933,31.37767
100,196.483455,5.725865,4.33042,5.772816,3.765012,29.208963
1000,148.763578,60.483649,1.272723,1.110183,1.118547,21.187958
