In [126]:
import os;
import json;
import builtins
import pandas as pd;
import duckdb

# Collect every JSON result file produced by the benchmark runs into one flat
# DataFrame (`test_dataframe`), tagging each record with the run directory,
# dataset and operation it came from.

# NOTE(review): absolute, machine-specific path, and the name shadows the
# builtin `dir`. Both are kept so any other cell reading `dir` keeps working;
# consider a configurable RESULTS_DIR constant instead.
dir = '/home/chesetti/Repos/learned_merge_cleanup/sponge'
threads = ["1", "4", "16", "32"]
op = ['join', 'merge']
datasets = ['fb', 'wiki', 'uniform_dense', 'uniform_sparse', 'normal', 'lognormal']
test_results = []
# The loop variable must not be called `op`: rebinding the list to its last
# element ('merge') makes a second execution of this cell iterate over the
# characters of that string and silently find no result directories.
for op_name in op:
    for dataset in datasets:
        for thread in threads:
            # Layout: <dir>/<op>_<dataset>_<threads>/outputs/results/<run>/<file>
            testdir = os.path.join(dir, "_".join([op_name, dataset, thread]), "outputs", "results")
            if not os.path.exists(testdir):
                # This (op, dataset, threads) combination was never run.
                continue
            rundirs = os.listdir(testdir)
            for rundir in rundirs:
                for test_result_file in os.listdir(os.path.join(testdir, rundir)):
                    result_path = os.path.join(testdir, rundir, test_result_file)
                    # `builtins.open` is kept from the original; presumably plain
                    # `open` may be shadowed in this kernel -- TODO confirm.
                    # The context manager closes the file even if parsing fails.
                    with builtins.open(result_path) as json_file:
                        test_result = json.load(json_file)
                    test_result['run'] = rundir
                    test_result['dataset'] = dataset
                    test_result['op'] = op_name
                    # 'osm' and 'books' are not in `datasets` above, so this
                    # branch only fires if they are added to the list.
                    if dataset == 'osm' or dataset == 'books':
                        test_result['input_size'] = 800_000_000
                    else:
                        test_result['input_size'] = 200_000_000
                    test_results.append(test_result)
# Nested "result"/"spec" objects become dotted columns ("result.duration_ns", ...).
test_dataframe = pd.json_normalize(test_results)
print(test_dataframe.columns)

Index(['command', 'run', 'dataset', 'op', 'input_size', 'result.checksum',
       'result.duration_ns', 'result.duration_sec', 'result.inner_disk_fetch',
       'result.inner_disk_fetch_size', 'result.inner_index_build_duration_ns',
       'result.inner_index_size', 'result.inner_total_bytes_fetched',
       'result.outer_disk_fetch', 'result.outer_disk_fetch_size',
       'result.outer_total_bytes_fetched', 'spec.algo', 'spec.algo_name',
       'spec.common_key', 'spec.index.search', 'spec.index.type',
       'spec.inner_table', 'spec.key_size', 'spec.key_type',
       'spec.load_sstable_in_mem', 'spec.name', 'spec.num_threads',
       'spec.outer_table', 'spec.result_path', 'spec.value_size',
       'spec.write_result_to_disk', 'spec.index.leaf_size_in_pages'],
      dtype='object')


In [169]:
# Derive analysis-friendly columns on `test_dataframe` and publish the subset
# used by the comparison queries as the DuckDB table `results`.

# Value stored in the "epsilon" column for each algorithm name (lower-cased).
# Algorithms without a tunable value use 0.
epsilon = {
    "pgm64": 128,
    "pgm128": 256,
    "pgm256": 512,
    "pgm512": 1024,
    "pgm1024": 2048,
    "btree256": 256,
    "btree512": 512,
    "btree1024": 1024,
    "btree2048": 2048,
    "hashjoin": 0,
    "btree": 0,
    "sj": 0,
    "sj2": 0,
    "standard_merge": 0,
}
# Index family per algorithm name. Every value is a string so the resulting
# column has a single type when DuckDB scans the DataFrame.
# NOTE(review): "btree" and "standard_merge" were previously the integer 0
# (apparently copied from `epsilon`); they now use the same "NA" placeholder as
# the other non-parameterised algorithms -- confirm "NA" is the intended label.
indexType = {
    "pgm64": "PGM",
    "pgm128": "PGM",
    "pgm256": "PGM",
    "pgm512": "PGM",
    "pgm1024": "PGM",
    "btree256": "BTREE",
    "btree512": "BTREE",
    "btree1024": "BTREE",
    "btree2048": "BTREE",
    "hashjoin": "NA",
    "btree": "NA",
    "sj": "NA",
    "sj2": "NA",
    "standard_merge": "NA",
}
# Whether a dataset is treated as real-world or synthetically generated.
dataset_type = {
    "fb": "REAL",
    "wiki": "REAL",
    "osm": "REAL",
    "books": "REAL",
    "uniform_dense": "SYNTHETIC",
    "uniform_sparse": "SYNTHETIC",
    "normal": "SYNTHETIC",
    "lognormal": "SYNTHETIC",
}

# Short aliases for the dotted json_normalize column names.
test_dataframe["threads"] = test_dataframe["spec.num_threads"]
test_dataframe["duration"] = test_dataframe["result.duration_ns"]
test_dataframe["ratio"] = test_dataframe["spec.common_key"]
test_dataframe["algo"] = test_dataframe["spec.algo_name"]
test_dataframe["index_memory"] = test_dataframe["result.inner_index_size"]
test_dataframe["index_build_duration"] = test_dataframe["result.inner_index_build_duration_ns"]

# Lookups are case-insensitive on the key; an algorithm or dataset missing from
# the tables above raises KeyError rather than being silently dropped.
test_dataframe["epsilon"] = test_dataframe["spec.algo_name"].map(lambda x: epsilon[x.lower()])
test_dataframe["index_type"] = test_dataframe["spec.algo_name"].map(lambda x: indexType[x.lower()])
test_dataframe["dataset_type"] = test_dataframe["dataset"].map(lambda x: dataset_type[x.lower()])

# `df` is referenced by name inside the SQL below (DuckDB resolves it from the
# Python scope), so the variable name must stay in sync with the query text.
df = test_dataframe[["dataset", "dataset_type", "threads", "ratio", "op", "algo", "index_type", "epsilon", "index_memory", "index_build_duration", "duration"]]
# Drop first so that re-running the cell rebuilds the table instead of failing.
duckdb.sql("DROP TABLE IF EXISTS results")
duckdb.sql("CREATE TABLE results AS SELECT * FROM df")
duckdb.sql("SELECT * FROM results")


┌───────────┬──────────────┬─────────┬───────┬───┬─────────┬──────────────┬──────────────────────┬─────────────┐
│  dataset  │ dataset_type │ threads │ ratio │ … │ epsilon │ index_memory │ index_build_duration │  duration   │
│  varchar  │   varchar    │  int64  │ int64 │   │  int64  │    int64     │        int64         │    int64    │
├───────────┼──────────────┼─────────┼───────┼───┼─────────┼──────────────┼──────────────────────┼─────────────┤
│ fb        │ REAL         │       1 │    80 │ … │    1024 │       810544 │          19555204759 │  1012430921 │
│ fb        │ REAL         │       1 │    80 │ … │       0 │     35019496 │          22965679955 │ 37485553567 │
│ fb        │ REAL         │       1 │    20 │ … │    1024 │      3160008 │           1903134479 │  2467144793 │
│ fb        │ REAL         │       1 │    60 │ … │    1024 │       810544 │          19553677886 │  1231050038 │
│ fb        │ REAL         │       1 │    70 │ … │    2048 │      1586176 │           1896102198

In [171]:
def _compare_btree_pgm_metric(metric, epsilon, thread, dataset_type, ratio):
    """Compare the mean of one `results` column between PGM and BTREE indexes.

    Args:
        metric: Name of the numeric column of the `results` table to average
            (interpolated into the SQL text, so it must be a trusted identifier).
        epsilon: Value the `epsilon` column must equal.
        thread: Value the `threads` column must equal.
        dataset_type: 'REAL' or 'SYNTHETIC' to restrict the rows, or 'all' for
            no restriction on `dataset_type`.
        ratio: Value the `ratio` column must equal.

    Returns:
        pandas.DataFrame with one row per (ratio, dataset), one column per
        `index_type` present in the filtered rows, and a 'REL' column equal to
        PGM / BTREE.

    Raises:
        KeyError: if the filtered rows contain no 'PGM' or no 'BTREE' index_type.
    """
    where = f"WHERE threads={thread} AND epsilon={epsilon} AND ratio={ratio} "
    if dataset_type != "all":
        where += f" AND dataset_type='{dataset_type}' "
    query = (
        "SELECT ratio, threads, algo, index_type, dataset, dataset_type, "
        f"epsilon, MEAN({metric}) AS metric_mean FROM results "
        + where +
        "GROUP BY (ratio, threads, algo, index_type, epsilon, dataset, dataset_type) "
        "ORDER BY ratio"
    )
    # `subresult` looks unused but is referenced by name in the PIVOT query
    # below; DuckDB resolves it from this function's local variables.
    subresult = duckdb.sql(query)
    df = duckdb.sql("PIVOT subresult ON index_type USING MEAN(metric_mean) GROUP BY ratio,dataset").df()
    df['REL'] = df['PGM'] / df['BTREE']
    return df


def compare_btree_pgm_memory(epsilon, thread, dataset_type, ratio=10):
    """Relative index memory (PGM / BTREE) per dataset.

    Averages `index_memory` over the rows of `results` matching the given
    epsilon, thread count, ratio and (unless 'all') dataset_type.

    Args:
        epsilon: Value the `epsilon` column must equal.
        thread: Value the `threads` column must equal.
        dataset_type: 'REAL', 'SYNTHETIC', or 'all' for every dataset.
        ratio: Value the `ratio` column must equal; defaults to 10, the value
            that used to be hard-coded.

    Returns:
        pandas.DataFrame with columns ratio, dataset, one column per index type
        and 'REL' = PGM / BTREE.
    """
    return _compare_btree_pgm_metric("index_memory", epsilon, thread, dataset_type, ratio)


def compare_btree_pgm_build_duration(epsilon, thread, dataset_type, ratio=10):
    """Relative index build time (PGM / BTREE) per dataset.

    Averages `index_build_duration` over the rows of `results` matching the
    given epsilon, thread count, ratio and (unless 'all') dataset_type.

    Args:
        epsilon: Value the `epsilon` column must equal.
        thread: Value the `threads` column must equal.
        dataset_type: 'REAL', 'SYNTHETIC', or 'all' for every dataset.
        ratio: Value the `ratio` column must equal; defaults to 10, the value
            that used to be hard-coded.

    Returns:
        pandas.DataFrame with columns ratio, dataset, one column per index type
        and 'REL' = PGM / BTREE.
    """
    return _compare_btree_pgm_metric("index_build_duration", epsilon, thread, dataset_type, ratio)
def _rel_summary_rows(compare, epsilons=(256, 1024, 2048), thread=1):
    """Build one summary row per epsilon for a PGM-vs-BTREE comparison.

    Args:
        compare: Callable accepting `epsilon`, `thread` and `dataset_type`
            keyword arguments and returning a DataFrame with a 'REL' column.
        epsilons: Epsilon values to report, in output order.
        thread: Thread count passed through to `compare`.

    Returns:
        List of dicts with keys 'epsilon', 'ALL', 'REAL', 'SYNTHETIC'; the last
        three hold the mean of 'REL' for the corresponding dataset filter.
    """
    # Column label in the table -> dataset_type filter understood by `compare`.
    column_filters = (('ALL', 'all'), ('REAL', 'REAL'), ('SYNTHETIC', 'SYNTHETIC'))
    rows = []
    for eps in epsilons:
        row = {'epsilon': eps}
        for label, type_filter in column_filters:
            row[label] = compare(epsilon=eps, thread=thread, dataset_type=type_filter)['REL'].mean()
        rows.append(row)
    return rows


# The row lists are passed straight to pandas rather than stored in a variable
# named `dict`, which would shadow the builtin for the rest of the session.

# Table 1: index memory, PGM relative to BTREE.
print(pd.DataFrame(_rel_summary_rows(compare_btree_pgm_memory)).to_latex(index=False))

# Table 2: index build duration, PGM relative to BTREE.
print(pd.DataFrame(_rel_summary_rows(compare_btree_pgm_build_duration)).to_latex(index=False))

\begin{tabular}{rrrr}
\toprule
epsilon & ALL & REAL & SYNTHETIC \\
\midrule
256 & 0.059858 & 0.176309 & 0.001632 \\
1024 & 0.049511 & 0.145798 & 0.001368 \\
2048 & 0.040291 & 0.116214 & 0.002330 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrrr}
\toprule
epsilon & ALL & REAL & SYNTHETIC \\
\midrule
256 & 10.033322 & 8.279725 & 10.910120 \\
1024 & 10.362137 & 7.913730 & 11.586340 \\
2048 & 10.000386 & 7.935983 & 11.032588 \\
\bottomrule
\end{tabular}



## PGM vs BTREE for same epsilon (Duration)

In [129]:
def compare_btree_pgm_duration(epsilon, thread, dataset, op):
    """Relative run duration (PGM / BTREE) per ratio for one operation.

    Takes the median `duration` of the rows in `results` matching the filters,
    grouped by (ratio, threads, algo, index_type, epsilon), then pivots the
    index types into columns.

    Args:
        epsilon: Value the `epsilon` column must equal.
        thread: Value the `threads` column must equal.
        dataset: Dataset name to restrict to, or 'all' for every dataset.
        op: Value the `op` column must equal (e.g. 'join' or 'merge').

    Returns:
        pandas.DataFrame with one row per ratio, one column per `index_type`
        present in the filtered rows, and 'REL' = PGM / BTREE.

    Raises:
        KeyError: if the filtered rows contain no 'PGM' or no 'BTREE' index_type.
    """
    conditions = [f"threads={thread}", f"epsilon={epsilon}", f"op='{op}'"]
    if dataset != "all":
        conditions.append(f"dataset='{dataset}'")
    # Joining the predicates guarantees whitespace around every AND and before
    # GROUP BY; the previous concatenation produced text such as
    # "op='join'AND dataset='fb'GROUP BY".
    where = "WHERE " + " AND ".join(conditions)
    query = (
        "SELECT ratio, threads, algo, index_type, epsilon, "
        "MEDIAN(duration) AS duration_median FROM results "
        f"{where} "
        "GROUP BY (ratio, threads, algo, index_type, epsilon) "
        "ORDER BY ratio"
    )
    # `subresult` looks unused but is referenced by name in the PIVOT query
    # below; DuckDB resolves it from this function's local variables.
    subresult = duckdb.sql(query)
    # Medians of all algos sharing an index_type are summed per ratio.
    # NOTE(review): presumably each (index_type, epsilon) pair maps to a single
    # algo, making the sum just that median -- verify against the algo table.
    df = duckdb.sql("PIVOT subresult ON index_type USING sum(duration_median) GROUP BY ratio").df()
    df['REL'] = df['PGM'] / df['BTREE']
    return df
# Headline numbers: mean PGM/BTREE duration ratio across all ratios, for the
# single-threaded run at epsilon 2048.
for op_name in ('join', 'merge'):
    rel_by_ratio = compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op=op_name)['REL']
    print(f'all {op_name}', rel_by_ratio.mean())

# Full per-ratio tables, first single-threaded and then with 4 threads,
# join before merge in each case.
for thread_count in (1, 4):
    for op_name in ('join', 'merge'):
        print(compare_btree_pgm_duration(epsilon=2048, thread=thread_count, dataset='all', op=op_name))

all join 0.9765117983326244
all merge 0.919384681161711
   ratio         BTREE           PGM       REL
0     10  4.296212e+09  3.317434e+09  0.772177
1     20  2.581881e+09  2.163889e+09  0.838105
2     30  1.775476e+09  1.784577e+09  1.005126
3     40  1.438509e+09  1.528594e+09  1.062624
4     50  1.363660e+09  1.299417e+09  0.952889
5     60  1.110094e+09  1.148584e+09  1.034673
6     70  1.015309e+09  1.056628e+09  1.040696
7     80  9.389836e+08  9.907380e+08  1.055117
8     90  8.847404e+08  9.392990e+08  1.061666
9    100  9.310315e+08  8.770720e+08  0.942043
   ratio         BTREE           PGM       REL
0     10  6.652433e+09  5.392367e+09  0.810586
1     20  4.952259e+09  4.314241e+09  0.871166
2     30  4.437089e+09  3.946902e+09  0.889525
3     40  4.126136e+09  3.773944e+09  0.914644
4     50  3.906404e+09  3.637032e+09  0.931044
5     60  3.826924e+09  3.563248e+09  0.931100
6     70  3.731180e+09  3.499754e+09  0.937975
7     80  3.606507e+09  3.475135e+09  0.963573
8   