In [3]:
import os;
import json;
import builtins
import pandas as pd;
import duckdb

dir = '/home/chesetti/Repos/learned_merge_cleanup/sponge'
threads = ["1", "4", "16", "32"]
op = ['join', 'merge']
datasets = ['fb', 'wiki', 'uniform_dense', 'uniform_sparse', 'normal', 'lognormal']
test_results = []
for op in op:
    for dataset in datasets:
        for thread in threads:
            testdir = os.path.join(dir, "_".join([op, dataset, thread]), "outputs", "results")
            if (not os.path.exists(testdir)):
                continue
            rundirs = os.listdir(testdir)
            for rundir in rundirs:
                for test_result_file in os.listdir(os.path.join(testdir, rundir)):
                    json_file = builtins.open(os.path.join(testdir, rundir, test_result_file))
                    test_result = json.load(json_file)
                    test_result['run'] = rundir
                    test_result['dataset'] = dataset
                    test_result['op'] = op
                    if dataset == 'osm' or dataset == 'books':
                        test_result['input_size'] = 800_000_000 
                    else:
                        test_result['input_size'] = 200_000_000 
                    test_results.append(test_result)
                    json_file.close()
test_dataframe = pd.json_normalize(test_results)
print(test_dataframe.columns)

Index(['command', 'run', 'dataset', 'op', 'input_size', 'result.checksum',
       'result.duration_ns', 'result.duration_sec', 'result.inner_disk_fetch',
       'result.inner_disk_fetch_size', 'result.inner_total_bytes_fetched',
       'result.outer_disk_fetch', 'result.outer_disk_fetch_size',
       'result.outer_total_bytes_fetched', 'spec.algo', 'spec.algo_name',
       'spec.common_key', 'spec.index.search', 'spec.index.type',
       'spec.inner_table', 'spec.key_size', 'spec.key_type',
       'spec.load_sstable_in_mem', 'spec.name', 'spec.num_threads',
       'spec.outer_table', 'spec.result_path', 'spec.value_size',
       'spec.write_result_to_disk', 'spec.index.leaf_size_in_pages'],
      dtype='object')


In [5]:
epsilon = {
    "pgm256": 256,
    "pgm1024": 1024,
    "pgm2048": 2048,
    "btree256": 256,
    "btree512": 512,
    "btree1024": 1024,
    "btree2048": 2048,
    "hashjoin": 0,
    "btree": 0,
    "sj": 0,
    "sj2": 0,
    "standard_merge": 0
};
indexType = {
    "pgm64": "PGM",
    "pgm128": "PGM",
    "pgm256": "PGM",
    "pgm512": "PGM",
    "pgm1024": "PGM",
    "pgm2048": "PGM",
    "btree256": "BTREE",
    "btree512": "BTREE",
    "btree1024": "BTREE",
    "btree2048": "BTREE",
    "hashjoin": "NA",
    "btree": 0,
    "sj": "SJ",
    "sj2": "NA",
    "standard_merge": "SM"
};
dataset_type = {
    "fb": "REAL",
    "wiki": "REAL",
    "osm": "REAL",
    "books": "REAL",
    "uniform_dense": "SYNTHETIC",
    "uniform_sparse": "SYNTHETIC",
    "normal": "SYNTHETIC",
    "lognormal": "SYNTHETIC",
};
test_dataframe["threads"] = test_dataframe["spec.num_threads"]
test_dataframe["duration"] = test_dataframe["result.duration_ns"]
test_dataframe["ratio"] = test_dataframe["spec.common_key"]
test_dataframe["algo"] = test_dataframe["spec.algo_name"]
test_dataframe["inner_bytes_fetched"] = test_dataframe["result.inner_total_bytes_fetched"]
test_dataframe["epsilon"] = test_dataframe["spec.algo_name"].map(lambda x: epsilon[x.lower()])
test_dataframe["index_type"] = test_dataframe["spec.algo_name"].map(lambda x: indexType[x.lower()])
test_dataframe["dataset_type"] = test_dataframe["dataset"].map(lambda x: dataset_type[x.lower()])

df = test_dataframe[["dataset", "dataset_type", "threads", "ratio", "op", "algo", "index_type", "epsilon", "duration", "inner_bytes_fetched"]]
duckdb.sql("DROP TABLE IF EXISTS results"); 
duckdb.sql("CREATE TABLE results AS SELECT * FROM df")
duckdb.sql("SELECT dataset,ratio,index_type,epsilon FROM results where op='merge'")


┌───────────┬───────┬────────────┬─────────┐
│  dataset  │ ratio │ index_type │ epsilon │
│  varchar  │ int64 │  varchar   │  int64  │
├───────────┼───────┼────────────┼─────────┤
│ fb        │    50 │ PGM        │    2048 │
│ fb        │    30 │ PGM        │     256 │
│ fb        │    70 │ BTREE      │    2048 │
│ fb        │   100 │ BTREE      │    2048 │
│ fb        │    80 │ PGM        │    2048 │
│ fb        │    10 │ PGM        │     256 │
│ fb        │    80 │ BTREE      │     256 │
│ fb        │    40 │ PGM        │    1024 │
│ fb        │    30 │ SM         │       0 │
│ fb        │    40 │ SM         │       0 │
│ ·         │     · │ ·          │       · │
│ ·         │     · │ ·          │       · │
│ ·         │     · │ ·          │       · │
│ lognormal │    10 │ BTREE      │    2048 │
│ lognormal │    50 │ BTREE      │    1024 │
│ lognormal │   100 │ PGM        │    2048 │
│ lognormal │   100 │ BTREE      │    1024 │
│ lognormal │   100 │ BTREE      │     256 │
│ lognorma

In [56]:
def group_by_threads(op, ratio):
    query = (" SELECT ratio,index_type,threads, MEDIAN(duration) as v"
        f" FROM results where op='join' AND ratio={ratio} AND (epsilon=2048 OR epsilon=0) AND index_type!='NA'"
        " GROUP BY dataset, ratio, index_type, threads "
        " ORDER BY Threads")
    print(duckdb.sql(query))
group_by_threads('join', 10)

 SELECT ratio,index_type,threads, MEDIAN(duration) as v FROM results where op='join' AND ratio=10 AND (epsilon=2048 OR epsilon=0) AND index_type!='NA' GROUP BY dataset, ratio, index_type, threads  ORDER BY Threads


## PGM Index Memory Study

In [113]:
def compare_btree_pgm_memory(epsilon, thread, dataset_type):
        where = f"WHERE threads={thread} AND epsilon={epsilon} AND ratio=10 "
        if dataset_type != "all":
                where = where + f" AND dataset_type='{dataset_type}' "
        query = ("SELECT ratio, threads, algo, index_type, dataset, dataset_type,"
                "epsilon, MEAN(index_memory) as memory_mean, FROM results " 
                + where +
                "GROUP BY (ratio, threads, algo, index_type, epsilon, dataset, dataset_type) "
                "ORDER BY ratio")
        subresult = duckdb.sql(query)
        df = duckdb.sql("PIVOT subresult ON index_type USING MEAN(memory_mean) GROUP BY ratio,dataset").df()
        df['REL'] = (df['PGM']) / df['BTREE']
        return df

def compare_btree_pgm_build_duration(epsilon, thread, dataset_type):
        where = f"WHERE threads={thread} AND epsilon={epsilon} AND ratio=10 "
        if dataset_type != "all":
                where = where + f" AND dataset_type='{dataset_type}' "
        query = ("SELECT ratio, threads, algo, index_type, dataset, dataset_type,"
                "epsilon, MEAN(index_build_duration) as index_build_duration, FROM results " 
                + where +
                "GROUP BY (ratio, threads, algo, index_type, epsilon, dataset, dataset_type) "
                "ORDER BY ratio")
        subresult = duckdb.sql(query)
        df = duckdb.sql("PIVOT subresult ON index_type USING MEAN(index_build_duration) GROUP BY ratio,dataset").df()
        df['REL'] = (df['PGM']) / df['BTREE']
        return df
dict = []
dict.append({
        'epsilon':256, 
        'ALL': compare_btree_pgm_memory(epsilon=256, thread=1, dataset_type='all')['REL'].mean(),
        'REAL': compare_btree_pgm_memory(epsilon=256, thread=1, dataset_type='REAL')['REL'].mean(),
        'SYNTHETIC': compare_btree_pgm_memory(epsilon=256, thread=1, dataset_type='SYNTHETIC')['REL'].mean(),
        'ACTUAL': compare_btree_pgm_memory(epsilon=256, thread=1, dataset_type='all')['PGM'].mean(),
        })
dict.append({
        'epsilon':1024, 
        'ALL': compare_btree_pgm_memory(epsilon=1024, thread=1, dataset_type='all')['REL'].mean(),
        'REAL': compare_btree_pgm_memory(epsilon=1024, thread=1, dataset_type='REAL')['REL'].mean(),
        'SYNTHETIC': compare_btree_pgm_memory(epsilon=1024, thread=1, dataset_type='SYNTHETIC')['REL'].mean(),
        'ACTUAL': compare_btree_pgm_memory(epsilon=1024, thread=1, dataset_type='all')['PGM'].mean(),
        })
dict.append({
        'epsilon': 2048, 
        'ALL': compare_btree_pgm_memory(epsilon=2048, thread=1, dataset_type='all')['REL'].mean(),
        'REAL': compare_btree_pgm_memory(epsilon=2048, thread=1, dataset_type='REAL')['REL'].mean(),
        'SYNTHETIC': compare_btree_pgm_memory(epsilon=2048, thread=1, dataset_type='SYNTHETIC')['REL'].mean(),
        'ACTUAL': compare_btree_pgm_memory(epsilon=2048, thread=1, dataset_type='all')['PGM'].mean(),
        })

print(pd.DataFrame(dict).to_latex(index=False))

dict = []
dict.append({
        'epsilon':256, 
        'ALL': compare_btree_pgm_build_duration(epsilon=256, thread=1, dataset_type='all')['REL'].mean(),
        'REAL': compare_btree_pgm_build_duration(epsilon=256, thread=1, dataset_type='REAL')['REL'].mean(),
        'SYNTHETIC': compare_btree_pgm_build_duration(epsilon=256, thread=1, dataset_type='SYNTHETIC')['REL'].mean()
        })
dict.append({
        'epsilon':1024, 
        'ALL': compare_btree_pgm_build_duration(epsilon=1024, thread=1, dataset_type='all')['REL'].mean(),
        'REAL': compare_btree_pgm_build_duration(epsilon=1024, thread=1, dataset_type='REAL')['REL'].mean(),
        'SYNTHETIC': compare_btree_pgm_build_duration(epsilon=1024, thread=1, dataset_type='SYNTHETIC')['REL'].mean()
        })
dict.append({
        'epsilon': 2048, 
        'ALL': compare_btree_pgm_build_duration(epsilon=2048, thread=1, dataset_type='all')['REL'].mean(),
        'REAL': compare_btree_pgm_build_duration(epsilon=2048, thread=1, dataset_type='REAL')['REL'].mean(),
        'SYNTHETIC': compare_btree_pgm_build_duration(epsilon=2048, thread=1, dataset_type='SYNTHETIC')['REL'].mean()
        })
print(pd.DataFrame(dict).to_latex(index=False))

\begin{tabular}{rrrrr}
\toprule
epsilon & ALL & REAL & SYNTHETIC & ACTUAL \\
\midrule
256 & 0.059858 & 0.176309 & 0.001632 & 755868.000000 \\
1024 & 0.049511 & 0.145798 & 0.001368 & 156456.000000 \\
2048 & 0.040291 & 0.116214 & 0.002330 & 63228.000000 \\
\bottomrule
\end{tabular}

\begin{tabular}{rrrr}
\toprule
epsilon & ALL & REAL & SYNTHETIC \\
\midrule
256 & 10.033322 & 8.279725 & 10.910120 \\
1024 & 10.362137 & 7.913730 & 11.586340 \\
2048 & 10.000386 & 7.935983 & 11.032588 \\
\bottomrule
\end{tabular}



## PGM vs TREE SJ same epsilon (Duration)

In [110]:
def compare_btree_pgm_duration(epsilon, thread, dataset, op):
        where = f"WHERE threads={thread} AND op='{op}' AND (epsilon={epsilon} OR algo='sj' OR algo='standard_merge')"
        if dataset != "all":
                where = where + f"AND dataset='{dataset}'"
        query = ("SELECT ratio, threads, algo, index_type,"
                "epsilon, MEDIAN(duration) as duration_mean, FROM results " 
                + where +
                "GROUP BY (ratio, threads, algo, index_type, epsilon) "
                "ORDER BY ratio")
        subresult = duckdb.sql(query)
        print(subresult)
        df = duckdb.sql("PIVOT subresult ON index_type USING sum(duration_mean) GROUP BY ratio").df()
        print(df)
        baseline = 'SJ' if op == 'join' else 'SM'
        df['RATIO'] = df['ratio']
        if 'PGM' in df.columns:
                df['PGM_REL'] = (df['PGM']/ df[baseline])
        if 'BTREE' in df.columns:
                df['BTREE_REL'] = (df['BTREE']/ df[baseline])
        return df

def compare_pgm_bytes_fetched(epsilon, thread, dataset, op):
        where = f"WHERE threads={thread} AND op='{op}' AND (epsilon={epsilon} OR algo='sj' OR algo='standard_merge')"
        if dataset != "all":
                where = where + f"AND dataset='{dataset}'"
        query = ("SELECT ratio, threads, algo, index_type,"
                "epsilon, MEDIAN(inner_bytes_fetched) as median, FROM results " 
                + where +
                "GROUP BY (ratio, threads, algo, index_type, epsilon) "
                "ORDER BY ratio")
        subresult = duckdb.sql(query)
        df = duckdb.sql("PIVOT subresult ON index_type USING AVG(median) GROUP BY ratio").df()
        baseline = 'SJ' if op == 'join' else 'SM'
        df['RATIO'] = df['ratio']
        df['BTREE_REL'] = (df['BTREE']/ df[baseline])
        df['PGM_REL'] = (df['PGM']/ df[baseline])
        return df


In [None]:

df = pd.DataFrame();
df['ratio'] = (compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='join')['RATIO'])
df['t=1'] = (compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='join')['PGM_REL'])
df['t=4'] = (compare_btree_pgm_duration(epsilon=2048, thread=4, dataset='all', op='join')['PGM_REL'])
df['t=16'] = (compare_btree_pgm_duration(epsilon=2048, thread=16, dataset='all', op='join')['PGM_REL'])
df['t=32'] = (compare_btree_pgm_duration(epsilon=2048, thread=32, dataset='all', op='join')['PGM_REL'])
print(df.to_latex(index=False))

df = pd.DataFrame();
df['ratio'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=1, dataset='all', op='join')['RATIO'])
df['t=1'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=1, dataset='all', op='join')['PGM_REL'])
df['t=4'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=4, dataset='all', op='join')['PGM_REL'])
df['t=16'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=16, dataset='all', op='join')['PGM_REL'])
df['t=32'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=32, dataset='all', op='join')['PGM_REL'])
print(df.to_latex(index=False))

df = pd.DataFrame();
df['ratio'] = (compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='merge')['RATIO'])
df['t=1'] = (compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='merge')['PGM_REL'])
df['t=4'] = (compare_btree_pgm_duration(epsilon=2048, thread=4, dataset='all', op='merge')['PGM_REL'])
df['t=16'] = (compare_btree_pgm_duration(epsilon=2048, thread=16, dataset='all', op='merge')['PGM_REL'])
df['t=32'] = (compare_btree_pgm_duration(epsilon=2048, thread=32, dataset='all', op='merge')['PGM_REL'])
print(df.to_latex(index=False))

df = pd.DataFrame();
df['ratio'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=1, dataset='all', op='merge')['RATIO'])
df['t=1'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=1, dataset='all', op='merge')['PGM_REL'])
df['t=4'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=4, dataset='all', op='merge')['PGM_REL'])
df['t=16'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=16, dataset='all', op='merge')['PGM_REL'])
df['t=32'] = (compare_pgm_bytes_fetched(epsilon=2048, thread=32, dataset='all', op='merge')['PGM_REL'])
print(df.to_latex(index=False))



In [None]:
f = compare_btree_pgm_duration
df = []
df.append({'epsilon': 256, 'index': 'PGM', 't': 1, 'value': compare_btree_pgm_duration(epsilon=256, thread=1, dataset='all', op='join')['PGM_REL'].mean()})
df.append({'epsilon': 1024, 'index': 'PGM', 't': 1, 'value': compare_btree_pgm_duration(epsilon=1024, thread=1, dataset='all', op='join')['PGM_REL'].mean()})
df.append({'epsilon': 2048, 'index': 'PGM', 't': 1, 'value': compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='join')['PGM_REL'].mean()})

df.append({'epsilon': 256, 'index': 'BTREE', 't': 1, 'value': compare_btree_pgm_duration(epsilon=256, thread=1, dataset='all', op='join')['BTREE_REL'].mean()})
df.append({'epsilon': 1024, 'index': 'BTREE', 't': 1, 'value': compare_btree_pgm_duration(epsilon=1024, thread=1, dataset='all', op='join')['BTREE_REL'].mean()})
df.append({'epsilon': 2048, 'index': 'BTREE', 't': 1, 'value': compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='join')['BTREE_REL'].mean()})

df = pd.DataFrame(df)
print(df.pivot(index='epsilon', columns='index', values='value').to_latex())

f = compare_btree_pgm_duration
df = []
df.append({'epsilon': 256, 'index': 'PGM', 't': 1, 'value': compare_btree_pgm_duration(epsilon=256, thread=1, dataset='all', op='merge')['PGM_REL'].mean()})
df.append({'epsilon': 2048, 'index': 'PGM', 't': 1, 'value': compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='merge')['PGM_REL'].mean()})

df.append({'epsilon': 256, 'index': 'BTREE', 't': 1, 'value': compare_btree_pgm_duration(epsilon=256, thread=1, dataset='all', op='merge')['BTREE_REL'].mean()})
df.append({'epsilon': 2048, 'index': 'BTREE', 't': 1, 'value': compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='merge')['BTREE_REL'].mean()})

df = pd.DataFrame(df)
print(df.pivot(index='epsilon', columns='index', values='value').to_latex())

In [112]:
f = compare_btree_pgm_duration
df = []

df.append({'epsilon': 2048, 'index': 'PGM', 't': 1, 'value': compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='merge')['PGM_REL'].mean()})

df.append({'epsilon': 512, 'index': 'PGM', 't': 1, 'value': compare_btree_pgm_duration(epsilon=512, thread=1, dataset='all', op='merge')['PGM_REL'].mean()})
df.append({'epsilon': 2048, 'index': 'BTREE', 't': 1, 'value': compare_btree_pgm_duration(epsilon=2048, thread=1, dataset='all', op='merge')['BTREE_REL'].mean()})
df.append({'epsilon': 256, 'index': 'BTREE', 't': 1, 'value': compare_btree_pgm_duration(epsilon=256, thread=1, dataset='all', op='merge')['BTREE_REL'].mean()})

df = pd.DataFrame(df)
print(df.pivot(index='epsilon', columns='index', values='value').to_latex())

┌───────┬─────────┬────────────────┬────────────┬─────────┬───────────────┐
│ ratio │ threads │      algo      │ index_type │ epsilon │ duration_mean │
│ int64 │  int64  │    varchar     │  varchar   │  int64  │    double     │
├───────┼─────────┼────────────────┼────────────┼─────────┼───────────────┤
│    10 │       1 │ BTree2048      │ BTREE      │    2048 │  6652433206.0 │
│    10 │       1 │ PGM1024        │ PGM        │    2048 │  5392367428.5 │
│    10 │       1 │ standard_merge │ SM         │       0 │  7471977709.5 │
│    20 │       1 │ BTree2048      │ BTREE      │    2048 │  4952258535.0 │
│    20 │       1 │ standard_merge │ SM         │       0 │  7080549651.0 │
│    20 │       1 │ PGM1024        │ PGM        │    2048 │  4314241449.5 │
│    30 │       1 │ BTree2048      │ BTREE      │    2048 │  4437088769.0 │
│    30 │       1 │ PGM1024        │ PGM        │    2048 │  3946901609.0 │
│    30 │       1 │ standard_merge │ SM         │       0 │  6920048575.5 │
│    40 │   