In [151]:
#!/usr/bin/python

import pprint
import pandas as pd

pp = pprint.PrettyPrinter(indent=2)
runs = []

"Parse 'List Sizes: %d %d'"
def parse_list_size(l):
    tokens = l.split()
    list_size_1 = int(tokens[2].strip())
    list_size_2 = int(tokens[3].strip())
    return (list_size_1, list_size_2)

"Parse 'Use Disk: %d'"
def parse_is_disk(l):
    tokens = l.split()
    if tokens[2] == "0":
        return False
    else:
        return True

"Parse 'PLR_Error: %d'"
def parse_plr_error(l):
    tokens = l.split()
    return int(tokens[1].strip())

"Parse 'Key Bytes: %d'"
def parse_key_bytes(l):
    tokens = l.split()
    return int(tokens[2].strip())

"Parse 'Num Threads: %d'"
def parse_num_threads(l):
    tokens = l.split()
    return int(tokens[2].strip())

"Parse 'merge duration: %d'"
def parse_merge_duration(l):
    tokens = l.split()
    return float(tokens[2].strip())

"Parse 'Iterator x creation duration time: %d sec'"
def parse_iter_creation_duration(l):
    tokens = l.split()
    return float(tokens[5].strip())

"Parse 'Iterator x model size bytes: %d'"
def parse_iter_model_size(l):
    tokens = l.split()
    return int(tokens[5].strip())

def parse_file(f, test_id):
    file = open('./run_10M_bigrun.txt', 'r')
    lines = file.readlines()
    run = {}
    run['test_id'] = test_id
    run['num_threads'] = 1
    run['success'] = False
    test_cases = {}
    lines.append("merge_mode: end")
    for l in lines:
        if l.startswith('run'):
            continue
        if l.startswith("merge_mode:"):
            if run and run['success']:
                tc = {}
                tc = (run['list_1_size'], run['list_2_size'], run['use_disk'], run['key_bytes'])
                tc_id = str(tc)
                if tc_id not in test_cases.keys():
                    test_cases[tc_id] = {}
                    test_cases[tc_id]['id'] = str(tc)
                    test_cases[tc_id]['key_bytes'] = run['key_bytes']
                    test_cases[tc_id]['use_disk'] = run['use_disk']
                    test_cases[tc_id]['list_1_size'] = run['list_1_size']
                    test_cases[tc_id]['list_2_size'] = run['list_2_size']
                    test_cases[tc_id]['list_2_size'] = run['list_2_size']
                    test_cases[tc_id]['ratio'] = run['list_2_size']/run['list_1_size']
                col = run['merge_mode']
                if 'num_threads' in run.keys() and run['num_threads'] != 1:
                    col = col + ",threads="+ str(run['num_threads'])
                if 'plr_error_bound' in run.keys():
                    col = col + ",plr_error="+ str(run['plr_error_bound'])
                    test_cases[tc_id]['plr_error_bound'] = run['plr_error_bound'] 
                test_cases[tc_id][col] = run['merge_duration_sec']
                test_cases[tc_id][col + '_model_size_KB'] = run['iter_1_model_size_bytes']//1024
                test_cases[tc_id][col + '_training_overhead'] = run['iter_1_creation_sec']

                run['num_threads'] = 1
                run['success'] = False
                run['plr_error_bound'] = 'NA'
                run['test_id'] = test_id
            run['merge_mode'] = l.split()[1]

        if l.startswith("Ok!"):
            run['success'] = True
        if l.startswith("List Sizes: "):
            run['list_1_size'] = parse_list_size(l)[0]
            run['list_2_size'] = parse_list_size(l)[1]
            run['ratio'] = run['list_2_size']/run['list_1_size']
        if l.startswith("Use Disk: "):
            run['use_disk'] = parse_is_disk(l)
        if l.startswith("PLR_Error"):
            run['plr_error_bound'] = parse_plr_error(l)
        if l.startswith("Key Bytes"):
            run['key_bytes'] = parse_key_bytes(l)
        if l.startswith("Num Threads"):
            run['num_threads'] = parse_num_threads(l)
        if l.startswith("Ok!"):
            run['success'] = True
        if l.startswith("Merge duration"):
            run['merge_duration_sec'] = parse_merge_duration(l)
        if l.startswith("Iterator 0 creation"):
            run['iter_0_creation_sec'] = parse_iter_creation_duration(l)
        if l.startswith("Iterator 1 creation"):
            run['iter_1_creation_sec'] = parse_iter_creation_duration(l)
        if l.startswith("Iterator 0 model size"):
            run['iter_0_model_size_bytes'] = parse_iter_model_size(l)
        if l.startswith("Iterator 1 model size"):
            run['iter_1_model_size_bytes'] = parse_iter_model_size(l)
        if l.startswith("Merge duration"):
            run['merge_duration_sec'] = parse_merge_duration(l)

    rows = [tc[1] for tc in test_cases.items()]
    df = pd.DataFrame.from_dict(rows)
    
    return df

def extract_testcase(df, key_bytes, use_disk, baseline, others):
    condition = (df['key_bytes'] == key_bytes) & (df['use_disk']==use_disk)
    columns1 = ['list_1_size', 'list_2_size', 'ratio']
    columns1.append(baseline)
    columns2 = ['list_1_size', 'list_2_size', 'ratio']
    columns2.append(baseline + ' %')
    df[baseline + ' %'] = (df[baseline] - df[baseline])/df[baseline] * 100.0
    for o in others:
        columns1.append(o)
        columns2.append(o + " %")
        df[o + ' %'] = (df[baseline] - df[o])/df[baseline] * 100.0
    return (df.loc[condition][columns1].sort_values('list_2_size'), df.loc[condition][columns2].sort_values('list_2_size'))

def get_model_sizes(df, key_bytes):
    condition = (df['key_bytes'] == key_bytes)
    columns = ['list_2_size', 'plr_error_bound', 'iter_1_model_size_bytes', 'iter_1_creation_sec']
    return (df.loc[condition][columns].sort_values('list_2_size'))



df = parse_file('./run_10M_bigrun.txt', 2)  
print(df.columns)

key_sizes = [8]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 1, 'standard,plr_error=NA_model_size_KB', ['learned,plr_error=2_model_size_KB', 'learned,plr_error=10_model_size_KB', 'learned,plr_error=100_model_size_KB', 'learned,plr_error=1000_model_size_KB'])
    display(d[0])

key_sizes = [8]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 1, 'standard,plr_error=NA_training_overhead', ['learned,plr_error=2_training_overhead', 'learned,plr_error=10_training_overhead', 'learned,plr_error=100_training_overhead', 'learned,plr_error=1000_training_overhead'])
    display(d[0])


key_sizes = [8, 16, 32]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 1, 'standard,plr_error=NA', ['learned,plr_error=2', 'learned,plr_error=10', 'learned,plr_error=100', 'learned,plr_error=1000'])
    display(d[0])
    display(d[1])
    

    
print('PARALLEL')
key_sizes = [8, 16, 32]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 0, 'parallel_standard,threads=4,plr_error=NA', ['parallel_learned,threads=4,plr_error=2','parallel_learned,threads=4,plr_error=10','parallel_learned,threads=4,plr_error=100', 'parallel_learned,threads=4,plr_error=1000'])
    display(d[0])
    display(d[1])

print('JOINS')
key_sizes = [8, 16, 32]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 1, 'standard_join,plr_error=NA', ['learned_join,plr_error=2', 'learned_join,plr_error=10', 'learned_join,plr_error=100', 'learned_join,plr_error=1000'])
    display(d[0])
    display(d[1])

    
'''
key_sizes = [8, 16, 32]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 1, 'standard,plr_error=NA', ['learned,plr_error=2', 'learned,plr_error=10', 'learned,plr_error=100'])
    display(d[0])
    display(d[1])

key_sizes = [8, 16, 32]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 0, 'parallel_standard,threads=4,plr_error=NA', ['parallel_learned,threads=4,plr_error=2','parallel_learned,threads=4,plr_error=10','parallel_learned,threads=4,plr_error=100'])
    display(d[0])
    display(d[1])
    
key_sizes = [8, 16, 32]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 1, 'parallel_standard,threads=4,plr_error=NA', ['parallel_learned,threads=4,plr_error=2','parallel_learned,threads=4,plr_error=10','parallel_learned,threads=4,plr_error=100'])
    display(d[0])
    display(d[1])

key_sizes = [8, 16, 32]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 0, 'standard_join,plr_error=NA', ['learned_join,plr_error=2', 'learned_join,plr_error=10', 'learned_join,plr_error=100'])
    display(d[0])
    display(d[1])

key_sizes = [8, 16, 32]
for key_size in key_sizes:
    d = extract_testcase(df, key_size, 1, 'standard_join,plr_error=NA', ['learned_join,plr_error=2', 'learned_join,plr_error=10', 'learned_join,plr_error=100'])
    display(d[0])
    display(d[1])



#display(extract_testcase(df, 16, 1, 'standard,plr_error=NA', ['learned,plr_error=2', 'learned,plr_error=10', 'learned,plr_error=100']))
#display(extract_testcase(df, 32, 1, 'standard,plr_error=NA', ['learned,plr_error=2', 'learned,plr_error=10', 'learned,plr_error=100']))
'''

'''df2 = parse_file('./run_10M.txt', 1)
print(df.columns)
standard_vs_learned = df.loc[(df['key_bytes']==8) & (df['use_disk']==0)][['list_1_size', 'list_2_size', 'ratio', 'standard,plr_error=NA', 'learned,plr_error=10', 'learned,plr_error=100']].sort_values('list_2_size')
display(standard_vs_learned)

standard_vs_learned = df.loc[(df['key_bytes']==8) & (df['use_disk']==0)][['list_1_size', 'list_2_size', 'ratio', 'standard_join,plr_error=NA', 'learned_join,plr_error=10', 'learned_join,plr_error=100']].sort_values('list_2_size')
display(standard_vs_learned)

standard_vs_learned = df.loc[(df['key_bytes']==8) & (df['use_disk']==1)][['list_1_size', 'list_2_size', 'ratio', 'standard_join,plr_error=NA', 'learned_join,plr_error=10', 'learned_join,plr_error=100']].sort_values('list_2_size')
display(standard_vs_learned)

#standard_vs_learned_join = df[['id','standard_join', 'learned_join']]
#display(standard_vs_learned_join)
'''


Index(['id', 'key_bytes', 'use_disk', 'list_1_size', 'list_2_size', 'ratio',
       'plr_error_bound', 'parallel_learned,threads=4,plr_error=10',
       'parallel_learned,threads=4,plr_error=10_model_size_KB',
       'parallel_learned,threads=4,plr_error=10_training_overhead',
       'learned_join,plr_error=100',
       'learned_join,plr_error=100_model_size_KB',
       'learned_join,plr_error=100_training_overhead', 'learned,plr_error=10',
       'learned,plr_error=10_model_size_KB',
       'learned,plr_error=10_training_overhead', 'learned_join,plr_error=10',
       'learned_join,plr_error=10_model_size_KB',
       'learned_join,plr_error=10_training_overhead',
       'parallel_standard,threads=4,plr_error=NA',
       'parallel_standard,threads=4,plr_error=NA_model_size_KB',
       'parallel_standard,threads=4,plr_error=NA_training_overhead',
       'learned_join,plr_error=2', 'learned_join,plr_error=2_model_size_KB',
       'learned_join,plr_error=2_training_overhead', 'standard,plr

Unnamed: 0,list_1_size,list_2_size,ratio,"standard,plr_error=NA_model_size_KB","learned,plr_error=2_model_size_KB","learned,plr_error=10_model_size_KB","learned,plr_error=100_model_size_KB","learned,plr_error=1000_model_size_KB"
27,10000000,10000000,1.0,0,28018,2325,28,0
28,10000000,100000000,10.0,0,280321,23191,280,2
5,10000000,500000000,50.0,0,1401593,116047,1405,14
14,10000000,600000000,60.0,0,1682112,139313,1682,17
26,10000000,800000000,80.0,0,2242684,185714,2246,22
25,10000000,1000000000,100.0,0,2803064,232107,2802,28


Unnamed: 0,list_1_size,list_2_size,ratio,"standard,plr_error=NA_training_overhead","learned,plr_error=2_training_overhead","learned,plr_error=10_training_overhead","learned,plr_error=100_training_overhead","learned,plr_error=1000_training_overhead"
27,10000000,10000000,1.0,0.167,0.691,0.706,0.441,0.695
28,10000000,100000000,10.0,1.332,5.527,4.845,4.471,4.728
5,10000000,500000000,50.0,5.19,27.222,24.221,22.298,22.042
14,10000000,600000000,60.0,6.249,32.555,29.071,26.998,26.666
26,10000000,800000000,80.0,8.379,44.067,38.567,36.294,35.197
25,10000000,1000000000,100.0,10.409,54.52,48.991,44.53,46.077


Unnamed: 0,list_1_size,list_2_size,ratio,"standard,plr_error=NA","learned,plr_error=2","learned,plr_error=10","learned,plr_error=100","learned,plr_error=1000"
27,10000000,10000000,1.0,0.888,1.292,1.315,2.318,13.683
28,10000000,100000000,10.0,3.065,3.317,3.61,5.628,23.104
5,10000000,500000000,50.0,14.165,10.519,10.012,13.724,32.17
14,10000000,600000000,60.0,16.878,11.132,12.506,15.375,35.025
26,10000000,800000000,80.0,22.443,14.718,14.607,17.651,37.6
25,10000000,1000000000,100.0,28.211,17.315,17.887,20.88,41.412


Unnamed: 0,list_1_size,list_2_size,ratio,"standard,plr_error=NA %","learned,plr_error=2 %","learned,plr_error=10 %","learned,plr_error=100 %","learned,plr_error=1000 %"
27,10000000,10000000,1.0,0.0,-45.495495,-48.085586,-161.036036,-1440.878378
28,10000000,100000000,10.0,0.0,-8.22186,-17.781403,-83.621533,-653.800979
5,10000000,500000000,50.0,0.0,25.739499,29.318743,3.113307,-127.109072
14,10000000,600000000,60.0,0.0,34.044318,25.903543,8.905084,-107.518663
26,10000000,800000000,80.0,0.0,34.420532,34.915118,21.351869,-67.535534
25,10000000,1000000000,100.0,0.0,38.623232,36.595654,25.986317,-46.793804


Unnamed: 0,list_1_size,list_2_size,ratio,"standard,plr_error=NA","learned,plr_error=2","learned,plr_error=10","learned,plr_error=100","learned,plr_error=1000"
13,5000000,5000000,1.0,0.552,0.772,0.937,1.469,7.122
35,5000000,50000000,10.0,1.884,2.245,2.413,3.499,12.247
8,5000000,250000000,50.0,8.735,6.815,7.003,8.456,19.26
33,5000000,300000000,60.0,10.431,8.003,8.123,9.575,22.169
19,5000000,400000000,80.0,13.759,10.498,10.61,11.905,23.244
30,5000000,500000000,100.0,18.451,12.437,12.6,14.152,25.906


Unnamed: 0,list_1_size,list_2_size,ratio,"standard,plr_error=NA %","learned,plr_error=2 %","learned,plr_error=10 %","learned,plr_error=100 %","learned,plr_error=1000 %"
13,5000000,5000000,1.0,0.0,-39.855072,-69.746377,-166.123188,-1190.217391
35,5000000,50000000,10.0,0.0,-19.161359,-28.078556,-85.721868,-550.053079
8,5000000,250000000,50.0,0.0,21.980538,19.828277,3.194047,-120.492272
33,5000000,300000000,60.0,0.0,23.276771,22.126354,8.206308,-112.529959
19,5000000,400000000,80.0,0.0,23.70085,22.886838,13.474816,-68.936696
30,5000000,500000000,100.0,0.0,32.594439,31.711018,23.29955,-40.404314


Unnamed: 0,list_1_size,list_2_size,ratio,"standard,plr_error=NA","learned,plr_error=2","learned,plr_error=10","learned,plr_error=100","learned,plr_error=1000"
29,2500000,2500000,1.0,0.236,0.564,0.754,0.92,4.745
12,2500000,25000000,10.0,1.307,1.718,1.81,2.447,7.574
0,2500000,125000000,50.0,6.076,5.392,5.56,6.587,11.811
32,2500000,150000000,60.0,7.273,6.239,6.363,7.44,12.815
6,2500000,200000000,80.0,9.492,8.011,8.037,8.976,14.866
18,2500000,250000000,100.0,12.248,9.788,9.787,12.224,16.801


Unnamed: 0,list_1_size,list_2_size,ratio,"standard,plr_error=NA %","learned,plr_error=2 %","learned,plr_error=10 %","learned,plr_error=100 %","learned,plr_error=1000 %"
29,2500000,2500000,1.0,0.0,-138.983051,-219.491525,-289.830508,-1910.59322
12,2500000,25000000,10.0,0.0,-31.44606,-38.48508,-87.222647,-479.495027
0,2500000,125000000,50.0,0.0,11.257406,8.492429,-8.410138,-94.387755
32,2500000,150000000,60.0,0.0,14.216967,12.512031,-2.296164,-76.199643
6,2500000,200000000,80.0,0.0,15.602613,15.328698,5.436157,-56.616098
18,2500000,250000000,100.0,0.0,20.084912,20.093076,0.19595,-37.173416


PARALLEL


Unnamed: 0,list_1_size,list_2_size,ratio,"parallel_standard,threads=4,plr_error=NA","parallel_learned,threads=4,plr_error=2","parallel_learned,threads=4,plr_error=10","parallel_learned,threads=4,plr_error=100","parallel_learned,threads=4,plr_error=1000"
21,10000000,10000000,1.0,0.184,0.438,0.469,1.03,6.328
34,10000000,100000000,10.0,0.909,1.213,1.585,2.37,12.111
11,10000000,500000000,50.0,4.325,2.741,2.767,3.541,13.601
17,10000000,600000000,60.0,4.188,3.089,3.891,3.553,13.851
15,10000000,800000000,80.0,5.12,3.588,4.292,4.215,20.522
7,10000000,1000000000,100.0,7.533,5.172,4.097,4.611,15.213


Unnamed: 0,list_1_size,list_2_size,ratio,"parallel_standard,threads=4,plr_error=NA %","parallel_learned,threads=4,plr_error=2 %","parallel_learned,threads=4,plr_error=10 %","parallel_learned,threads=4,plr_error=100 %","parallel_learned,threads=4,plr_error=1000 %"
21,10000000,10000000,1.0,0.0,-138.043478,-154.891304,-459.782609,-3339.130435
34,10000000,100000000,10.0,0.0,-33.443344,-74.367437,-160.726073,-1232.343234
11,10000000,500000000,50.0,0.0,36.624277,36.023121,18.127168,-214.473988
17,10000000,600000000,60.0,0.0,26.241643,7.091691,15.162369,-230.730659
15,10000000,800000000,80.0,0.0,29.921875,16.171875,17.675781,-300.820312
7,10000000,1000000000,100.0,0.0,31.342095,45.612638,38.789327,-101.951414


Unnamed: 0,list_1_size,list_2_size,ratio,"parallel_standard,threads=4,plr_error=NA","parallel_learned,threads=4,plr_error=2","parallel_learned,threads=4,plr_error=10","parallel_learned,threads=4,plr_error=100","parallel_learned,threads=4,plr_error=1000"
31,5000000,5000000,1.0,0.105,0.255,0.262,0.605,3.417
23,5000000,50000000,10.0,0.522,0.714,0.75,1.391,4.588
1,5000000,250000000,50.0,2.13,1.846,2.004,2.42,6.348
10,5000000,300000000,60.0,2.908,2.119,2.201,2.982,6.065
3,5000000,400000000,80.0,4.14,2.547,2.996,3.046,9.639
22,5000000,500000000,100.0,3.891,3.892,3.029,3.251,8.269


Unnamed: 0,list_1_size,list_2_size,ratio,"parallel_standard,threads=4,plr_error=NA %","parallel_learned,threads=4,plr_error=2 %","parallel_learned,threads=4,plr_error=10 %","parallel_learned,threads=4,plr_error=100 %","parallel_learned,threads=4,plr_error=1000 %"
31,5000000,5000000,1.0,0.0,-142.857143,-149.52381,-476.190476,-3154.285714
23,5000000,50000000,10.0,0.0,-36.781609,-43.678161,-166.475096,-778.927203
1,5000000,250000000,50.0,0.0,13.333333,5.915493,-13.615023,-198.028169
10,5000000,300000000,60.0,0.0,27.13205,24.312242,-2.544704,-108.562586
3,5000000,400000000,80.0,0.0,38.478261,27.63285,26.425121,-132.826087
22,5000000,500000000,100.0,0.0,-0.0257,22.153688,16.448214,-112.516063


Unnamed: 0,list_1_size,list_2_size,ratio,"parallel_standard,threads=4,plr_error=NA","parallel_learned,threads=4,plr_error=2","parallel_learned,threads=4,plr_error=10","parallel_learned,threads=4,plr_error=100","parallel_learned,threads=4,plr_error=1000"
9,2500000,2500000,1.0,0.073,0.188,0.19,0.432,1.244
24,2500000,25000000,10.0,0.325,0.447,0.469,0.648,1.558
4,2500000,125000000,50.0,1.599,1.302,1.408,1.343,4.508
16,2500000,150000000,60.0,1.848,1.564,1.575,2.003,3.823
20,2500000,200000000,80.0,3.181,1.943,2.062,2.087,5.862
2,2500000,250000000,100.0,2.938,2.938,2.36,2.488,3.532


Unnamed: 0,list_1_size,list_2_size,ratio,"parallel_standard,threads=4,plr_error=NA %","parallel_learned,threads=4,plr_error=2 %","parallel_learned,threads=4,plr_error=10 %","parallel_learned,threads=4,plr_error=100 %","parallel_learned,threads=4,plr_error=1000 %"
9,2500000,2500000,1.0,0.0,-157.534247,-160.273973,-491.780822,-1604.109589
24,2500000,25000000,10.0,0.0,-37.538462,-44.307692,-99.384615,-379.384615
4,2500000,125000000,50.0,0.0,18.574109,11.944966,16.010006,-181.926204
16,2500000,150000000,60.0,0.0,15.367965,14.772727,-8.387446,-106.872294
20,2500000,200000000,80.0,0.0,38.918579,35.177617,34.391701,-84.281672
2,2500000,250000000,100.0,0.0,0.0,19.673247,15.316542,-20.217835


JOINS


Unnamed: 0,list_1_size,list_2_size,ratio,"standard_join,plr_error=NA","learned_join,plr_error=2","learned_join,plr_error=10","learned_join,plr_error=100","learned_join,plr_error=1000"
27,10000000,10000000,1.0,0.677,1.121,1.078,3.541,24.769
28,10000000,100000000,10.0,1.628,1.098,1.257,3.341,24.659
5,10000000,500000000,50.0,6.121,1.7,1.834,3.864,25.204
14,10000000,600000000,60.0,7.184,2.359,1.971,4.453,25.117
26,10000000,800000000,80.0,9.604,2.042,2.224,4.502,25.218
25,10000000,1000000000,100.0,11.527,2.279,2.444,4.665,25.505


Unnamed: 0,list_1_size,list_2_size,ratio,"standard_join,plr_error=NA %","learned_join,plr_error=2 %","learned_join,plr_error=10 %","learned_join,plr_error=100 %","learned_join,plr_error=1000 %"
27,10000000,10000000,1.0,0.0,-65.583456,-59.231905,-423.042836,-3558.641064
28,10000000,100000000,10.0,0.0,32.555283,22.788698,-105.22113,-1414.68059
5,10000000,500000000,50.0,0.0,72.22676,70.037576,36.87306,-311.762784
14,10000000,600000000,60.0,0.0,67.16314,72.564031,38.015033,-249.624165
26,10000000,800000000,80.0,0.0,78.738026,76.842982,53.123698,-162.578092
25,10000000,1000000000,100.0,0.0,80.229028,78.797606,59.5298,-121.263121


Unnamed: 0,list_1_size,list_2_size,ratio,"standard_join,plr_error=NA","learned_join,plr_error=2","learned_join,plr_error=10","learned_join,plr_error=100","learned_join,plr_error=1000"
13,5000000,5000000,1.0,0.266,0.543,0.626,1.818,12.266
35,5000000,50000000,10.0,0.852,0.733,0.81,1.981,12.941
8,5000000,250000000,50.0,3.376,1.274,1.398,2.485,13.97
33,5000000,300000000,60.0,3.98,1.411,1.512,2.732,14.405
19,5000000,400000000,80.0,5.629,1.928,1.713,3.095,14.062
30,5000000,500000000,100.0,6.487,1.855,1.902,3.065,14.904


Unnamed: 0,list_1_size,list_2_size,ratio,"standard_join,plr_error=NA %","learned_join,plr_error=2 %","learned_join,plr_error=10 %","learned_join,plr_error=100 %","learned_join,plr_error=1000 %"
13,5000000,5000000,1.0,0.0,-104.135338,-135.338346,-583.458647,-4511.278195
35,5000000,50000000,10.0,0.0,13.967136,4.929577,-132.511737,-1418.896714
8,5000000,250000000,50.0,0.0,62.263033,58.590047,26.39218,-313.803318
33,5000000,300000000,60.0,0.0,64.547739,62.01005,31.356784,-261.934673
19,5000000,400000000,80.0,0.0,65.748801,69.568307,45.016877,-149.813466
30,5000000,500000000,100.0,0.0,71.404347,70.679821,52.751657,-129.751811


Unnamed: 0,list_1_size,list_2_size,ratio,"standard_join,plr_error=NA","learned_join,plr_error=2","learned_join,plr_error=10","learned_join,plr_error=100","learned_join,plr_error=1000"
29,2500000,2500000,1.0,0.145,0.388,0.527,1.177,6.968
12,2500000,25000000,10.0,0.486,0.551,0.611,1.441,7.863
0,2500000,125000000,50.0,1.959,1.034,1.13,1.719,7.701
32,2500000,150000000,60.0,2.311,1.116,1.168,1.795,7.987
6,2500000,200000000,80.0,2.989,1.263,1.335,2.042,8.91
18,2500000,250000000,100.0,3.723,1.419,1.654,2.345,8.363


Unnamed: 0,list_1_size,list_2_size,ratio,"standard_join,plr_error=NA %","learned_join,plr_error=2 %","learned_join,plr_error=10 %","learned_join,plr_error=100 %","learned_join,plr_error=1000 %"
29,2500000,2500000,1.0,0.0,-167.586207,-263.448276,-711.724138,-4705.517241
12,2500000,25000000,10.0,0.0,-13.374486,-25.720165,-196.502058,-1517.901235
0,2500000,125000000,50.0,0.0,47.217968,42.317509,12.251149,-293.108729
32,2500000,150000000,60.0,0.0,51.709217,49.459109,22.327997,-245.607962
6,2500000,200000000,80.0,0.0,57.745065,55.336233,31.682837,-198.093008
18,2500000,250000000,100.0,0.0,61.885576,55.573462,37.013161,-124.630674


"df2 = parse_file('./run_10M.txt', 1)\nprint(df.columns)\nstandard_vs_learned = df.loc[(df['key_bytes']==8) & (df['use_disk']==0)][['list_1_size', 'list_2_size', 'ratio', 'standard,plr_error=NA', 'learned,plr_error=10', 'learned,plr_error=100']].sort_values('list_2_size')\ndisplay(standard_vs_learned)\n\nstandard_vs_learned = df.loc[(df['key_bytes']==8) & (df['use_disk']==0)][['list_1_size', 'list_2_size', 'ratio', 'standard_join,plr_error=NA', 'learned_join,plr_error=10', 'learned_join,plr_error=100']].sort_values('list_2_size')\ndisplay(standard_vs_learned)\n\nstandard_vs_learned = df.loc[(df['key_bytes']==8) & (df['use_disk']==1)][['list_1_size', 'list_2_size', 'ratio', 'standard_join,plr_error=NA', 'learned_join,plr_error=10', 'learned_join,plr_error=100']].sort_values('list_2_size')\ndisplay(standard_vs_learned)\n\n#standard_vs_learned_join = df[['id','standard_join', 'learned_join']]\n#display(standard_vs_learned_join)\n"

### Size Skewness vs PLR Error bound

Experiment Parameters:

* Size n1, n2.
* Items entirely stored in memory or loaded as pages on disk
* PLR Error bound
* Key types: (char \*), uint64_t, 128bit 
* Merge type: Standard (compare all heads), Learned (lookup limit, skip comparisons)

Measuring

* Merge time
* Model creation time
* Index size


### PLR Error bound vs Training time and Size

In [9]:
import matplotlib.pyplot as plt

def get_merge_duration_table(is_parallel, is_disk, key_bytes):
    conditions = (df['merge_mode'].str.contains('erge')) & (df['merge_mode'].str.contains('Parallel') == is_parallel) & (df['use_disk']==is_disk) & (df['key_bytes']==key_bytes)
    columns = ['list_1_size', 'ratio', 'merge_mode_with_error', 'merge_duration_sec']
    table = df.loc[conditions][columns]
    ordered_table = table.pivot_table('merge_duration_sec', ['ratio'], 'merge_mode_with_error')
    display(ordered_table)
    cols = []
    if not is_parallel:
        cols = ['Standard Merge (plr_error:nan)' , 'Learned Merge (plr_error:10.0)']
    else:
        cols = [
            'Parallelized Standard Merge (plr_error:nan)',
           'Parallelized Learned Merge (plr_error:10.0)']
    return ordered_table[cols]



In [10]:
parallel = [False, True]
disk = [False, True]
key_sizes = [(8, "10M"), (16, "5M"), (32, "2.5M")]

for is_disk in disk:
    for is_parallel in parallel:
        for key_size in key_sizes:
            table = get_merge_duration_table(is_parallel=is_parallel, is_disk=is_disk, key_bytes=key_size[0])
            if is_parallel:
                print("Threads: " + str(4))
            else:
                print("Single threaded")
            print("Key Size: " + str(key_size[0]) + "bytes, Num Keys: " + str(key_size[1]))
            print("On Disk: " + str(is_disk))

            columns = table.columns
            display(table[[columns[0], columns[1]]])



merge_mode_with_error,Learned Merge (plr_error:10.0),Learned Merge (plr_error:100.0),Standard Merge (plr_error:10.0),Standard Merge (plr_error:100.0),Standard Merge (plr_error:nan)
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.993,1.573,0.779,2.101,0.493
10.0,2.546,3.699,0.905,2.053,1.6255
50.0,6.153,7.696,1.136,2.195,6.9235
60.0,7.038,8.59,1.2,2.211,8.2375
80.0,8.898,10.506,1.312,2.327,10.867
100.0,10.567,12.122,1.402,2.373,13.6555


Single threaded
Key Size: 8bytes, Num Keys: 10M
On Disk: False


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:10.0)
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.493,0.993
10.0,1.6255,2.546
50.0,6.9235,6.153
60.0,8.2375,7.038
80.0,10.867,8.898
100.0,13.6555,10.567


merge_mode_with_error,Learned Merge (plr_error:10.0),Learned Merge (plr_error:100.0),Standard Merge (plr_error:10.0),Standard Merge (plr_error:100.0),Standard Merge (plr_error:nan)
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.826,1.145,0.484,1.206,0.319
10.0,1.63,2.176,0.556,1.153,0.97
50.0,4.043,4.738,0.795,1.692,3.7905
60.0,4.617,5.347,0.842,1.374,4.511
80.0,5.763,6.622,0.929,1.449,6.0105
100.0,6.87,7.76,1.012,1.511,7.461


Single threaded
Key Size: 16bytes, Num Keys: 5M
On Disk: False


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:10.0)
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.319,0.826
10.0,0.97,1.63
50.0,3.7905,4.043
60.0,4.511,4.617
80.0,6.0105,5.763
100.0,7.461,6.87


merge_mode_with_error
ratio


KeyError: "None of [Index(['Standard Merge (plr_error:nan)', 'Learned Merge (plr_error:10.0)'], dtype='object', name='merge_mode_with_error')] are in the [columns]"

In [4]:
def get_plr_overhead_size_table(is_parallel, is_disk, key_bytes):
    conditions = (df['merge_mode'].str.contains('Parallel') == is_parallel) & (df['use_disk']==is_disk) & (df['key_bytes']==key_bytes)
    columns = ['list_2_size', 'merge_mode_with_error', 'iter_1_model_size_bytes', 'iter_1_creation_sec']
    table = df.loc[conditions][columns]
    ordered_table = table.pivot_table('iter_1_model_size_bytes', ['list_2_size'], 'merge_mode_with_error')
    cols = []
    if not is_parallel:
        cols = ['Standard Merge (plr_error:nan)' ,'Learned Merge (plr_error:2.0)', 'Learned Merge (plr_error:10.0)', 'Learned Merge (plr_error:50.0)']
    else:
        cols = [
            'Parallelized Standard Merge (plr_error:nan)',
            'Parallelized Learned Merge (plr_error:2.0)',
           'Parallelized Learned Merge (plr_error:10.0)',
           'Parallelized Learned Merge (plr_error:50.0)']
    return ordered_table[cols]

print(8, "40M")
display(get_plr_overhead_size_table(True, True, 8))
print(16, "20M")
display(get_plr_overhead_size_table(True, True, 16))
print(32, "10M")
display(get_plr_overhead_size_table(True, True, 32))

8 40M


merge_mode_with_error,Parallelized Standard Merge (plr_error:nan),Parallelized Learned Merge (plr_error:2.0),Parallelized Learned Merge (plr_error:10.0),Parallelized Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
40000000,0.0,114836096.0,9502144.0,450080.0
400000000,0.0,770518720.0,63802144.0,3024032.0
1200000000,0.0,770522528.0,63755264.0,3023552.0
2000000000,0.0,770639488.0,63798816.0,3026944.0


16 20M


merge_mode_with_error,Parallelized Standard Merge (plr_error:nan),Parallelized Learned Merge (plr_error:2.0),Parallelized Learned Merge (plr_error:10.0),Parallelized Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20000000,0.0,57392960.0,4761632.0,223712.0
200000000,0.0,385305344.0,31908832.0,1512064.0
600000000,0.0,385205120.0,,1511520.0
1000000000,0.0,385251072.0,31887296.0,1512960.0
1200000000,0.0,385293344.0,31908608.0,1512704.0


32 10M


merge_mode_with_error,Parallelized Standard Merge (plr_error:nan),Parallelized Learned Merge (plr_error:2.0),Parallelized Learned Merge (plr_error:10.0),Parallelized Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000000,0.0,28711008.0,2380320.0,113664.0
100000000,0.0,192663488.0,15945440.0,756320.0
300000000,0.0,192629632.0,15950880.0,754880.0
500000000,0.0,192614304.0,15954016.0,757504.0
600000000,,192655616.0,15944736.0,756608.0


In [5]:
def get_plr_overhead_train_table(is_parallel, is_disk, key_bytes):
    conditions = (df['merge_mode'].str.contains('Parallel') == is_parallel) & (df['use_disk']==is_disk) & (df['key_bytes']==key_bytes)
    columns = ['list_2_size', 'merge_mode_with_error', 'iter_1_model_size_bytes', 'iter_1_creation_sec']
    table = df.loc[conditions][columns]
    ordered_table = table.pivot_table('iter_1_creation_sec', ['list_2_size'], 'merge_mode_with_error')
    cols = []
    if not is_parallel:
        cols = ['Standard Merge (plr_error:nan)' ,'Learned Merge (plr_error:2.0)', 'Learned Merge (plr_error:10.0)', 'Learned Merge (plr_error:50.0)']
    else:
        cols = [
            'Parallelized Standard Merge (plr_error:nan)',
            'Parallelized Learned Merge (plr_error:2.0)',
           'Parallelized Learned Merge (plr_error:10.0)',
           'Parallelized Learned Merge (plr_error:50.0)']
    return ordered_table[cols]

print(8, "40M")
display(get_plr_overhead_train_table(False, True, 8))
print(16, "20M")
display(get_plr_overhead_train_table(False, True, 16))
print(32, "10M")
display(get_plr_overhead_train_table(False, True, 32))

8 40M


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:2.0),Learned Merge (plr_error:10.0),Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
40000000,0.525,2.247,1.991,1.901
400000000,5.522,18.176,16.54,15.706
1200000000,19.433,39.806,38.398,37.183
2000000000,34.31,63.413,59.634,59.176


16 20M


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:2.0),Learned Merge (plr_error:10.0),Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20000000,0.427,1.561,1.462,1.445
200000000,4.521,13.575,13.021,12.464
600000000,16.356,32.991,32.996,32.543
1000000000,32.248,54.229,54.575,51.96
1200000000,38.587,63.328,62.953,61.406


32 10M


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:2.0),Learned Merge (plr_error:10.0),Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000000,0.375,1.265,1.173,1.198
100000000,3.968,11.33,11.208,10.943
300000000,16.26,30.484,30.207,29.671
500000000,31.238,50.868,50.28,51.15
600000000,39.908,60.776,52.941,53.912
