In [10]:
import pandas as pd
from parse_results import benchmark_output_as_dataframe

# These methods are hard-coded to the data in run_10M_3 (4 threads, 10M keys).
# It shouldn't be too much work to generalize it, but as of now it's not.

def extract_merge_duration(df, baseline, experiment, key_size_byte, use_disk):

    exp_condition = (df['merge_mode'] == baseline) 
    exp_condition = exp_condition | (df['merge_mode'] == experiment)
    condition = exp_condition
    condition = condition & (df['key_size_bytes']==key_size_byte)
    condition = condition & (df['use_disk'] == use_disk)
    
    rdf = df.loc[condition].sort_values('plr_error')
    rdf = rdf[['merge_mode', 'plr_error', 'merge_duration_sec', 'ratio', 'key_size_bytes']]
    rdf = pd.pivot_table(rdf, index='ratio', columns=['merge_mode', 'plr_error'] , aggfunc='median')

    # Assume baseline has always plr_error=0
    baseline = ('merge_duration_sec', baseline, 0) 
    for column in rdf.columns:
        column_rel = list(column)
        column_rel[0] = 'merge_duration_rel'
        rdf[tuple(column_rel)] = (((rdf[baseline]-rdf[column])/rdf[baseline]) * 100.0).round(2)
    return rdf
        

def print_experiment(df, exp, baseline, experiment, key_size, on_disk):
    df = extract_merge_duration(df, baseline, experiment, key_size, on_disk)
    print("#### %s\n" % exp)
    print("##### Absolute number (sec) \n")
    print(df['merge_duration_sec'].to_markdown())
    print("")

    print("###### Relative number (%) \n")
    print(df['merge_duration_rel'].to_markdown())
    print("")

def print_key_size_data(df, key_size):
    print("## %d Byte String Keys\n" % key_size)
    print("### In Memory\n")
    print_experiment(df, 'Single Threaded Merge', 'standard', 'learned', key_size, False)
    print_experiment(df, 'Multi-Threaded Merge, 4 threads', 'parallel_standard', 'parallel_learned', key_size, False)
    print_experiment(df, 'Single Threaded Join', 'standard_join', 'learned_join', key_size, False)

    print("### On Disk\n")
    print_experiment(df, 'Single Threaded Merge', 'standard', 'learned', key_size, True)
    print_experiment(df, 'Multi-Threaded Merge, 4 threads', 'parallel_standard', 'parallel_learned', key_size, True)
    print_experiment(df, 'Single Threaded Join', 'standard_join', 'learned_join', key_size, True)
    


In [14]:
df = benchmark_output_as_dataframe('run_10M_3.txt')

key_sizes = [8, 16, 32]
for key_size in key_sizes:
    print_key_size_data(df, key_size)

model_size = df.loc[(df['key_size_bytes']==8) & (df['plr_error'] > 0)][['iter_1.num_items', 'plr_error', 'iter_1.model_size_bytes', 'iter_1.item_total_size_bytes']].groupby(['iter_1.num_items', 'plr_error'])
model_size_bytes = model_size.median()
model_size_bytes['rel_percent'] = (model_size_bytes['iter_1.model_size_bytes']/model_size_bytes['iter_1.item_total_size_bytes'] * 100.0).round(2)
print('### Model Size to PLR Error')
print(model_size_bytes[['iter_1.model_size_bytes', 'rel_percent']].to_markdown(tablefmt="pretty"))



## 8 Byte String Keys

### In Memory

#### Single Threaded Merge

##### Absolute number (sec) 

|   ratio |   ('learned', 2) |   ('learned', 10) |   ('learned', 100) |   ('learned', 1000) |   ('standard', 0) |
|--------:|-----------------:|------------------:|-------------------:|--------------------:|------------------:|
|       1 |            0.943 |             0.999 |              1.543 |               7.897 |             0.371 |
|      10 |            2.374 |             2.52  |              3.565 |              14.776 |             2.048 |
|      50 |            6.132 |             6.329 |              7.668 |              20.151 |             9.486 |
|      60 |            7.068 |             7.264 |              8.621 |              21.559 |            11.354 |
|      80 |            8.906 |             9.119 |             10.525 |              23.419 |            15.079 |
|     100 |           10.754 |            10.976 |             12.384 |              25.803 |            1

#### Single Threaded Join

##### Absolute number (sec) 

|   ratio |   ('learned_join', 2) |   ('learned_join', 10) |   ('learned_join', 100) |   ('learned_join', 1000) |   ('standard_join', 0) |
|--------:|----------------------:|-----------------------:|------------------------:|-------------------------:|-----------------------:|
|       1 |                 0.303 |                  0.335 |                   0.644 |                    3.784 |                  0.1   |
|      10 |                 0.377 |                  0.398 |                   0.655 |                    3.572 |                  0.26  |
|      50 |                 0.524 |                  0.576 |                   0.817 |                    3.726 |                  0.97  |
|      60 |                 0.54  |                  0.607 |                   0.842 |                    3.618 |                  1.15  |
|      80 |                 0.56  |                  0.657 |                   0.879 |                    3.7

### Size Skewness vs PLR Error bound

Experiment Parameters:

* Size n1, n2.
* Items entirely stored in memory or loaded as pages on disk
* PLR Error bound
* Key types: (char \*), uint64_t, 128bit 
* Merge type: Standard (compare all heads), Learned (lookup limit, skip comparisons)

Measuring

* Merge time
* Model creation time
* Index size


### PLR Error bound vs Training time and Size

In [9]:
import matplotlib.pyplot as plt

def get_merge_duration_table(is_parallel, is_disk, key_bytes):
    conditions = (df['merge_mode'].str.contains('erge')) & (df['merge_mode'].str.contains('Parallel') == is_parallel) & (df['use_disk']==is_disk) & (df['key_bytes']==key_bytes)
    columns = ['list_1_size', 'ratio', 'merge_mode_with_error', 'merge_duration_sec']
    table = df.loc[conditions][columns]
    ordered_table = table.pivot_table('merge_duration_sec', ['ratio'], 'merge_mode_with_error')
    display(ordered_table)
    cols = []
    if not is_parallel:
        cols = ['Standard Merge (plr_error:nan)' , 'Learned Merge (plr_error:10.0)']
    else:
        cols = [
            'Parallelized Standard Merge (plr_error:nan)',
           'Parallelized Learned Merge (plr_error:10.0)']
    return ordered_table[cols]



In [10]:
parallel = [False, True]
disk = [False, True]
key_sizes = [(8, "10M"), (16, "5M"), (32, "2.5M")]

for is_disk in disk:
    for is_parallel in parallel:
        for key_size in key_sizes:
            table = get_merge_duration_table(is_parallel=is_parallel, is_disk=is_disk, key_bytes=key_size[0])
            if is_parallel:
                print("Threads: " + str(4))
            else:
                print("Single threaded")
            print("Key Size: " + str(key_size[0]) + "bytes, Num Keys: " + str(key_size[1]))
            print("On Disk: " + str(is_disk))

            columns = table.columns
            display(table[[columns[0], columns[1]]])



merge_mode_with_error,Learned Merge (plr_error:10.0),Learned Merge (plr_error:100.0),Standard Merge (plr_error:10.0),Standard Merge (plr_error:100.0),Standard Merge (plr_error:nan)
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.993,1.573,0.779,2.101,0.493
10.0,2.546,3.699,0.905,2.053,1.6255
50.0,6.153,7.696,1.136,2.195,6.9235
60.0,7.038,8.59,1.2,2.211,8.2375
80.0,8.898,10.506,1.312,2.327,10.867
100.0,10.567,12.122,1.402,2.373,13.6555


Single threaded
Key Size: 8bytes, Num Keys: 10M
On Disk: False


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:10.0)
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.493,0.993
10.0,1.6255,2.546
50.0,6.9235,6.153
60.0,8.2375,7.038
80.0,10.867,8.898
100.0,13.6555,10.567


merge_mode_with_error,Learned Merge (plr_error:10.0),Learned Merge (plr_error:100.0),Standard Merge (plr_error:10.0),Standard Merge (plr_error:100.0),Standard Merge (plr_error:nan)
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.826,1.145,0.484,1.206,0.319
10.0,1.63,2.176,0.556,1.153,0.97
50.0,4.043,4.738,0.795,1.692,3.7905
60.0,4.617,5.347,0.842,1.374,4.511
80.0,5.763,6.622,0.929,1.449,6.0105
100.0,6.87,7.76,1.012,1.511,7.461


Single threaded
Key Size: 16bytes, Num Keys: 5M
On Disk: False


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:10.0)
ratio,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.319,0.826
10.0,0.97,1.63
50.0,3.7905,4.043
60.0,4.511,4.617
80.0,6.0105,5.763
100.0,7.461,6.87


merge_mode_with_error
ratio


KeyError: "None of [Index(['Standard Merge (plr_error:nan)', 'Learned Merge (plr_error:10.0)'], dtype='object', name='merge_mode_with_error')] are in the [columns]"

In [4]:
def get_plr_overhead_size_table(is_parallel, is_disk, key_bytes):
    conditions = (df['merge_mode'].str.contains('Parallel') == is_parallel) & (df['use_disk']==is_disk) & (df['key_bytes']==key_bytes)
    columns = ['list_2_size', 'merge_mode_with_error', 'iter_1_model_size_bytes', 'iter_1_creation_sec']
    table = df.loc[conditions][columns]
    ordered_table = table.pivot_table('iter_1_model_size_bytes', ['list_2_size'], 'merge_mode_with_error')
    cols = []
    if not is_parallel:
        cols = ['Standard Merge (plr_error:nan)' ,'Learned Merge (plr_error:2.0)', 'Learned Merge (plr_error:10.0)', 'Learned Merge (plr_error:50.0)']
    else:
        cols = [
            'Parallelized Standard Merge (plr_error:nan)',
            'Parallelized Learned Merge (plr_error:2.0)',
           'Parallelized Learned Merge (plr_error:10.0)',
           'Parallelized Learned Merge (plr_error:50.0)']
    return ordered_table[cols]

print(8, "40M")
display(get_plr_overhead_size_table(True, True, 8))
print(16, "20M")
display(get_plr_overhead_size_table(True, True, 16))
print(32, "10M")
display(get_plr_overhead_size_table(True, True, 32))

8 40M


merge_mode_with_error,Parallelized Standard Merge (plr_error:nan),Parallelized Learned Merge (plr_error:2.0),Parallelized Learned Merge (plr_error:10.0),Parallelized Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
40000000,0.0,114836096.0,9502144.0,450080.0
400000000,0.0,770518720.0,63802144.0,3024032.0
1200000000,0.0,770522528.0,63755264.0,3023552.0
2000000000,0.0,770639488.0,63798816.0,3026944.0


16 20M


merge_mode_with_error,Parallelized Standard Merge (plr_error:nan),Parallelized Learned Merge (plr_error:2.0),Parallelized Learned Merge (plr_error:10.0),Parallelized Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20000000,0.0,57392960.0,4761632.0,223712.0
200000000,0.0,385305344.0,31908832.0,1512064.0
600000000,0.0,385205120.0,,1511520.0
1000000000,0.0,385251072.0,31887296.0,1512960.0
1200000000,0.0,385293344.0,31908608.0,1512704.0


32 10M


merge_mode_with_error,Parallelized Standard Merge (plr_error:nan),Parallelized Learned Merge (plr_error:2.0),Parallelized Learned Merge (plr_error:10.0),Parallelized Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000000,0.0,28711008.0,2380320.0,113664.0
100000000,0.0,192663488.0,15945440.0,756320.0
300000000,0.0,192629632.0,15950880.0,754880.0
500000000,0.0,192614304.0,15954016.0,757504.0
600000000,,192655616.0,15944736.0,756608.0


In [5]:
def get_plr_overhead_train_table(is_parallel, is_disk, key_bytes):
    conditions = (df['merge_mode'].str.contains('Parallel') == is_parallel) & (df['use_disk']==is_disk) & (df['key_bytes']==key_bytes)
    columns = ['list_2_size', 'merge_mode_with_error', 'iter_1_model_size_bytes', 'iter_1_creation_sec']
    table = df.loc[conditions][columns]
    ordered_table = table.pivot_table('iter_1_creation_sec', ['list_2_size'], 'merge_mode_with_error')
    cols = []
    if not is_parallel:
        cols = ['Standard Merge (plr_error:nan)' ,'Learned Merge (plr_error:2.0)', 'Learned Merge (plr_error:10.0)', 'Learned Merge (plr_error:50.0)']
    else:
        cols = [
            'Parallelized Standard Merge (plr_error:nan)',
            'Parallelized Learned Merge (plr_error:2.0)',
           'Parallelized Learned Merge (plr_error:10.0)',
           'Parallelized Learned Merge (plr_error:50.0)']
    return ordered_table[cols]

print(8, "40M")
display(get_plr_overhead_train_table(False, True, 8))
print(16, "20M")
display(get_plr_overhead_train_table(False, True, 16))
print(32, "10M")
display(get_plr_overhead_train_table(False, True, 32))

8 40M


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:2.0),Learned Merge (plr_error:10.0),Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
40000000,0.525,2.247,1.991,1.901
400000000,5.522,18.176,16.54,15.706
1200000000,19.433,39.806,38.398,37.183
2000000000,34.31,63.413,59.634,59.176


16 20M


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:2.0),Learned Merge (plr_error:10.0),Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20000000,0.427,1.561,1.462,1.445
200000000,4.521,13.575,13.021,12.464
600000000,16.356,32.991,32.996,32.543
1000000000,32.248,54.229,54.575,51.96
1200000000,38.587,63.328,62.953,61.406


32 10M


merge_mode_with_error,Standard Merge (plr_error:nan),Learned Merge (plr_error:2.0),Learned Merge (plr_error:10.0),Learned Merge (plr_error:50.0)
list_2_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000000,0.375,1.265,1.173,1.198
100000000,3.968,11.33,11.208,10.943
300000000,16.26,30.484,30.207,29.671
500000000,31.238,50.868,50.28,51.15
600000000,39.908,60.776,52.941,53.912


## 8 Byte String Keys

### In Memory

#### Single Threaded Merge

##### Absolute number (sec) 

|   ratio |   ('learned', 2) |   ('learned', 10) |   ('learned', 100) |   ('learned', 1000) |   ('standard', 0) |
|--------:|-----------------:|------------------:|-------------------:|--------------------:|------------------:|
|       1 |            0.943 |             0.999 |              1.543 |               7.897 |             0.371 |
|      10 |            2.374 |             2.52  |              3.565 |              14.776 |             2.048 |
|      50 |            6.132 |             6.329 |              7.668 |              20.151 |             9.486 |
|      60 |            7.068 |             7.264 |              8.621 |              21.559 |            11.354 |
|      80 |            8.906 |             9.119 |             10.525 |              23.419 |            15.079 |
|     100 |           10.754 |            10.976 |             12.384 |              25.803 |            1

#### Single Threaded Join

##### Absolute number (sec) 

|   ratio |   ('learned_join', 2) |   ('learned_join', 10) |   ('learned_join', 100) |   ('learned_join', 1000) |   ('standard_join', 0) |
|--------:|----------------------:|-----------------------:|------------------------:|-------------------------:|-----------------------:|
|       1 |                 0.303 |                  0.335 |                   0.644 |                    3.784 |                  0.1   |
|      10 |                 0.377 |                  0.398 |                   0.655 |                    3.572 |                  0.26  |
|      50 |                 0.524 |                  0.576 |                   0.817 |                    3.726 |                  0.97  |
|      60 |                 0.54  |                  0.607 |                   0.842 |                    3.618 |                  1.15  |
|      80 |                 0.56  |                  0.657 |                   0.879 |                    3.7