In [78]:
import pandas as pd
from parse_results import benchmark_output_as_dataframe

# These methods are hard-coded to the data in run_10M_3 (4 threads, 10M keys).
# It shouldn't be too much work to generalize it, but as of now it's not.

def extract_merge_duration(df, baseline, experiment, key_size_byte, use_disk):
    exp_condition = (df['merge_mode'] == baseline) 
    exp_condition = exp_condition | (df['merge_mode'] == experiment)
    condition = exp_condition
    condition = condition & (df['key_size_bytes']==key_size_byte)
    condition = condition & (df['use_disk'] == use_disk)

    num_items = df.loc[condition]['iter_0.num_items'].unique()
    num_threads = df.loc[condition]['num_threads'].unique()
    print(num_threads)
    ret = []
    for num_thread in num_threads:
        for num_item in num_items:
            rdf = df.loc[condition & (df['iter_0.num_items'] == num_item)]
            rdf = rdf.sort_values('plr_error')
            rdf = rdf[['merge_mode', 'plr_error', 'merge_duration_sec', 'ratio']]
            rdf = pd.pivot_table(rdf, index='ratio', columns=['merge_mode', 'plr_error'] , aggfunc='median')
            baseline = ('merge_duration_sec', baseline, 0) 
            for column in rdf.columns:
                column_rel = list(column)
                column_rel[0] = 'merge_duration_rel'
                rdf[tuple(column_rel)] = (((rdf[baseline]-rdf[column])/rdf[baseline]) * 100.0).round(2)
        ret.append({'num_items': num_item, 'df': rdf, 'num_threads': num_thread})
    return ret
        

def print_experiment(df, exp, baseline, experiment, key_size, on_disk):
    ret = extract_merge_duration(df, baseline, experiment, key_size, on_disk)
    num_items = ret[0]['num_items']
    df = ret[0]['df']
    num_threads = ret[0]['num_threads']

    print("#### %s\n" % exp)
    print('Num Keys in List 1: ' + str(num_items))
    print('Num Threads: ' + str(num_threads))

    print("##### Absolute number (sec) \n")
    print(df['merge_duration_sec'].to_markdown())
    print("")

    print("###### Relative number (%) \n")
    print(df['merge_duration_rel'].to_markdown())
    print("")

def print_key_size_data(df, key_size):
    print("## %d Byte String Keys\n" % key_size)
    print("### In Memory\n")
    print_experiment(df, 'Single Threaded Merge', 'standard', 'learned', key_size, False)
    print_experiment(df, 'Multi-Threaded Merge', 'parallel_standard', 'parallel_learned', key_size, False)
    print_experiment(df, 'Single Threaded Join', 'standard_join', 'learned_join', key_size, False)

    print("### On Disk\n")
    print_experiment(df, 'Single Threaded Merge', 'standard', 'learned', key_size, True)
    print_experiment(df, 'Multi-Threaded Merge', 'parallel_standard', 'parallel_learned', key_size, True)
    print_experiment(df, 'Single Threaded Join', 'standard_join', 'learned_join', key_size, True)
    


In [79]:
df = benchmark_output_as_dataframe('debug_run_report_uint64.txt')
display(df)

key_sizes = [8]
for key_size in key_sizes:
    print_key_size_data(df, key_size)

model_size = df.loc[(df['key_size_bytes']==8) & (df['plr_error'] > 0)][['iter_1.num_items', 'plr_error', 'iter_1.model_size_bytes', 'iter_1.item_total_size_bytes']].groupby(['iter_1.num_items', 'plr_error'])
model_size_bytes = model_size.median()
model_size_bytes['rel_percent'] = (model_size_bytes['iter_1.model_size_bytes']/model_size_bytes['iter_1.item_total_size_bytes'] * 100.0).round(2)
print('### Model Size to PLR Error')
print(model_size_bytes[['iter_1.model_size_bytes', 'rel_percent']].to_markdown(tablefmt="pretty"))



Unnamed: 0,merge_duration_sec,status,merge_mode,use_disk,key_size_bytes,num_threads,ratio,plr_error,iter_0.num_items,iter_0.creation_sec,iter_0.model_size_bytes,iter_1.num_items,iter_1.creation_sec,iter_1.model_size_bytes,iter_0.item_total_size_bytes,iter_1.item_total_size_bytes
0,0.000,Failed,no_op,False,0,1,0.0,0,0,0.000,0,0,0.000,0,0,0
1,0.000,Success,no_op,False,8,1,1.0,0,40000,0.006,9184,40000,0.006,9184,320000,320000
2,0.000,Success,no_op,True,8,1,1.0,0,40000,0.006,9184,40000,0.006,9184,320000,320000
3,0.000,Success,no_op,False,8,1,10.0,0,40000,0.006,9184,400000,0.033,94912,320000,3200000
4,0.000,Success,no_op,True,8,1,10.0,0,40000,0.006,9184,400000,0.033,94912,320000,3200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,0.042,Success,learned,True,32,1,80.0,10,10000,0.001,2336,800000,0.057,187808,320000,25600000
303,0.036,Success,parallel_learned,True,32,4,60.0,1000,10000,0.002,32,600000,0.040,32,320000,19200000
304,0.017,Success,learned_join,True,16,1,100.0,100,20000,0.003,64,2000000,0.118,240704,320000,32000000
305,0.084,Success,learned_join,True,16,1,100.0,1000,20000,0.003,32,2000000,0.115,1760,320000,32000000


## 8 Byte String Keys

### In Memory

[]


IndexError: list index out of range