In [None]:
import ast
import collections
import itertools
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
import sqlite3

In [None]:
class SQLiteContext:
    """Context manager that yields a cursor on the experiment SQLite database.

    The connection is opened when the object is created and closed when the
    ``with`` block exits, so one instance serves exactly one ``with`` block.
    """

    # Database opened when no explicit path is passed.
    DEFAULT_DB_PATH = '/home/ghsong/django-substring/mysite/db.sqlite3'

    def __init__(self, db_path=None, row_factory=None):
        """Open the connection.

        db_path: database file to open; defaults to DEFAULT_DB_PATH.
        row_factory: callable installed as the connection's row factory;
            defaults to the notebook-level ``dict_factory`` (rows as dicts).
        """
        self.conn = sqlite3.connect(self.DEFAULT_DB_PATH if db_path is None else db_path)
        # dict_factory is resolved here, at call time, so the class may be
        # defined before the cell that defines dict_factory has run.
        self.conn.row_factory = dict_factory if row_factory is None else row_factory
        self.cursor = None

    def __enter__(self):
        """Return a new cursor on the open connection."""
        self.cursor = self.conn.cursor()
        return self.cursor

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the cursor and the connection; exceptions are not suppressed."""
        try:
            if self.cursor is not None:
                self.cursor.close()
        finally:
            self.conn.close()

In [None]:
def execute_query(query):
    """Run `query` on a fresh SQLiteContext and return all result rows as a list."""
    with SQLiteContext() as cursor:
        result_rows = cursor.execute(query).fetchall()
    return result_rows

In [None]:
# Template for experiment-item queries. `{}` receives extra predicates, which
# must either be empty or start with 'AND'. Rows are returned newest first.
# `exp_date DESC` is used rather than `-exp_date`: negating a TEXT timestamp
# makes SQLite convert it to a number from its leading numeric prefix (the
# year), which loses the ordering within a year.
# NOTE(review): assumes exp_date holds ISO-like text or a numeric timestamp - confirm.
query_tmpl = 'SELECT exp_date, algorithm_id, dataset_id, parameter, result FROM projectManager_expitem WHERE project_id=1 {} ORDER BY exp_date DESC'

In [None]:
def dict_factory(cursor, row):
    """sqlite3 row factory: return `row` as a dict keyed by column name.

    Column names are the first element of each entry of `cursor.description`.
    """
    return {column[0]: row[position] for position, column in enumerate(cursor.description)}

In [None]:
def str2dict(s):
    """Parse the Python-literal text `s` (e.g. a dict repr) with ast.literal_eval."""
    parsed = ast.literal_eval(s)
    return parsed

In [None]:
# Settings kept in the result frames, in display order. The DataFrame builders
# in this notebook use this list as the categories of their `setting` column.
# Labels that appeared in earlier variants of this list:
#   'LSS-L', 'LSS-PR', 'LSS-L+PR', 'LSS-C+P+PR', 'LSS-C+P+L+PR', 'LSS-PKWISE'
setting_list = [
    'LSS-NAIVE',
    'LSS-C',
    'LSS-P',
    'LSS-C+P',
    'LSS-C+L',
    'LSS-P+L',
    'LSS-C+P+L',
]

def get_setting_label(aid, dict_param):
    """Return the setting label for an experiment, or None.

    aid: algorithm id; 28 is PrefixSearch_6.20 and 31 is PkwiseSynSearch_1.02.
    dict_param: parameter dict of the experiment.
    Any other algorithm id yields None.
    """
    if aid not in (28, 31):
        return None
    if aid == 28:
        return get_setting_label_PrefixSearch(dict_param)
    return get_setting_label_PkwiseSynSearch(dict_param)
    
def get_setting_label_PrefixSearch(dict_param):
    """Map PrefixSearch parameters to an 'LSS-*' setting label.

    Reads 'bLF' and 'bPF' (flags, on when equal to the string 'true') and
    'index_impl' from dict_param. Returns the label, or None for combinations
    that have no label.
    """
    use_lf = dict_param['bLF'] == 'true'
    use_pf = dict_param['bPF'] == 'true'
    index_impl = dict_param['index_impl']
    # (bLF, bPF, index_impl) -> label
    labels = {
        (False, False, 'None'): 'LSS-NAIVE',
        (False, False, 'Naive'): None,  # formerly 'LSS-INDEX'
        (False, False, 'Count'): 'LSS-C',
        (False, False, 'PositionOnly'): 'LSS-P',
        (True, False, 'Naive'): 'LSS-L',
        (False, True, 'Naive'): 'LSS-PR',
        (False, False, 'Position'): 'LSS-C+P',
        (True, False, 'Count'): 'LSS-C+L',
        (True, False, 'PositionOnly'): 'LSS-P+L',
        (True, False, 'Position'): 'LSS-C+P+L',
        (False, True, 'Position'): 'LSS-C+P+PR',
        (True, True, 'Position'): 'LSS-C+P+L+PR',
        (True, True, 'Naive'): 'LSS-L+PR',
    }
    return labels.get((use_lf, use_pf, index_impl))

def get_setting_label_PkwiseSynSearch(dict_param):
    """Return 'LSS-PKWISE' for the labelled (qlen, kmax) pairs, else None.

    Both values are read from dict_param and compared as strings.
    """
    qlen = dict_param['qlen']
    kmax = dict_param['kmax']
    labelled_pairs = (('1', '1'), ('3', '2'), ('5', '2'), ('7', '2'), ('9', '2'))
    return 'LSS-PKWISE' if (qlen, kmax) in labelled_pairs else None

In [None]:
# Algorithms ('<name>_<version>' keys of dict_alg) included by default in the
# df_time_by_* queries. Restrict to ['PrefixSearch_6.20'] to drop the Pkwise runs.
alg_list = [
    'PrefixSearch_6.20',
    'PkwiseSynSearch_1.02',
]

In [None]:
# Two-way lookup between '<name>_<version>' algorithm keys and database ids,
# filled from the algorithms of project 1.
dict_alg, dict_aid = {}, {}
with SQLiteContext() as cur:
    rows = cur.execute("SELECT id, name, version FROM projectManager_algorithm WHERE project_id=1").fetchall()
    for row in rows:
        alg_key = row['name'] + '_' + row['version']
        dict_alg[alg_key] = row['id']
        dict_aid[row['id']] = alg_key

In [None]:
# Identifies a dataset (or, with '*' in one field, a family of datasets).
DataInfo = collections.namedtuple('DataInfo', 'name size nr qlen')

In [None]:
# Dataset lookup tables for project 1 (dataset ids >= 71):
#   dict_data[dname][(size, nr, qlen)] -> dataset id
#   dict_did[dataset id]               -> (dname, size, nr, qlen)
# The dataset name is split on its last three underscores, and the first
# character of each of the three trailing parts is dropped.
# NOTE(review): presumably names look like '<dname>_n<size>_r<nr>_q<qlen>' - confirm.
dict_data = {}
dict_did = {}
with SQLiteContext() as cur:
    cur.execute("SELECT id, name FROM projectManager_dataset WHERE project_id=1 and id >= 71")
    rows = cur.fetchall()
    for row in rows:
        dname, size, nr, qlen = row['name'].rsplit('_', 3)
        size, nr, qlen = size[1:], nr[1:], qlen[1:]
        # setdefault replaces the former bare try/except "create if missing".
        dict_data.setdefault(dname, {})[(size, nr, qlen)] = row['id']
        dict_did[row['id']] = (dname, size, nr, qlen)

In [None]:
def get_did_list(data_info):
    """List the datasets matching `data_info` along its one wildcard field.

    Exactly one of size/nr/qlen must be '*'. Returns a list of
    (value of the wildcard field, dataset id) pairs for the datasets of
    `data_info.name` whose two other fields match, sorted by int(value).
    """
    assert type(data_info) is DataInfo
    dname, size, nr, qlen = data_info
    fields = [size, nr, qlen]
    const_idx_list = [position for position, value in enumerate(fields) if value != '*']
    assert len(const_idx_list) == 2
    target_idx = fields.index('*')
    matches = [
        (key[target_idx], did)
        for key, did in dict_data[dname].items()
        if all(key[position] == fields[position] for position in const_idx_list)
    ]
    matches.sort(key=lambda pair: int(pair[0]))
    return matches

In [None]:
def get_size_did_list(dname, nr, qlen):
    """Return (size, dataset id) pairs of `dname` with the given nr and qlen, sorted by int(size)."""
    pairs = [
        (key[0], did)
        for key, did in dict_data[dname].items()
        if key[1] == nr and key[2] == qlen
    ]
    pairs.sort(key=lambda pair: int(pair[0]))
    return pairs

In [None]:
def get_qlen_did_list(dname, size, nr):
    """Return (qlen, dataset id) pairs of `dname` with the given size and nr, sorted by int(qlen)."""
    pairs = [
        (key[2], did)
        for key, did in dict_data[dname].items()
        if key[0] == size and key[1] == nr
    ]
    pairs.sort(key=lambda pair: int(pair[0]))
    return pairs

In [None]:
# Marker symbols available to the plots (a plain list, not a cycling iterator).
marker_list = list('+xo^Dsv')
# Endless iterator that always yields 0.
dashes_list = itertools.cycle([0])

In [None]:
def parse_data_info(data_info):
    """Check that `data_info` is a DataInfo and return its fields as a plain tuple.

    Returns (dname, size, nr, qlen).
    """
    assert type(data_info) is DataInfo
    return tuple(data_info)

In [None]:
def df_time_by_filtering(data_info, alg_name=None):
    """Collect timing and verification counts for one fully specified dataset.

    data_info: DataInfo with concrete size, nr and qlen (no '*').
    alg_name: key of dict_alg to restrict the query to. When None (default),
        every algorithm in the notebook-level `alg_list` is included, which is
        what this function previously always did: the parameter was shadowed
        by a comprehension variable and ignored, and its old default referred
        to `alg_name_final`, a name not defined before this cell.

    Returns a DataFrame with one row per experiment that has a setting label
    (rows whose label is None are skipped). Time_* values are the stored
    values divided by 1000. `setting` is categorical over `setting_list`, so
    labels outside that list become NaN. An empty result yields an empty
    frame with the same columns.
    """
    columns = ['algorithm_id', 'theta', 'setting', 'Time_Total', 'Time_SearchPerQuery_MEAN',
               'Num_QS_Verified', 'Num_TS_Verified', 'Num_Verified']
    dname, size, nr, qlen = parse_data_info(data_info)
    did = dict_data[dname][(size, nr, qlen)]
    alg_names = alg_list if alg_name is None else [alg_name]
    predicates_alg = ' OR '.join('algorithm_id={}'.format(dict_alg[name]) for name in alg_names)
    predicates = ' AND dataset_id={} AND ({})'.format(did, predicates_alg)
    output_list = []
    for row in execute_query(query_tmpl.format(predicates)):
        dict_param = str2dict(row['parameter'])
        dict_rslt = str2dict(row['result'])
        theta = float(dict_param['theta'])
        setting = get_setting_label(row['algorithm_id'], dict_param)
        if setting is None:
            continue
        num_qs_verified = int(dict_rslt['Num_QS_Verified'])
        num_ts_verified = int(dict_rslt['Num_TS_Verified'])
        output_list.append({
            'algorithm_id': row['algorithm_id'],
            'theta': theta,
            'setting': setting,
            'Time_Total': float(dict_rslt['Time_Total'])/1000,
            'Time_SearchPerQuery_MEAN': float(dict_rslt['Time_SearchPerQuery_MEAN'])/1000,
            'Num_QS_Verified': num_qs_verified,
            'Num_TS_Verified': num_ts_verified,
            'Num_Verified': num_qs_verified + num_ts_verified,
        })
    # Explicit columns keep the frame well-formed even when no row matched.
    df = pd.DataFrame(output_list, columns=columns)
    df['setting'] = pd.Categorical(df['setting'], categories=setting_list)
    return df

In [None]:
def df_time_by_size(data_info, alg_list=alg_list):
    """Collect measurements over the family of datasets selected by `data_info`.

    data_info: DataInfo with exactly one field set to '*'; one query is issued
        per dataset id returned by get_did_list(data_info), which also
        validates data_info.
    alg_list: keys of dict_alg whose experiments are included.

    Returns a DataFrame with one row per experiment that has a setting label.
    `n` is the result's 'Dataset_numIndexed'. Time_* values are the stored
    values divided by 1000 (presumably ms to s, matching the '(s)' axis labels
    of the plot helpers - confirm). `setting` is categorical over
    `setting_list`. An empty result yields an empty frame with the same columns.
    """
    columns = ['n', 'theta', 'setting', 'Time_Total', 'Time_QS_Total', 'Time_TS_Total',
               'Time_SearchPerQuery_MEAN', 'Time_BuildIndex', 'Num_Verified', 'Num_Result',
               'Num_QS_Verified', 'Num_TS_Verified']

    def scaled_time(dict_rslt, *keys):
        """Return dict_rslt[k]/1000 for the first k of `keys` that is present."""
        for key in keys:
            if key in dict_rslt:
                return float(dict_rslt[key])/1000
        raise KeyError(keys[-1])

    did_pairs = get_did_list(data_info)
    predicates_alg = ' OR '.join('algorithm_id={}'.format(dict_alg[alg_name]) for alg_name in alg_list)
    output_list = []
    for _, did in did_pairs:
        predicates = ' AND dataset_id={} AND ({})'.format(did, predicates_alg)
        for row in execute_query(query_tmpl.format(predicates)):
            aid = int(row['algorithm_id'])
            dict_param = str2dict(row['parameter'])
            dict_rslt = str2dict(row['result'])
            # The Pkwise label is derived from qlen, which is taken from the result record.
            dict_param['qlen'] = dict_rslt['Dataset_qlen']

            num_indexed = int(dict_rslt['Dataset_numIndexed'])
            theta = float(dict_param['theta'])
            setting = get_setting_label(aid, dict_param)
            if setting is None:
                continue
            num_qs_verified = int(dict_rslt['Num_QS_Verified'])
            num_ts_verified = int(dict_rslt['Num_TS_Verified'])
            output_list.append({
                'n': num_indexed,
                'theta': theta,
                'setting': setting,
                'Time_Total': float(dict_rslt['Time_Total'])/1000,
                # Result records use either spelling of these two keys.
                'Time_QS_Total': scaled_time(dict_rslt, 'Time_QSTotal', 'Time_QS_Total'),
                'Time_TS_Total': scaled_time(dict_rslt, 'Time_TSTotal', 'Time_TS_Total'),
                'Time_SearchPerQuery_MEAN': float(dict_rslt['Time_SearchPerQuery_MEAN'])/1000,
                'Time_BuildIndex': float(dict_rslt['Time_BuildIndex'])/1000,
                'Num_Verified': num_qs_verified + num_ts_verified,
                'Num_Result': int(dict_rslt['Num_Result']),
                'Num_QS_Verified': num_qs_verified,
                'Num_TS_Verified': num_ts_verified,
            })
    # Explicit columns keep the frame well-formed even when no row matched.
    df = pd.DataFrame(output_list, columns=columns)
    df['setting'] = pd.Categorical(df['setting'], categories=setting_list)
    return df

In [None]:
def df_time_by_qlen(data_info, alg_list=alg_list):
    """Collect measurements over the datasets selected by `data_info`, keyed by qlen.

    data_info: DataInfo with exactly one field set to '*' (intended: qlen);
        get_did_list(data_info) validates it and supplies the
        (value, dataset id) pairs, the value being stored as int in `qlen`.
    alg_list: keys of dict_alg whose experiments are included.

    Returns a DataFrame with one row per experiment that has a setting label.
    Time_Total, Time_QS_Total, Time_TS_Total and Time_SearchPerQuery_MEAN are
    the stored values divided by 1000; Time_IndexFilter and Time_Validation
    are sums of the stored QS/TS values and are NOT divided. `setting` is
    categorical over `setting_list`. An empty result yields an empty frame
    with the same columns.
    """
    columns = ['qlen', 'theta', 'setting', 'Time_Total', 'Time_QS_Total', 'Time_TS_Total',
               'Time_SearchPerQuery_MEAN', 'Time_IndexFilter', 'Time_Validation', 'Num_Result',
               'Num_Verified', 'Num_QS_Verified', 'Num_TS_Verified', 'Len_Retrieved']

    def scaled_time(dict_rslt, *keys):
        """Return dict_rslt[k]/1000 for the first k of `keys` that is present."""
        for key in keys:
            if key in dict_rslt:
                return float(dict_rslt[key])/1000
        raise KeyError(keys[-1])

    qlen_did_list = get_did_list(data_info)
    predicates_alg = ' OR '.join('algorithm_id={}'.format(dict_alg[alg_name]) for alg_name in alg_list)
    output_list = []
    for qlen, did in qlen_did_list:
        predicates = ' AND dataset_id={} AND ({})'.format(did, predicates_alg)
        for row in execute_query(query_tmpl.format(predicates)):
            aid = int(row['algorithm_id'])
            dict_param = str2dict(row['parameter'])
            dict_rslt = str2dict(row['result'])
            # The Pkwise label is derived from qlen, which is taken from the result record.
            dict_param['qlen'] = dict_rslt['Dataset_qlen']

            theta = float(dict_param['theta'])
            setting = get_setting_label(aid, dict_param)
            if setting is None:
                continue
            num_qs_verified = int(dict_rslt['Num_QS_Verified'])
            num_ts_verified = int(dict_rslt['Num_TS_Verified'])
            output_list.append({
                'qlen': int(qlen),
                'theta': theta,
                'setting': setting,
                'Time_Total': float(dict_rslt['Time_Total'])/1000,
                # Result records use either spelling of these two keys.
                'Time_QS_Total': scaled_time(dict_rslt, 'Time_QSTotal', 'Time_QS_Total'),
                'Time_TS_Total': scaled_time(dict_rslt, 'Time_TSTotal', 'Time_TS_Total'),
                'Time_SearchPerQuery_MEAN': float(dict_rslt['Time_SearchPerQuery_MEAN'])/1000,
                'Time_IndexFilter': float(dict_rslt['Time_QS_IndexFilter']) + float(dict_rslt['Time_TS_IndexFilter']),
                'Time_Validation': float(dict_rslt['Time_QS_Validation']) + float(dict_rslt['Time_TS_Validation']),
                'Num_Result': int(dict_rslt['Num_Result']),
                'Num_Verified': num_qs_verified + num_ts_verified,
                'Num_QS_Verified': num_qs_verified,
                'Num_TS_Verified': num_ts_verified,
                'Len_Retrieved': int(dict_rslt['Len_QS_Retrieved']) + int(dict_rslt['Len_TS_Retrieved']),
            })
    # Explicit columns keep the frame well-formed even when no row matched.
    df = pd.DataFrame(output_list, columns=columns)
    df['setting'] = pd.Categorical(df['setting'], categories=setting_list)
    return df

In [None]:
def df_time_by_nr(data_info, alg_list=alg_list):
    """Collect timing measurements over the datasets selected by `data_info`, keyed by nr.

    data_info: DataInfo with exactly one field set to '*' (intended: nr);
        get_did_list(data_info) validates it and supplies the
        (value, dataset id) pairs, the value being stored as int in `nr`.
    alg_list: keys of dict_alg whose experiments are included.

    Returns a DataFrame with one row per experiment that has a setting label.
    Time_* values are the stored values divided by 1000. `setting` is
    categorical over `setting_list`. An empty result yields an empty frame
    with the same columns.
    """
    columns = ['nr', 'theta', 'setting', 'Time_Total', 'Time_SearchPerQuery_MEAN', 'Time_BuildIndex']
    assert type(data_info) == DataInfo
    nr_did_list = get_did_list(data_info)
    predicates_alg = ' OR '.join('algorithm_id={}'.format(dict_alg[alg_name]) for alg_name in alg_list)
    output_list = []
    for nr, did in nr_did_list:
        predicates = ' AND dataset_id={} AND ({})'.format(did, predicates_alg)
        for row in execute_query(query_tmpl.format(predicates)):
            aid = int(row['algorithm_id'])
            dict_param = str2dict(row['parameter'])
            dict_rslt = str2dict(row['result'])
            # The Pkwise label is derived from qlen, which is taken from the result record.
            dict_param['qlen'] = dict_rslt['Dataset_qlen']

            theta = float(dict_param['theta'])
            setting = get_setting_label(aid, dict_param)
            if setting is None:
                continue
            output_list.append({
                'nr': int(nr),
                'theta': theta,
                'setting': setting,
                'Time_Total': float(dict_rslt['Time_Total'])/1000,
                'Time_SearchPerQuery_MEAN': float(dict_rslt['Time_SearchPerQuery_MEAN'])/1000,
                'Time_BuildIndex': float(dict_rslt['Time_BuildIndex'])/1000,
            })
    # Explicit columns keep the frame well-formed even when no row matched.
    df = pd.DataFrame(output_list, columns=columns)
    df['setting'] = pd.Categorical(df['setting'], categories=setting_list)
    return df

In [None]:
def df_acc(alg_name=None):
    """Collect result counts of every experiment of one algorithm.

    alg_name: key of dict_alg. When None (default), the notebook-level
        `alg_name_final` is used; it is looked up at call time so that this
        function can be defined before that name exists.
        NOTE(review): `alg_name_final` is not defined in the visible cells.

    Returns a DataFrame with one row per experiment that has a setting label:
    dataset name prefix (text before the first '_'), n, nr, qlen, theta,
    setting and the Num_*Result counts. `setting` is categorical over
    `setting_list`. An empty result yields an empty frame with the same columns.
    """
    if alg_name is None:
        alg_name = alg_name_final
    columns = ['data_name', 'n', 'nr', 'qlen', 'theta', 'setting',
               'Num_Result', 'Num_QS_Result', 'Num_TS_Result']
    query = query_tmpl.format('AND algorithm_id={}'.format(dict_alg[alg_name]))
    output_list = []
    for row in execute_query(query):
        aid = int(row['algorithm_id'])
        dict_param = str2dict(row['parameter'])
        dict_rslt = str2dict(row['result'])

        output = {
            'data_name': dict_rslt['Dataset_Name'].split('_', 1)[0],
            'n': int(dict_rslt['Dataset_nt']),
            'nr': int(dict_rslt['Dataset_nr']),
            'qlen': int(dict_rslt['Dataset_qlen']),
            'theta': float(dict_param['theta']),
            'setting': get_setting_label(aid, dict_param),
        }
        if output['setting'] is None:
            continue
        output['Num_Result'] = int(dict_rslt['Num_Result'])
        output['Num_QS_Result'] = int(dict_rslt['Num_QS_Result'])
        output['Num_TS_Result'] = int(dict_rslt['Num_TS_Result'])
        output_list.append(output)

    # Explicit columns keep the frame well-formed even when no row matched.
    df = pd.DataFrame(output_list, columns=columns)
    df['setting'] = pd.Categorical(df['setting'], categories=setting_list)
    return df

In [None]:
def df_from_PrefixSearchFilterPowerTest(path='../tmp/PrefixSearchFilterPowerTest.txt'):
    """Load the filter-power test log into a DataFrame.

    path: log file to read. Each non-blank line holds tab-separated
        'key:value' tokens; the 'Param' value is itself a ', '-separated list
        of 'key:value' pairs once its first character and last three
        characters are removed.

    Returns one row per log line with theta, dataset name prefix, n, nr, qlen,
    the PrefixSearch setting label, verification counts and Time_Total/1000.
    `setting` is categorical over a wider label list than the notebook-level
    `setting_list`; lines without a label are kept with setting NaN.
    An empty file yields an empty frame with the same columns.
    """
    categories = ['LSS-NAIVE', 'LSS-C', 'LSS-P', 'LSS-L', 'LSS-PR', 'LSS-C+P', 'LSS-C+L', 'LSS-P+L', 'LSS-C+P+L', 'LSS-C+P+L+PR']
    columns = ['theta', 'data_name', 'n', 'nr', 'qlen', 'setting',
               'Num_QS_Verified', 'Num_TS_Verified', 'Num_Verified', 'Time_Total']
    output_list = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            row = dict(token.split(':', 1) for token in line.split('\t'))
            dict_param = dict(pair.split(':', 1) for pair in row['Param'][1:-3].split(', '))
            num_qs_verified = int(row['Num_QS_Verified'])
            num_ts_verified = int(row['Num_TS_Verified'])
            output_list.append({
                'theta': float(dict_param['theta']),
                'data_name': row['Dataset_Name'].split('_', 1)[0],
                'n': int(row['Dataset_nt']),
                'nr': int(row['Dataset_nr']),
                'qlen': int(row['Dataset_qlen']),
                # 28 is the algorithm id of PrefixSearch_6.20 in get_setting_label.
                'setting': get_setting_label(28, dict_param),
                'Num_QS_Verified': num_qs_verified,
                'Num_TS_Verified': num_ts_verified,
                'Num_Verified': num_qs_verified + num_ts_verified,
                'Time_Total': float(row['Time_Total'])/1000,
            })
    df = pd.DataFrame(output_list, columns=columns)
    df['setting'] = pd.Categorical(df['setting'], categories=categories)
    return df

In [None]:
def df_from_ExactPrefixSearch(path='../tmp/ExactPrefixSearch.txt'):
    """Load the exact-search log into a DataFrame of result counts.

    path: log file to read. Each non-blank line holds tab-separated
        'key:value' tokens; the 'Param' value is itself a ', '-separated list
        of 'key:value' pairs once its first character and last three
        characters are removed.

    Returns one row per distinct record with theta, dataset name prefix
    (text before the first '_'), n, nr, qlen and the Num_*Result counts.
    Duplicate rows are dropped. An empty file yields an empty frame with the
    same columns.
    """
    columns = ['theta', 'data_name', 'n', 'nr', 'qlen',
               'Num_QS_Result', 'Num_TS_Result', 'Num_Result']
    output_list = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            row = dict(token.split(':', 1) for token in line.split('\t'))
            dict_param = dict(pair.split(':', 1) for pair in row['Param'][1:-3].split(', '))
            output_list.append({
                'theta': float(dict_param['theta']),
                'data_name': row['Dataset_Name'].split('_', 1)[0],
                'n': int(row['Dataset_nt']),
                'nr': int(row['Dataset_nr']),
                'qlen': int(row['Dataset_qlen']),
                'Num_QS_Result': int(row['Num_QS_Result']),
                'Num_TS_Result': int(row['Num_TS_Result']),
                'Num_Result': int(row['Num_Result']),
            })
    df = pd.DataFrame(output_list, columns=columns)
    return df.drop_duplicates()

In [None]:
def plot_verify_by_filtering(theta, qlen):
    """Bar-plot the number of verified candidates per setting for three datasets.

    theta, qlen: values (converted with float/int) selecting the rows of the
        filter-power test log to plot.

    Draws one log-scaled panel each for WIKI, PUBMED and AMAZON.
    """
    df = df_from_PrefixSearchFilterPowerTest()
    # One combined mask instead of chained boolean indexing.
    df = df[(df.theta == float(theta)) & (df.qlen == int(qlen))]
    f, axes = plt.subplots(1, 3, figsize=(21, 6), sharex=True, sharey=False)
    for ax, data_name in zip(axes, ['WIKI', 'PUBMED', 'AMAZON']):
        # barplot is the axes-level function behind catplot(kind='bar'); catplot
        # is figure-level and does not draw on a supplied `ax`, which is why
        # the stray figures used to be closed by number afterwards.
        sb.barplot(x='setting', y='Num_Verified', hue='setting', data=df[df.data_name == data_name], ax=ax)
        ax.set(ylabel='# Verified', yscale='log', title=data_name)

In [None]:
def plot_time_by_filtering(data_info, alg_name=None):
    """Bar-plot total time, mean query time and verification count against theta.

    data_info: DataInfo with concrete size, nr and qlen.
    alg_name: forwarded to df_time_by_filtering. The previous default referred
        to `alg_name_final`, which is not defined before this cell, and the
        global was forwarded instead of this parameter.

    Draws three log-scaled panels, one per measure, with one bar group per
    theta and one bar per setting.
    """
    df = df_time_by_filtering(data_info, alg_name=alg_name)
    f, axes = plt.subplots(1, 3, figsize=(21, 6), sharex=False, sharey=False)
    panels = [
        ('Time_Total', 'Total Time (sec)'),
        ('Time_SearchPerQuery_MEAN', 'Mean query time (s)'),
        ('Num_Verified', '# Verified'),
    ]
    for ax, (measure, ylabel) in zip(axes, panels):
        # barplot is the axes-level function behind catplot(kind='bar'); catplot
        # is figure-level and does not draw on a supplied `ax`, which is why
        # the stray figures used to be closed by number afterwards.
        sb.barplot(x='theta', y=measure, hue='setting', data=df, ax=ax)
        ax.set(ylabel=ylabel, yscale='log')

In [None]:
def plot_time_by_size(data_info, alg_list=alg_list, y='Time_SearchPerQuery_MEAN', ylabel='Mean query time (s)'):
    """Line-plot measure `y` against dataset size `n`, one panel per theta.

    data_info, alg_list: forwarded to df_time_by_size.
    y: column of that frame to plot; ylabel: its axis label.

    Panels cover theta = 0.6 ... 1.0 with log-log axes; only the last panel
    keeps its legend, placed to the right of the axes.
    """
    df = df_time_by_size(data_info, alg_list=alg_list)
    thetas = [0.6, 0.7, 0.8, 0.9, 1.0]
    f, axes = plt.subplots(1, 5, figsize=(25, 5), sharex=False, sharey=False)
    for i, theta in enumerate(thetas):
        ax = axes[i]
        sb.lineplot(data=df[df.theta==theta], x="n", y=y, hue="setting", marker='o', ax=ax)
        ax.set(ylabel=ylabel, xscale='log', yscale='log', title='theta=%.1f'%theta)
        if i == len(thetas) - 1:
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            # No legend exists when the panel has nothing to draw.
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()

In [None]:
def plot_time_by_qlen(data_info, alg_list=alg_list, y='Time_SearchPerQuery_MEAN', ylabel='Mean query time (s)'):
    """Line-plot measure `y` against `qlen`, one panel per theta.

    data_info, alg_list: forwarded to df_time_by_qlen; data_info is also
        passed to get_did_list to obtain the x tick values.
    y: column of that frame to plot; ylabel: its axis label.

    Panels cover theta = 0.6 ... 1.0 with a log y axis; only the last panel
    keeps its legend, placed to the right of the axes.
    """
    qlen_list = [int(value) for value, _ in get_did_list(data_info)]
    df = df_time_by_qlen(data_info, alg_list=alg_list)
    thetas = [0.6, 0.7, 0.8, 0.9, 1.0]
    f, axes = plt.subplots(1, 5, figsize=(25, 5), sharex=False, sharey=False)
    for i, theta in enumerate(thetas):
        ax = axes[i]
        sb.lineplot(data=df[df.theta==theta], x="qlen", y=y, hue="setting", marker='o', ax=ax)
        ax.set(xticks=qlen_list, ylabel=ylabel, yscale='log', title='theta=%.1f'%theta)
        if i == len(thetas) - 1:
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            # No legend exists when the panel has nothing to draw.
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()

In [None]:
def plot_time_by_nr(data_info, alg_list=alg_list):
    """Line-plot the mean query time against `nr`, one panel per theta.

    data_info, alg_list: forwarded to df_time_by_nr.

    Panels cover theta = 0.6 ... 1.0 with log-log axes; only the last panel
    keeps its legend, placed to the right of the axes.
    """
    df = df_time_by_nr(data_info, alg_list=alg_list)
    thetas = [0.6, 0.7, 0.8, 0.9, 1.0]
    f, axes = plt.subplots(1, 5, figsize=(25, 5), sharex=False, sharey=False)
    for i, theta in enumerate(thetas):
        ax = axes[i]
        sb.lineplot(data=df[df.theta==theta], x="nr", y="Time_SearchPerQuery_MEAN", hue="setting", marker='o', ax=ax)
        ax.set(ylabel='Mean query time (s)', xscale='log', yscale='log', title='theta=%.1f'%theta)
        if i == len(thetas) - 1:
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        else:
            # No legend exists when the panel has nothing to draw.
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()

In [None]:
def compare_alg(alg0_name, alg1_name):
    """Compare two algorithms experiment by experiment.

    alg0_name, alg1_name: keys of dict_alg ('<name>_<version>').

    Experiments of both algorithms are labelled with
    get_setting_label_PrefixSearch (from bLF / bPF / index_impl), unlabelled
    ones are skipped, and the two sides are outer-merged on
    (dataset, size, theta, setting). Columns of alg0 get the suffix '_0',
    those of alg1 '_1'; diff_* columns are alg0 minus alg1, diff_*_ratio
    columns are that difference divided by the alg0 value. The mean of each
    diff column is printed. Returns the merged frame with columns sorted by
    name.

    The label call previously passed three positional arguments to
    get_setting_label, which takes two, and the `setting` column was cast to
    a categorical whose categories matched none of the current labels.
    `setting` is now left as plain strings so that every label takes part in
    the merge.
    """
    columns = ['dataset', 'size', 'alg', 'theta', 'setting', 'Time_Total', 'Time_QS_Total',
               'Time_TS_Total', 'Time_Search', 'Num_Verified', 'Num_QS_Result', 'Num_TS_Result']

    def scaled_time(dict_rslt, *keys):
        """Return dict_rslt[k]/1000 for the first k of `keys` that is present."""
        for key in keys:
            if key in dict_rslt:
                return float(dict_rslt[key])/1000
        raise KeyError(keys[-1])

    aid0 = dict_alg[alg0_name]
    aid1 = dict_alg[alg1_name]
    predicates = 'AND (algorithm_id={} OR algorithm_id={})'.format(aid0, aid1)
    output_list = []
    for row in execute_query(query_tmpl.format(predicates)):
        dict_param = str2dict(row['parameter'])
        dict_rslt = str2dict(row['result'])
        theta = float(dict_param['theta'])
        setting = get_setting_label_PrefixSearch(dict_param)
        if setting is None:
            continue
        # Result records use either spelling of these two keys.
        time_qs = scaled_time(dict_rslt, 'Time_QSTotal', 'Time_QS_Total')
        time_ts = scaled_time(dict_rslt, 'Time_TSTotal', 'Time_TS_Total')
        output_list.append({
            'dataset': dict_rslt['Dataset_Name'],
            # NOTE(review): 'size' duplicates the dataset name, as before - confirm intent.
            'size': dict_rslt['Dataset_Name'],
            'alg': dict_rslt['Alg_Name']+'_'+dict_rslt['Alg_Version'],
            'theta': theta,
            'setting': setting,
            'Time_Total': float(dict_rslt['Time_Total'])/1000,
            'Time_QS_Total': time_qs,
            'Time_TS_Total': time_ts,
            'Time_Search': time_qs + time_ts,
            'Num_Verified': int(dict_rslt['Num_QS_Verified']) + int(dict_rslt['Num_TS_Verified']),
            'Num_QS_Result': int(dict_rslt['Num_QS_Result']),
            'Num_TS_Result': int(dict_rslt['Num_TS_Result']),
        })
    # Explicit columns keep the frame well-formed even when no row matched.
    df = pd.DataFrame(output_list, columns=columns)

    df0 = df[df.alg==alg0_name].drop(['alg'], axis=1)
    df1 = df[df.alg==alg1_name].drop(['alg'], axis=1)
    df_merged = pd.merge(df0, df1, on=['dataset', 'size', 'theta', 'setting'], how='outer', suffixes=('_0', '_1'))

    # (name of the diff column, measure it is computed from)
    diff_specs = [
        ('diff_Time', 'Time_Total'),
        ('diff_Time_Search', 'Time_Search'),
        ('diff_Verify', 'Num_Verified'),
    ]
    for diff_name, measure in diff_specs:
        delta = df_merged[measure + '_0'] - df_merged[measure + '_1']
        df_merged[diff_name] = delta
        df_merged[diff_name + '_ratio'] = delta/df_merged[measure + '_0']
    df_merged['diff_QS_Result'] = df_merged['Num_QS_Result_0'] - df_merged['Num_QS_Result_1']
    df_merged['diff_TS_Result'] = df_merged['Num_TS_Result_0'] - df_merged['Num_TS_Result_1']

    for name in ['diff_Time', 'diff_Time_ratio', 'diff_Time_Search', 'diff_Time_Search_ratio',
                 'diff_Verify', 'diff_Verify_ratio', 'diff_QS_Result', 'diff_TS_Result']:
        print(name + ':', df_merged[name].mean())
    df_merged = df_merged.reindex(sorted(df_merged.columns), axis=1)
    return df_merged

In [None]:
def performance_comparison():
    """Print element-wise ratios of a measure between pairs of settings.

    NOTE(review): a second `performance_comparison` is defined later in this
    notebook and rebinds the name, so this version is not the one in effect
    after a top-to-bottom run.
    NOTE(review): `theta0`, `n0` and `qlen0` are not assigned in this function;
    they are read as globals at call time.
    NOTE(review): the labels used here ('IF', 'NaivePF', 'ICF', 'IPF', 'LF',
    'PF') are not among the labels returned by get_setting_label_PrefixSearch
    or get_setting_label_PkwiseSynSearch, so the selections below look empty -
    confirm.
    """
    
    def compare_algs(df, alg0, alg1, data_info=None, measure='Time_SearchPerQuery_MEAN'):
        # Values of `measure` for each setting, truncated to the shorter of the
        # two and divided element-wise (alg0 / alg1).
        vals0 = df[df.setting==alg0][measure].values
        vals1 = df[df.setting==alg1][measure].values
        n = min(len(vals0), len(vals1))
        print(measure, '' if data_info is None else data_info.name, '{}/{}'.format(alg0, alg1), vals0[:n]/vals1[:n])
    
    # Verification counts from the filter-power test log at one (theta, n, qlen).
    df = df_from_PrefixSearchFilterPowerTest()
    df = df[df.theta==theta0][df.n==n0][df.qlen==qlen0]
    compare_algs(df, 'IF', 'NaivePF', measure='Num_Verified')
    compare_algs(df, 'IF', 'ICF', measure='Num_Verified')
    compare_algs(df, 'IF', 'IPF', measure='Num_Verified')
    
    # Mean query time over varying dataset size, restricted to theta0.
    for data_info in [DataInfo('WIKI', '*', '107836', '3')]:#, DataInfo('PUBMED', '*', '79011', '3'), DataInfo('AMAZON', '*', '107836', '3')]:
        df = df_time_by_size(data_info)
        df = df[df.theta==theta0]
        for setting in ['IPF', 'LF', 'PF']:
            compare_algs(df, 'ICF', setting)
        compare_algs(df, 'NaivePF', 'PF')
        compare_algs(df, 'IF', 'NaivePF')

In [None]:
def df_heuristic_accuracy(n=100000, nr=31622, setting='PF'):
    """Compare result counts of df_acc() against the exact-search log.

    n, nr, setting: values the returned rows are filtered on. The defaults
        reproduce the previously hard-coded filter.
        NOTE(review): 'PF' is not among the labels produced by
        get_setting_label_PrefixSearch / get_setting_label_PkwiseSynSearch, so
        the default filter looks like it selects nothing - pass a current
        label (e.g. one from setting_list) and confirm.

    The exact counts (suffix '0') are inner-joined with the counts from
    df_acc() on (data_name, n, nr, qlen, theta). Acc, Acc_QS and Acc_TS are
    the ratios of the df_acc() counts to the exact counts.
    """
    df0 = df_from_ExactPrefixSearch()
    df1 = df_acc()
    df = df0.merge(df1, on=['data_name', 'n', 'nr', 'qlen', 'theta'], suffixes=('0', ''), how='inner')
    # .copy() so the ratio columns are added to an independent frame, not to a slice.
    df = df[['data_name', 'n', 'nr', 'qlen', 'theta', 'setting', 'Num_Result0', 'Num_Result', 'Num_QS_Result0', 'Num_QS_Result', 'Num_TS_Result0', 'Num_TS_Result']].copy()
    df['Acc'] = df['Num_Result']/df['Num_Result0']
    df['Acc_QS'] = df['Num_QS_Result']/df['Num_QS_Result0']
    df['Acc_TS'] = df['Num_TS_Result']/df['Num_TS_Result0']

    # One combined mask instead of chained boolean indexing.
    return df[(df.n == n) & (df.nr == nr) & (df.setting == setting)]

In [None]:
def performance_comparison():
    """Print element-wise ratios of a measure between pairs of settings.

    Five groups are printed: verification counts from the filter-power test
    log at (theta0, n0, qlen0), then mean query times while varying dataset
    size, query length, nr and theta. Each printed line shows the measure, the
    dataset name (if any), 'alg0/alg1' and the array of ratios.

    NOTE(review): several labels compared below ('LSS-COUNT', 'LSS-POS',
    'LSS-LEN', 'LSS-PREFIX', 'LSS-PKDUCK', 'LSS-INDEX') are not produced by
    get_setting_label_PrefixSearch / get_setting_label_PkwiseSynSearch, so
    those comparisons look like they print empty arrays. They are kept as they
    were because the intended current labels cannot be determined here.
    """
    theta0 = 0.7
    qlen0 = 5
    n0 = 100000

    def compare_algs(df, alg0, alg1, data_info=None, measure='Time_SearchPerQuery_MEAN'):
        """Print measure(alg0)/measure(alg1), truncated to the shorter series."""
        vals0 = df[df.setting==alg0][measure].values
        vals1 = df[df.setting==alg1][measure].values
        n = min(len(vals0), len(vals1))
        print(measure, '' if data_info is None else data_info.name, '{}/{}'.format(alg0, alg1), vals0[:n]/vals1[:n])

    def compare_pairs(df, pairs, data_info):
        """Run compare_algs on every (alg0, alg1) pair, in order."""
        for alg0, alg1 in pairs:
            compare_algs(df, alg0, alg1, data_info)

    df = df_from_PrefixSearchFilterPowerTest()
    # One combined mask instead of chained boolean indexing.
    df = df[(df.theta == theta0) & (df.n == n0) & (df.qlen == qlen0)]
    for alg0, alg1 in [('LSS-NAIVE', 'LSS-COUNT'), ('LSS-COUNT', 'LSS-POS'), ('LSS-POS', 'LSS-LEN'),
                       ('LSS-LEN', 'LSS-PREFIX'), ('LSS-NAIVE', 'LSS-PKDUCK')]:
        compare_algs(df, alg0, alg1, measure='Num_Verified')

    print('Varying nt')
    for data_info in [DataInfo('WIKI', '*', '107836', '5'), DataInfo('PUBMED', '*', '79011', '5'), DataInfo('AMAZON', '*', '107836', '5')]:
        df = df_time_by_size(data_info)
        df = df[df.theta==theta0]
        compare_pairs(df, [('LSS-P+L', 'LSS-C+P'), ('LSS-P+L', 'LSS-C+P+L')], data_info)

    print('Varying qlen')
    for data_info in [DataInfo('WIKI', '100000', '107836', '*'), DataInfo('PUBMED', '100000', '79011', '*'), DataInfo('AMAZON', '100000', '107836', '*')]:
        df = df_time_by_qlen(data_info)
        df = df[df.theta==theta0]
        compare_pairs(df, [('LSS-COUNT', 'LSS-POS'), ('LSS-COUNT', 'LSS-LEN'), ('LSS-COUNT', 'LSS-PREFIX'),
                           ('LSS-NAIVE', 'LSS-PREFIX'), ('LSS-INDEX', 'LSS-NAIVE')], data_info)

    print('Varying nr')
    for data_info in [DataInfo('WIKI', '100000', '*', '5'), DataInfo('PUBMED', '100000', '*', '5'), DataInfo('AMAZON', '100000', '*', '5')]:
        df = df_time_by_nr(data_info)
        df = df[df.theta==theta0]
        compare_pairs(df, [('LSS-P', 'LSS-C+P'), ('LSS-P', 'LSS-C+P+L'),
                           ('LSS-P+L', 'LSS-C+P'), ('LSS-P+L', 'LSS-C+P+L')], data_info)

    print('Varying theta')
    for data_info in [DataInfo('WIKI', '100000', '107836', '5'), DataInfo('PUBMED', '100000', '79011', '5'), DataInfo('AMAZON', '100000', '107836', '5')]:
        # Keep the first row of every (theta, setting) group, ordered by theta.
        df = df_time_by_filtering(data_info).groupby(['theta', 'setting']).head(1).sort_values('theta')
        compare_pairs(df, [('LSS-P+L', 'LSS-C+P'), ('LSS-P+L', 'LSS-C+P+L')], data_info)