In [None]:
from os.path import basename, join, splitext
import itertools
import matplotlib.pyplot as plt
from numpy import ceil, log10
import pandas as pd
import seaborn as sb
from subprocess import call
%matplotlib inline
%run util.ipynb

In [None]:
class PlotGenerator:
    """Base class that renders timing measurements as a gnuplot line chart.

    A run writes three artifacts, all named after ``self.output_name``:
    ``result/<name>.result`` (tab-separated data, one gnuplot "index" block
    per setting), ``script/<name>.plot`` (the gnuplot script) and
    ``plot/<name>.pdf`` (the rendered, cropped figure).

    Subclasses are expected to set ``self.goal``, ``self.xlabel`` and
    ``self.xrange`` in their constructor, and to override ``load_df`` (must
    populate ``self.df``) and ``parse_row``.

    ``data_info`` is read through its ``name``, ``size``, ``qlen``, ``nr`` and
    ``lr`` attributes; a value of ``'*'`` marks the dimension being varied.
    """

    # gnuplot point-type codes, cycled through so each curve gets its own marker.
    pt_list = [2,3,8,4,6,12,1,10,14]

    def __init__(self, data_info, theta, ymax_offset):
        """Store the experiment description and the default plot settings.

        ymax_offset is added to the exponent of the upper y-range bound
        (see ``output_script``).
        """
        self.data_info = data_info
        self.theta = theta

        # Backslashes are emitted literally into the gnuplot label; they are
        # written as '\\' because '\(' is not a valid Python escape sequence.
        self.ylabel = 'Avg. execution time \\(sec\\)'
        self.xtics = None
        self.xmax = 0
        self.ymax = 0
        self.ymax_offset = ymax_offset
        self.xlog = True
        self.xformat = "10^{%L}"

        self.title_list = []
        # NOTE(review): `setting_list` is a module-level name not defined in
        # this cell -- presumably provided by `%run util.ipynb`; confirm.
        self.setting_list = setting_list

    @staticmethod
    def execute(data_info, theta, ymax_offset=0):
        """Pick the generator matching the '*' dimension of data_info and run it.

        Checked in order: size, qlen, nr, lr. Nothing happens when none of
        them equals '*'.
        """
        if data_info.size == '*': TimeBySizePlotGenerator(data_info, theta, ymax_offset).run()
        elif data_info.qlen == '*': TimeByQlenPlotGenerator(data_info, theta, ymax_offset).run()
        elif data_info.nr == '*': TimeByNrulePlotGenerator(data_info, theta, ymax_offset).run()
        elif data_info.lr == '*': TimeByLenRatioPlotGenerator(data_info, theta, ymax_offset).run()

    def run(self):
        """Load the data, then write the result file, the script and the plot."""
        # Dots (e.g. in the formatted theta) are replaced so the name is a
        # plain file stem.
        self.output_name = '{}__{}__{:.1f}'.format(self.goal, self.data_info.name, self.theta).replace('.', '_')
        self.load_df()
        self.write_result()
        self.output_script()
        self.output_plot()

    def load_df(self):
        """Hook: subclasses assign the measurement DataFrame to ``self.df``."""
        pass

    def parse_row(self, row):
        """Hook: return the text line written to the result file for `row`.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError('{} must override parse_row'.format(type(self).__name__))

    def update_xmax(self, row):
        """Hook called for every written row; a no-op in the base class."""
        pass

    def update_ymax(self, row):
        """Track the largest ``Time_SearchPerQuery_MEAN`` seen so far."""
        self.ymax = max(self.ymax, row.Time_SearchPerQuery_MEAN)

    def additional_settings(self, f):
        """Hook: write extra gnuplot commands to the open script file `f`."""
        pass

    def write_result(self):
        """Write one data block per setting to ``result/<output_name>.result``.

        Each block starts with a ``# <setting>`` comment line and ends with two
        blank lines. ``self.title_list`` receives the setting name for
        non-empty blocks and ``None`` for empty ones.
        """
        self.path_result = join('result', '{}.result'.format(self.output_name))
        df = self.df
        # Start from a clean list so calling run() twice on the same instance
        # does not produce duplicate plot entries.
        self.title_list = []
        with open(self.path_result, 'w') as f:
            for setting in self.setting_list:
                f.write("# {}\n".format(setting))
                # A single combined mask avoids pandas re-aligning a mask that
                # was built from the unfiltered frame.
                df_target = df[(df.theta == self.theta) & (df.setting == setting)]
                for row in df_target.itertuples():
                    self.update_xmax(row)
                    self.update_ymax(row)
                    f.write(self.parse_row(row))
                f.write('\n\n')
                self.title_list.append(setting if df_target.size > 0 else None)

    def output_script(self):
        """Write the gnuplot script to ``script/<output_name>.plot``.

        Also sets ``self.path_script`` and ``self.path_plot``.
        """
        self.path_script = join('script', '{}.plot'.format(self.output_name))
        self.path_plot = join('plot', '{}.pdf'.format(self.output_name))
        with open(self.path_script, 'w') as f:
            f.write('set xlabel "{}" font ",20"\n'.format(self.xlabel))
            f.write('set ylabel "{}" font ",20"\n'.format(self.ylabel))
            if self.xtics is not None: f.write('set xtics {}\n'.format(self.xtics))
            f.write('set xtics font ",16"\n')
            f.write('set ytics font ",16"\n')
            f.write('set xrange {}\n'.format(self.xrange))
            # The upper y bound is fixed at 10^(4.5 + ymax_offset); the tracked
            # self.ymax is not used here.
            f.write('set yrange [*:{}]\n'.format(10**(4.5+self.ymax_offset)))
            f.write('set key vertical maxrows 5\n')
            if self.xlog:
                f.write('set logscale x\n')
                f.write('set format x "{}"\n'.format(self.xformat))
            f.write('set logscale y\n')
            f.write('set format y "10^{%L}"\n')
            f.write('set size 0.6,0.5\n')
            self.additional_settings(f)

            f.write('set term postscript\n')
            f.write('set output"| ps2pdf - {}"\n'.format(self.path_plot))

            cmd_list = []
            # `idx` counts only the non-empty blocks actually addressed in the
            # plot command, while the dash type and point type follow the
            # position `tidx` of the setting in title_list.
            idx = 0
            for tidx, title in enumerate(self.title_list):
                if title is None: continue
                cmd = '"{}" index {} with linespoints lc "black" lw 2 ps 1.5 dt {} pt {} title "{}"'.format(
                    self.path_result, idx, tidx+1, PlotGenerator.pt_list[tidx%len(PlotGenerator.pt_list)], title)
                cmd_list.append(cmd)
                idx += 1
            f.write('plot\\\n'+',\\\n'.join(cmd_list))

    def output_plot(self):
        """Run gnuplot on the script, then crop the PDF in place with pdfcrop.

        The exit codes of the external commands are not checked.
        """
        call(['gnuplot', self.path_script])
        # NOTE(review): presumably gives the `ps2pdf` pipe time to finish
        # writing the PDF before it is cropped -- confirm.
        call(['sleep', '0.3'])
        call(['pdfcrop', self.path_plot, 'tmp_plot'])
        call(['mv', 'tmp_plot', self.path_plot])

In [None]:
class TimeBySizePlotGenerator(PlotGenerator):
    """Plots query time against dataset size.

    Datasets whose name ends in ``-DOC`` are labelled and ranged in
    documents (x taken from ``row.n_doc``); all others in strings (``row.n``).
    """

    def __init__(self, data_info, theta, ymax_offset, alg_list=alg_list, setting_list=setting_list, goal=None):
        """Configure labels and ranges; `goal` defaults to 'time_by_size'.

        The `alg_list` / `setting_list` defaults are the module-level values
        captured when this class is defined.
        """
        super().__init__(data_info, theta, ymax_offset)
        self.alg_list = alg_list
        self.setting_list = setting_list
        self.ymax_offset = ymax_offset
        self.xlog = True
        self.goal = goal if goal is not None else 'time_by_size'

        is_doc_dataset = self.data_info.name.endswith('-DOC')
        self.xlabel = 'Number of documents' if is_doc_dataset else 'Number of strings'
        self.xrange = '[1000:100000]' if is_doc_dataset else '[10000:1000000]'

    def load_df(self):
        """Fetch the measurements via ``df_time_by_size`` for the chosen algorithms."""
        self.df = df_time_by_size(self.data_info, self.alg_list)

    def parse_row(self, row):
        """Return '<size>\\t<mean time>\\n' for one measurement row."""
        x_value = row.n_doc if self.data_info.name.endswith('-DOC') else row.n
        return '{}\t{}\n'.format(x_value, row.Time_SearchPerQuery_MEAN)

In [None]:
class TimeByQlenPlotGenerator(PlotGenerator):
    """Plots query time against query size on a linear x axis from 1 to 9."""

    def __init__(self, data_info, theta, ymax_offset):
        """Set the fixed axis configuration for the query-size plot."""
        super().__init__(data_info, theta, ymax_offset)
        self.ymax_offset = ymax_offset
        self.xlog = False
        self.xrange = '[1:9]'
        self.xlabel = 'Query size'
        self.goal = 'time_by_qlen'

    def load_df(self):
        """Fetch the measurements via ``df_time_by_qlen``."""
        self.df = df_time_by_qlen(self.data_info)

    def parse_row(self, row):
        """Return '<qlen>\\t<mean time>\\n' for one measurement row."""
        fields = (row.qlen, row.Time_SearchPerQuery_MEAN)
        return '{}\t{}\n'.format(*fields)

    def additional_settings(self, f):
        """Place x tics at 1, 3, 5, 7, 9."""
        f.write('set xtics 1,2,9\n')

In [None]:
class TimeByNrulePlotGenerator(PlotGenerator):
    """Plots query time against the number of rules on a logarithmic x axis."""

    def __init__(self, data_info, theta, ymax_offset):
        """Set the fixed axis configuration for the rule-count plot."""
        super().__init__(data_info, theta, ymax_offset)
        # x tick labels are rendered as "2 <Symbol char \264> 10^{exponent}".
        self.xformat = "2{/Symbol\\264}10^{%L}"
        self.xlog = True
        self.xrange = '[1000:110000]'
        self.xlabel = 'Number of rules'
        self.goal = 'time_by_nr'

    def load_df(self):
        """Fetch the measurements via ``df_time_by_nr``."""
        self.df = df_time_by_nr(self.data_info)

    def parse_row(self, row):
        """Return '<nr>\\t<mean time>\\n' for one measurement row."""
        fields = (row.nr, row.Time_SearchPerQuery_MEAN)
        return '{}\t{}\n'.format(*fields)

In [None]:
class TimeByLenRatioPlotGenerator(PlotGenerator):
    """Plots query time against the string-size ratio on a linear x axis."""

    def __init__(self, data_info, theta, ymax_offset):
        """Set the fixed axis configuration for the length-ratio plot."""
        super().__init__(data_info, theta, ymax_offset)
        self.goal = 'time_by_lr'
        self.xlabel = 'Ratio of string size'
        self.xrange = '[0.2:1.0]'
        # Tick spacing of 0.2; xformat is stored alongside it even though the
        # x axis is linear here.
        self.xtics, self.xformat = 0.2, "%.1f"
        self.xlog = False

    def load_df(self):
        """Fetch the measurements via ``df_time_by_lr``."""
        self.df = df_time_by_lr(self.data_info)

    def parse_row(self, row):
        """Return '<lr>\\t<mean time>\\n' for one measurement row."""
        fields = (row.lr, row.Time_SearchPerQuery_MEAN)
        return '{}\t{}\n'.format(*fields)

In [None]:
class TimeByThetaPlotGenerator(PlotGenerator):
    """Plots query time against the threshold theta on a linear x axis.

    Unlike the other generators, theta is the x variable here, so
    ``write_result`` is overridden to keep rows for every theta instead of
    filtering on ``self.theta``.
    """

    def __init__(self, data_info, theta, ymax_offset):
        """Set the fixed axis configuration for the threshold plot."""
        super(TimeByThetaPlotGenerator, self).__init__(data_info, theta, ymax_offset)
        self.goal = 'time_by_theta'
        self.xlabel = 'Minimum threshold'
        self.xrange = '[0.6:1.0]'
        self.xlog = False

    def load_df(self):
        """Load the data and keep the first row of each (theta, setting) group."""
        self.df = df_time_by_filtering(self.data_info).groupby(['theta', 'setting']).head(1)

    def parse_row(self, row):
        """Return '<theta>\\t<mean time>\\n' for one measurement row."""
        return '{}\t{}\n'.format(row.theta, row.Time_SearchPerQuery_MEAN)

    def additional_settings(self, f):
        """Place x tics at 0.6, 0.7, ..., 1.0."""
        f.write('set xtics 0.6,0.1,1.0\n')

    def write_result(self):
        """Write one data block per setting, covering all theta values.

        Each block starts with a ``# <setting>`` comment line and ends with two
        blank lines. ``self.title_list`` receives the setting name for
        non-empty blocks and ``None`` for empty ones.
        """
        self.path_result = join('result', '{}.result'.format(self.output_name))
        df = self.df
        # Reset so that a second call does not append duplicate titles, which
        # would yield plot entries pointing at data blocks that do not exist.
        self.title_list = []
        with open(self.path_result, 'w') as f:
            for setting in self.setting_list:
                f.write("# {}\n".format(setting))
                df_target = df[df.setting==setting]
                for row in df_target.itertuples():
                    self.update_xmax(row)
                    self.update_ymax(row)
                    f.write(self.parse_row(row))
                f.write('\n\n')
                self.title_list.append(setting if df_target.size > 0 else None)

In [None]:
def plot_verify_by_alg(data_info, theta):
    """Draw a bar chart of the number of verified pairs per setting.

    Rows of ``df_from_PrefixSearchFilterPowerTest()`` are restricted to the
    given dataset name, size, rule count, query length and theta, sorted by
    setting, and written to ``result/verify_by_alg__<name>__<theta>.result``.
    A gnuplot script is written to ``script/`` and rendered to ``plot/``.

    Args:
        data_info: object with ``name``, ``size``, ``nr`` and ``qlen``
            attributes; the last three must be convertible with ``int``.
        theta: threshold value to select.
    """
    goal = 'verify_by_alg'
    df = df_from_PrefixSearchFilterPowerTest()
    # One combined mask instead of chained df[...][...] lookups, which make
    # pandas re-align masks built from a differently filtered frame.
    selected = (
        ~pd.isnull(df.setting)
        & (df.data_name == data_info.name)
        & (df.n == int(data_info.size))
        & (df.nr == int(data_info.nr))
        & (df.qlen == int(data_info.qlen))
        & (df.theta == theta)
    )
    df_target = df[selected].sort_values(['setting'])
    output_name = '{}__{}__{:.1f}'.format(goal, data_info.name, theta).replace('.', '_')
    title_list = []

    path_result = join('result', '{}.result'.format(output_name))
    # The lower bound used to be assigned per dataset name with the same value
    # in every branch, leaving it undefined for any other dataset.
    ymin = 10**2
    ymax = 10**12
    with open(path_result, 'w') as f:
        for row in df_target.itertuples():
            try:
                title = row.setting
                counts = (row.Num_Verified, row.Num_QS_Verified, row.Num_TS_Verified)
            except AttributeError:
                # Best effort: skip rows lacking an expected column.
                continue
            # The bar position is the number of rows written so far, so titles
            # and data lines stay aligned even when a row is skipped.
            position = len(title_list)
            title_list.append(title)
            f.write('{}\t{}\t{}\t{}\t{}\n'.format(position, title, *counts))

    path_script = join('script', '{}.plot'.format(output_name))
    path_plot = join('plot', '{}.pdf'.format(output_name))
    with open(path_script, 'w') as f:
        f.write('set style fill solid border\n')
        f.write('set ylabel "#Verified pairs" font ",20"\n')
        f.write('set yrange [{}:{}]\n'.format(ymin, ymax))
        f.write('set xtics font ",16"\n')
        f.write('set ytics font ",16"\n')
        f.write('set key vertical maxrows 5\n')

        f.write('set logscale y\n')
        f.write('set format y "10^{%L}"\n')
        f.write('set boxwidth 0.9\n')
        f.write('set xtics format ""\n')
        f.write('set size 0.6,0.6\n')

        f.write('set term postscript\n')
        f.write('set output"| ps2pdf - {}"\n'.format(path_plot))

        # One plot clause per data line (`every ::i::i`), each with its own
        # fill pattern; column 3 of the result file is Num_Verified.
        cmd_list = []
        for idx, title in enumerate(title_list):
            cmd = '"{}" every ::{}::{} using 1:3 with boxes fs pattern {} lw 2 title "{}"'.format(
                path_result, idx, idx, idx+2, title)
            cmd_list.append(cmd)
        f.write('plot\\\n'+',\\\n'.join(cmd_list))

    # Render, then crop the PDF in place; exit codes are not checked.
    call(['gnuplot', path_script])
    call(['sleep', '0.3'])
    call(['pdfcrop', path_plot, 'tmp_plot'])
    call(['mv', 'tmp_plot', path_plot])

In [None]:
def plot_filter_time_by_alg(data_info, theta):
    """Draw a bar chart of ``Time_Total`` per setting.

    Rows of ``df_from_PrefixSearchFilterPowerTest()`` are restricted to the
    given dataset name, size, rule count, query length and theta, sorted by
    setting, and written to
    ``result/filter_time_by_alg__<name>__<theta>.result``. A gnuplot script is
    written to ``script/`` and rendered to ``plot/``.

    Args:
        data_info: object with ``name``, ``size``, ``nr`` and ``qlen``
            attributes; the last three must be convertible with ``int``.
        theta: threshold value to select.
    """
    goal = 'filter_time_by_alg'
    df = df_from_PrefixSearchFilterPowerTest()
    # One combined mask instead of chained df[...][...] lookups, which make
    # pandas re-align masks built from a differently filtered frame.
    selected = (
        ~pd.isnull(df.setting)
        & (df.data_name == data_info.name)
        & (df.n == int(data_info.size))
        & (df.nr == int(data_info.nr))
        & (df.qlen == int(data_info.qlen))
        & (df.theta == theta)
    )
    df_target = df[selected].sort_values(['setting'])
    output_name = '{}__{}__{:.1f}'.format(goal, data_info.name, theta).replace('.', '_')
    title_list = []

    path_result = join('result', '{}.result'.format(output_name))
    ymin = 10**(-1)
    ymax = 10**(5.5)
    with open(path_result, 'w') as f:
        for row in df_target.itertuples():
            try:
                title = row.setting
                total_time = row.Time_Total
            except AttributeError:
                # Best effort: skip rows lacking an expected column.
                continue
            # The bar position is the number of rows written so far, so titles
            # and data lines stay aligned even when a row is skipped.
            position = len(title_list)
            title_list.append(title)
            f.write('{}\t{}\t{}\n'.format(position, title, total_time))

    path_script = join('script', '{}.plot'.format(output_name))
    path_plot = join('plot', '{}.pdf'.format(output_name))
    with open(path_script, 'w') as f:
        f.write('set style fill solid border\n')
        # The plotted column is Time_Total, so the axis is labelled as a time
        # rather than as a pair count.
        # NOTE(review): the unit of Time_Total is not visible here -- add it
        # to the label once confirmed.
        f.write('set ylabel "Total time" font ",20"\n')
        f.write('set yrange [{}:{}]\n'.format(ymin, ymax))
        f.write('set xtics font ",16"\n')
        f.write('set ytics font ",16"\n')
        f.write('set key vertical maxrows 5\n')

        f.write('set logscale y\n')
        f.write('set format y "10^{%L}"\n')
        f.write('set boxwidth 0.9\n')
        f.write('set xtics format ""\n')
        f.write('set size 0.6,0.6\n')

        f.write('set term postscript\n')
        f.write('set output"| ps2pdf - {}"\n'.format(path_plot))

        # One plot clause per data line (`every ::i::i`), each with its own
        # fill pattern; column 3 of the result file is Time_Total.
        cmd_list = []
        for idx, title in enumerate(title_list):
            cmd = '"{}" every ::{}::{} using 1:3 with boxes fs pattern {} lw 2 title "{}"'.format(
                path_result, idx, idx, idx+2, title)
            cmd_list.append(cmd)
        f.write('plot\\\n'+',\\\n'.join(cmd_list))

    # Render, then crop the PDF in place; exit codes are not checked.
    call(['gnuplot', path_script])
    call(['sleep', '0.3'])
    call(['pdfcrop', path_plot, 'tmp_plot'])
    call(['mv', 'tmp_plot', path_plot])

In [None]:
def plot_index_build_time(varying, theta=0.7):
    """Plot ``Time_BuildIndex`` of the 'PF' setting for WIKI, PUBMED and AMAZON.

    Writes ``result/index_build_time__<varying>.result`` (one data block per
    dataset), a gnuplot script under ``script/`` and the cropped PDF under
    ``plot/``.

    Args:
        varying: 'nt' to vary the number of strings (data from
            ``df_time_by_size``) or 'nr' to vary the number of rules (data
            from ``df_time_by_nr``).
        theta: threshold value whose rows are selected.

    Raises:
        ValueError: if `varying` is neither 'nt' nor 'nr'.
    """
    # Validate first: otherwise an unknown value only fails later with a
    # NameError, after a result file has already been opened.
    if varying not in ('nt', 'nr'):
        raise ValueError("varying must be 'nt' or 'nr', got {!r}".format(varying))

    goal = 'index_build_time'
    n0 = '1000000'
    q0 = '5'
    lr0 = '1.0'
    output_name = '{}__{}'.format(goal, varying).replace('.', '_')
    title_list = ['WIKI', 'PUBMED', 'AMAZON']

    # Backslashes in the labels are emitted literally for gnuplot; they are
    # spelled '\\' because '\#' and '\(' are not valid Python escapes.
    if varying == 'nt':
        data_info_list = [DataInfo('WIKI', '*', '107836', q0, lr0), DataInfo('PUBMED', '*', '79011', q0, lr0), DataInfo('AMAZON', '*', '107836', q0, lr0),]
        load_df = df_time_by_size
        x_column = 'n'
        xlabel = '\\#strings'
        x_range = '[10000:1000000]'
        xformat = "10^{%L}"
    else:
        data_info_list = [DataInfo('WIKI', n0, '*', q0, lr0), DataInfo('PUBMED', n0, '*', q0, lr0), DataInfo('AMAZON', n0, '*', q0, lr0),]
        load_df = df_time_by_nr
        x_column = 'nr'
        xlabel = '\\#rules'
        x_range = '[1000:110000]'
        xformat = "2{/Symbol\\264}10^{%L}"

    path_result = join('result', '{}.result'.format(output_name))
    with open(path_result, 'w') as f:
        for data_info in data_info_list:
            df = load_df(data_info)
            # Combined mask instead of chained df[...][...] lookups.
            df = df[(df.theta == theta) & (df.setting == 'PF')][[x_column, 'Time_BuildIndex']]
            # itertuples() puts the index at position 0, so positions 1 and 2
            # are the x value and the build time.
            for row in df.itertuples():
                f.write('{}\t{}\n'.format(row[1], row[2]))
            # Two blank lines end a gnuplot data block ("index").
            f.write('\n\n')

    path_script = join('script', '{}.plot'.format(output_name))
    path_plot = join('plot', '{}.pdf'.format(output_name))
    with open(path_script, 'w') as f:
        f.write('set xlabel "{}" font ",20"\n'.format(xlabel))
        f.write('set ylabel "{}" font ",20"\n'.format("Time \\(sec\\)"))
        f.write('set xtics font ",16"\n')
        f.write('set ytics font ",16"\n')
        f.write('set xrange {}\n'.format(x_range))
        f.write('set logscale x\n')
        f.write('set format x "{}"\n'.format(xformat))
        f.write('set logscale y\n')
        f.write('set format y "10^{%L}"\n')
        f.write('set size 0.6,0.6\n')

        f.write('set term postscript\n')
        f.write('set output"| ps2pdf - {}"\n'.format(path_plot))

        cmd_list = []
        for idx, title in enumerate(title_list):
            cmd = '"{}" index {} with linespoints lc "black" lw 2 ps 1.5 dt {} pt {} title "{}"'.format(
                path_result, idx, idx+1, PlotGenerator.pt_list[idx%len(PlotGenerator.pt_list)], title)
            cmd_list.append(cmd)
        f.write('plot\\\n'+',\\\n'.join(cmd_list))

    # Render, then crop the PDF in place; exit codes are not checked.
    call(['gnuplot', path_script])
    call(['sleep', '0.3'])
    call(['pdfcrop', path_plot, 'tmp_plot'])
    call(['mv', 'tmp_plot', path_plot])

In [None]:
def tex_heuristic_accuracy():
    """Print LaTeX table rows of heuristic accuracy, one row per theta.

    Each row holds theta followed by the ``Acc_QS`` and ``Acc_TS`` values for
    WIKI, PUBMED and AMAZON (in that order), taken from
    ``df_heuristic_accuracy()``. A combination with no matching data is
    printed as ``-``.
    """
    df = df_heuristic_accuracy()
    for theta in [0.6, 0.7, 0.8, 0.9, 1.0]:
        cells = ['{:.1f}'.format(theta)]
        for data_name in ['WIKI', 'PUBMED', 'AMAZON']:
            # Combined mask instead of chained df[...][...] lookups, which make
            # pandas re-align the second mask.
            row = df[(df.theta == theta) & (df.data_name == data_name)]
            for side in ['QS', 'TS']:
                values = row['Acc_{}'.format(side)].values
                # Use a placeholder instead of failing with IndexError when
                # the combination is missing.
                cells.append('{:.4f}'.format(values[0]) if len(values) else '-')
        print(' & '.join(cells) + ' \\\\\\hline')

In [None]:
def tex_table_filter_power(attr, setting_list=None):
    """Print LaTeX table rows comparing filter settings for one attribute.

    A header row is printed first (with the 'LSS-' prefix stripped from the
    setting names). Then, for each of WIKI, PUBMED and AMAZON, one row per
    theta in 0.7..1.0 is printed with one cell per setting. Missing values are
    printed as ``-``.

    Args:
        attr: 'Time_SearchPerQuery_MEAN' (printed multiplied by 1000 with
            three decimals), 'Num_Verified' or 'Len_Verified' (printed with
            thousands separators).
        setting_list: settings to include, in column order; defaults to the
            ten LSS variants listed below.
    """
    assert attr in ['Time_SearchPerQuery_MEAN', 'Num_Verified', 'Len_Verified']
    if setting_list is None:
        setting_list = ['LSS-NAIVE', 'LSS-C', 'LSS-P', 'LSS-L', 'LSS-C+P', 'LSS-C+L', 'LSS-P+L','LSS-C+P+L', 'LSS-C+P+R', 'LSS-C+P+L+R']
    datainfo_list = [
            DataInfo('WIKI', '*', '107836', '5', '1.0'),
            DataInfo('PUBMED', '*', '79011', '5', '1.0'),
            DataInfo('AMAZON', '*', '107836', '5', '1.0'),
    ]
    df = df_from_PrefixSearchFilterPowerTest(setting_list)

    header_cells = ['\\multirow{2}{*}{$\\theta$}']
    header_cells += ['\\makecell[c]{\\textit{' + name + '}}' for name in setting_list]
    print(' & '.join(header_cells).replace('LSS-', '') + ' \\\\\\hline')

    for datainfo in datainfo_list:
        # Combined mask instead of chained df[...][...] lookups. The dataset
        # size is fixed at 100000 here regardless of datainfo.size.
        selected = (
            (df.data_name == datainfo.name)
            & ~pd.isnull(df.setting)
            & (df.n == 100000)
            & (df.theta >= 0.7)
            & (df.qlen == int(datainfo.qlen))
        )
        df0 = df[selected][[attr, 'setting', 'theta']].sort_values(['theta', 'setting'])

        for theta in [0.7, 0.8, 0.9, 1.0]:
            print('{}'.format(theta), end='')
            for setting in setting_list:
                try:
                    # Masks are built from df0 itself so they align with it.
                    v = df0[(df0.theta == theta) & (df0.setting == setting)][attr].values[0]
                    if attr == 'Time_SearchPerQuery_MEAN':
                        cell = ' & {:,.3f}'.format(v*1000)
                    else:
                        cell = ' & {:,}'.format(v)
                except (IndexError, TypeError, ValueError):
                    # No matching row, or a value that cannot be formatted as
                    # a number: print a placeholder. Narrower than a bare
                    # `except`, so KeyboardInterrupt is no longer swallowed.
                    cell = ' & -'
                print(cell, end='')
            print(' \\\\\\hline')