# Read dataset

In [15]:
from pers import PersistentResults
import pandas as pd
from collections import Counter
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import random
import warnings
import os
import sys
import pandas as pd
from tabulate import tabulate

# Silence pandas PerformanceWarning messages so they do not clutter cell outputs.
warnings.simplefilter('ignore', category=pd.errors.PerformanceWarning)

# Enable tqdm's pandas integration (progress-bar variants of apply/map).
tqdm.pandas()


# Config
# Names of the algorithms considered in this notebook; list order defines the ids below.
algorithms = [
    'bf', 'kr', 'qs', 'nsn', 'smith', 'rcolussi', 'askip', 'br', 'fs', 'ffs',
    'bfs', 'ts', 'ssabs', 'hash3', 'hash5', 'hash8', 'aut', 'rf', 'bom', 'bom2',
    'ildm1', 'ildm2', 'ebom', 'fbom', 'sebom', 'sfbom', 'so', 'sa', 'bndm',
    'bndml', 'sbndm', 'sbndm2', 'sbndm-bmh', 'faoso2', 'aoso2', 'aoso4',
    'fsbndm', 'bndmq2', 'bndmq4', 'bndmq6', 'sbndmq2', 'sbndmq4', 'sbndmq6',
    'sbndmq8', 'ufndmq4', 'ufndmq6', 'ufndmq8',
]

# Algorithm name -> its position in `algorithms`.
alg_name_to_id = {name: position for position, name in enumerate(algorithms)}


# Subset of algorithm names used as the candidate columns when the fastest
# algorithm per row is determined.
alg_correct = [
    'fsbndm', 'ebom', 'rf', 'sebom', 'br', 'askip', 'sbndm', 'so',
    'bndm', 'bndml', 'bom', 'ffs', 'smith', 'fs', 'sfbom', 'fbom',
    'ts', 'nsn', 'bfs',
]

# The dataset is stored as one pickled sequence of 16 objects.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
_dataset_fields = pd.read_pickle('dataset-full.pickle')

# NOTE: `alg_name_to_id` and `algorithms` are rebound here, replacing the values
# built in the config section with the ones stored alongside the dataset.
(
    x_train,              # 00. Pattern (vector of length 256) e.g. array([10, 10, 52, 48, ...
    y_train,              # 01. Class, e.g. 'ebom'
    x_test,               # 02. As x_train but for tests
    y_test,               # 03. As y_train but for tests
    X_train_time,         # 04. DataFrame with all data for training
    X_test_time,          # 05. DataFrame with all data for testing
    Y_train_time,         # 06. Possibly the same as X_train_time
    Y_test_time,          # 07. Possibly the same as X_test_time
    y_train_onehot,       # 08. One hot for train
    y_test_onehot,        # 09. One hot for test
    y_test_time,          # 10. Times for each of the tested algorithms (at the time of writing 27)
    y_train_time,         # 11. Times for each of the tested algorithms (at the time of writing 27)
    X_columns,            # 12. Names of columns with subsequent pattern symbols e.g. pattern_0 (first character), etc.
    alg_name_to_id,       # 13. Mapping of the algorithm name to its ID
    algorithms,           # 14. Names of all algorithms that have been tested
    algorithm_to_onehot,  # 15. Presumably algorithm name -> one-hot vector - TODO confirm
) = _dataset_fields

# The analysis below runs on the test-set frame (same object, not a copy).
df = Y_test_time

# Ratio

In [16]:
# Per row: the smallest time among the alg_correct columns and the column that
# achieves it. `df` is the same object as Y_test_time, so the two new columns
# are added to that frame as well.
candidate_times = df[alg_correct]
df['best_time'] = candidate_times.min(axis=1)
df['best_alg'] = candidate_times.idxmin(axis=1)

# For every algorithm name occurring in the 'algorithm' column: the average,
# over all rows, of (time of that algorithm) / (best time in the row).
ratios = [
    {
        'algorithm': name,
        'ratio': sum(df[name] / df['best_time']) / len(df),
    }
    for name in df['algorithm'].unique()
]

# Lowest (best) ratio first.
res = pd.DataFrame(ratios).sort_values(['ratio']).reset_index(drop=True)
res

Unnamed: 0,algorithm,ratio
0,ebom,1.138827
1,sebom,1.158793
2,fsbndm,1.384196
3,sbndm,1.786763
4,rf,2.476593
5,fbom,2.682208
6,sfbom,2.748523
7,bndml,2.846885
8,br,2.903083
9,ffs,2.938919


In [17]:
# jom - does not find all occurrences
# sabp - finds more occurrences than exist
# ssef - sometimes reports -1 occurrences
# tsa - crashes for long patterns

# Read results

In [18]:
results = PersistentResults(
    'results.pickle',                   # file the results are saved to
    interval=1,                         # every x patterns
    tmpfilename='~results.pickle.tmp',  # temporary file
    result_prefix='',
    skip_list=['model', 'x_test_ex', 'y_test_time', 'y_test', 'algorithms', 'data']
)

# One row per stored result; shorten the input and model labels for display.
models_results = pd.DataFrame(results.data)
models_results['inputs_str'] = models_results['inputs_str'].str.replace('c_data_lvl_', '')
models_results['model_name'] = models_results['model_name'].str.replace('get_', '').str.upper()

# Keep only runs where both the model range and the evaluation range are 6..256.
full_range_rows = (
    (models_results['model_min_m'] == 6)
    & (models_results['model_max_m'] == 256)
    & (models_results['min_m'] == 6)
    & (models_results['max_m'] == 256)
)

# Mean ratio per (input combination, model): one row per inputs_str, one column per model.
res2 = (
    models_results[full_range_rows]
    .pivot_table(
        index='inputs_str',
        values='ratio',
        columns='model_name',
        aggfunc='mean',
    )
    .reset_index()
)

# Results for different input

In [19]:
# Start from an independent copy of res2 so the edits below leave res2 untouched.
# No renaming is needed here: when models_results was prepared, 'get_' was already
# stripped from model_name (and the names upper-cased) and 'c_data_lvl_' was already
# stripped from inputs_str, so the pivot columns are ADA, BAGG, ET, HGB, RF, XGB.
resss = res2.copy()
# Sort rank of each input-feature name inside an inputs_str value (lower = earlier).
order_d = {
    'x_data': 0,
    'c_data_sigm': 2,
    'c_data_mm': 4,
    'c_data_hist': 6,
    'h0': 8,
    'h0reg': 10,
    'h1': 12,
    'h1reg': 14,
    'h2': 16,
    'h2reg': 18,
}


def inp_f(x):
    """Return the comma-separated feature list ``x`` re-ordered by ``order_d``.

    Parameters
    ----------
    x : str
        Feature names joined with ',' (no surrounding spaces).

    Returns
    -------
    str
        The same names joined with ',', sorted by their rank in ``order_d``.

    Raises
    ------
    KeyError
        If a name is not a key of ``order_d``.
    """
    names = x.split(',')
    names.sort(key=order_d.__getitem__)
    return ','.join(names)
    

    
# Put the feature names of every row into canonical order.
resss['inputs_str'] = resss['inputs_str'].apply(inp_f)

# Row-ordering weights: a row's sort key is the sum of the weights of its features,
# and rows are listed from the largest sum to the smallest.
# NOTE(review): sums of the small weights can coincide (e.g. h0 = 6 and h1 + h2 = 6),
# so the relative order of such rows is not determined by the key alone.
order_row = {
    'x_data': 1_000_000_000,
    'c_data_sigm': 100_000_000,
    'c_data_mm': 10_000_000,
    'c_data_hist': 1_000_000,
    'h0': 6,
    'h0reg': 5,
    'h1': 4,
    'h1reg': 3,
    'h2': 2,
    'h2reg': 1,
}

resss = resss.sort_values(
    by='inputs_str',
    key=lambda col: col.map(lambda s: sum(order_row[name] for name in s.split(','))),
    ascending=False,
)

# LaTeX label shown for each feature name.
# Each comma-separated token is looked up individually instead of chaining
# Series.str.replace calls: this does not depend on the pandas-version-specific
# `regex` default (regex mode would reject the backslashes in the replacement text),
# does not rely on replacing 'h0reg' before 'h0', and also labels an h* feature
# that happens to be the first token of a row.
latex_label = {
    'x_data': r"$P'$",
    'c_data_sigm': r'$\sigma$',
    'c_data_mm': r'$m$',
    'c_data_hist': r'$\mathcal{H}$',
    'h0': r'$\mathcal{C_{\textit{h0}}}$',
    'h0reg': r'$\mathcal{C_{\textit{h0R}}}$',
    'h1': r'$\mathcal{C_{\textit{h1}}}$',
    'h1reg': r'$\mathcal{C_{\textit{h1R}}}$',
    'h2': r'$\mathcal{C_{\textit{h2}}}$',
    'h2reg': r'$\mathcal{C_{\textit{h2R}}}$',
}
resss['inputs_str'] = resss['inputs_str'].map(
    lambda s: ','.join(latex_label[name] for name in s.split(','))
)

resss.style.format(precision=5).hide(axis="index")

inputs_str,ADA,BAGG,ET,HGB,RF,XGB
"$P'$,$\sigma$,$m$,$\mathcal{H}$,$\mathcal{C_{\textit{h0}}}$,$\mathcal{C_{\textit{h0R}}}$,$\mathcal{C_{\textit{h1}}}$,$\mathcal{C_{\textit{h1R}}}$,$\mathcal{C_{\textit{h2}}}$,$\mathcal{C_{\textit{h2R}}}$",1.03807,1.02763,1.02632,1.0268,1.02793,1.02669
"$P'$,$\sigma$,$m$,$\mathcal{H}$,$\mathcal{C_{\textit{h0}}}$",1.03882,1.02943,1.02724,1.02703,1.02973,1.02597
"$P'$,$\sigma$,$m$,$\mathcal{H}$,$\mathcal{C_{\textit{h0R}}}$",1.0422,1.02809,1.02604,1.02665,1.02823,1.02533
"$P'$,$\sigma$,$m$,$\mathcal{H}$,$\mathcal{C_{\textit{h1}}}$",1.03894,1.02913,1.02651,1.02734,1.029,1.0266
"$P'$,$\sigma$,$m$,$\mathcal{H}$,$\mathcal{C_{\textit{h1R}}}$",1.03958,1.02902,1.02762,1.02709,1.02834,1.0266
"$P'$,$\sigma$,$m$,$\mathcal{H}$,$\mathcal{C_{\textit{h2}}}$",1.03911,1.02942,1.02715,1.02687,1.02901,1.02544
"$P'$,$\sigma$,$m$,$\mathcal{H}$,$\mathcal{C_{\textit{h2R}}}$",1.04005,1.02846,1.02693,1.02736,1.02888,1.02684
"$P'$,$\sigma$,$m$,$\mathcal{H}$",1.03996,1.02882,1.02679,1.02747,1.02923,1.02537
"$P'$,$\sigma$,$m$",1.09469,1.03429,1.05088,1.03235,1.03656,1.02842
"$P'$,$\mathcal{C_{\textit{h0}}}$,$\mathcal{C_{\textit{h0R}}}$,$\mathcal{C_{\textit{h1}}}$,$\mathcal{C_{\textit{h1R}}}$,$\mathcal{C_{\textit{h2}}}$,$\mathcal{C_{\textit{h2R}}}$",1.09953,1.03639,1.05516,1.03305,1.03733,1.02872


# Full results as a ratio

In [20]:
# Best (lowest) mean ratio reached by each model over all input combinations.
# The text column 'inputs_str' is dropped before taking the minimum so that the
# reduction runs over the numeric model columns only and 'ratio' stays numeric.
res3 = (
    res2
    .drop(columns='inputs_str')
    .min()
    .rename('ratio')
    .rename_axis('algorithm')
    .reset_index()
)

# Classic algorithms (res) and models (res3) in one ranking, best ratio first.
(
    pd.concat((res, res3), ignore_index=True)
    .sort_values(by='ratio')
    .reset_index(drop=True)[['algorithm', 'ratio']]
    .style.format(precision=3).hide(axis="index")
)

algorithm,ratio
XGB,1.025
ET,1.026
HGB,1.027
BAGG,1.028
RF,1.028
ADA,1.038
ebom,1.139
sebom,1.159
fsbndm,1.384
sbndm,1.787


# Prediction time

In [21]:
def get_median(x):
    """Return the median of the values in ``x``.

    For an odd number of values this is the middle element of the sorted values.
    For an even number of values it is the arithmetic mean of the two middle
    elements (previously the upper of the two was returned, which is not the
    median).

    Parameters
    ----------
    x : iterable of numbers
        Values to summarise; the input itself is not modified.

    Returns
    -------
    number
        The median value.

    Raises
    ------
    IndexError
        If ``x`` contains no values.
    """
    ordered = sorted(x)
    count = len(ordered)
    if count == 0:
        raise IndexError('get_median() requires at least one value')
    middle = count // 2
    if count % 2 == 1:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2

# Collapse each row's list of single-prediction timings into one representative value.
models_results['single_pred_time_median'] = models_results['single_pred_time'].apply(get_median)

# Rows for the 6..256 model/evaluation range, one fixed input combination,
# and the 'all' dataset.
selected_rows = (
    (models_results['model_min_m'] == 6)
    & (models_results['model_max_m'] == 256)
    & (models_results['min_m'] == 6)
    & (models_results['max_m'] == 256)
    & (models_results['inputs_str'] == 'c_data_hist,h1,c_data_mm,c_data_sigm,x_data')
    & (models_results['dataset'] == 'all')
)

# Mean of the per-row medians: one row per dataset, one column per model.
pred_res = (
    models_results[selected_rows]
    .sort_values(by='model_name')
    .pivot_table(
        index='dataset',
        values='single_pred_time_median',
        columns='model_name',
        aggfunc='mean',
    )
    .reset_index()
)

# Keep only the last path component of the dataset name.
pred_res['dataset'] = pred_res['dataset'].apply(lambda name: name.split('/')[-1])

# Models as rows, values scaled by 1000 (presumably seconds -> milliseconds; confirm).
(pred_res[['ADA', 'BAGG', 'ET', 'HGB', 'RF', 'XGB']].T * 1000).round(3)

Unnamed: 0_level_0,0
model_name,Unnamed: 1_level_1
ADA,29.046
BAGG,228.892
ET,12.872
HGB,21.203
RF,13.516
XGB,0.666
