In [None]:
import logging
import pathlib
import pickle
import re
import warnings
from collections import defaultdict

import attr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from joblib import Parallel, delayed
from plotnine import (
    aes,
    element_blank,
    element_rect,
    element_text,
    ggplot,
    stat_smooth,
    theme,
)
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors

from skrough.base import Bireduct
from skrough.bireducts.dynamically_adapted_approximate_bireduct import (
    DynamicallyAdaptedApproximateBireduct,
)

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

import config

In [None]:
# Separator passed to pd.read_csv for the tabular input file.
SEP = ";"

# Input and scratch directories, both taken from the project-local `config` module.
TMP_DIR = pathlib.Path(config.TMP_DIR)
DATA_DIR = pathlib.Path(config.DATA_DIR)

# Active tabular input; the commented lines are the alternative dataset variants.
# FILEPATH_IN = DATA_DIR / 'toolbox_tabular_data_annonymized_discretized_uniform.csv'
FILEPATH_IN = DATA_DIR.joinpath(
    "toolbox_tabular_data_annonymized_discretized_quantile.csv"
)
# FILEPATH_IN = DATA_DIR / 'toolbox_tabular_data_annonymized_discretized_kmeans.csv'
# FILEPATH_IN = DATA_DIR / 'toolbox_tabular_data_annonymized.csv'

# Process embeddings file, read with pd.read_csv's default separator.
EMBEDDINGS_FILEPATH_IN = DATA_DIR.joinpath("process_embeddings_d50.csv")

In [None]:
# Tabular data: split off the process ids, then replace every remaining column
# by its integer category codes.
raw_table = pd.read_csv(FILEPATH_IN, sep=SEP)
process_ids = raw_table.pop("process_ids")
df = raw_table.astype("category").apply(lambda column: column.cat.codes)

# Decision column: the coded target, inverted as `1 - code`.
df_dec = 1 - df.pop("target")

# Embeddings: remember the (process_ids, process_category) pairs, then index
# the remaining columns by process id and fit a nearest-neighbours model on them.
embeddings = pd.read_csv(EMBEDDINGS_FILEPATH_IN)
process_category = embeddings[["process_ids", "process_category"]]
embeddings = embeddings.drop(columns=["process_category"]).set_index("process_ids")
emb_nbrs = NearestNeighbors().fit(embeddings)

In [None]:
# Share of rows whose decision value is 1.
# Series.mean() is used instead of the builtin sum(): `df_dec` is derived from
# category codes (a small integer dtype), and summing such scalars one by one
# in Python is slow and can overflow under NumPy 2 scalar-promotion rules.
df_dec.mean()

In [None]:
# result_xgboost_test = []
# for p in (TMP_DIR/'paper_xgboost_hyper_native_2_test').glob('*.csv'):
#     m = re.search('_dataset_(\w+)'
#                   '_leave_one_process_out_k_(\d+).*_num_boost_round_(\d+).*_learning_rate_([.\d]+).*_max_depth_(\d+).*_base_score_([.\d]+).*\.csv', p.name)
#     if m is not None:
#         dataset = m.group(1)
#         k = int(m.group(2))
#         num_boost_round = int(m.group(3))
#         learning_rate = float(m.group(4))
#         max_depth = int(m.group(5))
#         base_score = float(m.group(6))
#         x = pd.read_csv(p, sep=';')
#         result_xgboost_test.append([dataset, k, num_boost_round, learning_rate, max_depth, base_score,
#                        roc_auc_score(df_dec, np.nan_to_num(x.iloc[:, 0]/x.iloc[:, 1]))])
# result_xgboost_test = pd.DataFrame(result_xgboost_test, columns=('dataset', 'k', 'num_boost_round', 'learning_rate', 'max_depth', 'base_score', 'AUC'))
# result_xgboost_test.sort_values('AUC', ascending=False)

In [None]:
# One row per xgboost prediction file: the hyper-parameters parsed from the file
# name plus the ROC AUC of (column 0 / column 1) against `df_dec`.
# Files whose name does not match the pattern (e.g. no numeric `k`) are skipped.
xgboost_name_re = re.compile(
    r"_dataset_(\w+)"
    r"_leave_one_process_out_k_(\d+).*_num_boost_round_(\d+).*_learning_rate_([.\d]+).*_max_depth_(\d+).*_base_score_([.\d]+).*\.csv"
)
xgboost_rows = []
for csv_path in (TMP_DIR / "paper_xgboost_hyper_native_2_").glob("*.csv"):
    match = xgboost_name_re.search(csv_path.name)
    if match is None:
        continue
    (
        dataset,
        k,
        num_boost_round,
        learning_rate,
        max_depth,
        base_score,
    ) = match.groups()
    frame = pd.read_csv(csv_path, sep=";")
    # NaN/inf produced by the division are replaced by finite numbers.
    scores = np.nan_to_num(frame.iloc[:, 0] / frame.iloc[:, 1])
    xgboost_rows.append(
        [
            dataset,
            int(k),
            int(num_boost_round),
            float(learning_rate),
            int(max_depth),
            float(base_score),
            roc_auc_score(df_dec, scores),
        ]
    )
result_xgboost = pd.DataFrame(
    xgboost_rows,
    columns=(
        "dataset",
        "k",
        "num_boost_round",
        "learning_rate",
        "max_depth",
        "base_score",
        "AUC",
    ),
)

In [None]:
# result_tree_test = []
# for p in (TMP_DIR/'paper_tree_2_test').glob('*.csv'):
#     m = re.search('_dataset_(\w+)'
#                   '_leave_one_process_out_k_(\d+).*_min_impurity_decrease_([.\d]+).*_max_depth_(\d+|None).*\.csv', p.name)
#     if m is not None:
#         dataset = m.group(1)
#         k = int(m.group(2))
#         min_impurity_decrease = float(m.group(3))
#         max_depth = None if m.group(4) == 'None' else int(m.group(4))
#         x = pd.read_csv(p, sep=';')
#         result_tree_test.append([dataset, k, min_impurity_decrease, max_depth,
#                        roc_auc_score(df_dec, np.nan_to_num(x.iloc[:, 0]/x.iloc[:, 1]))])
# result_tree_test = pd.DataFrame(result_tree_test, columns=('dataset', 'k', 'min_impurity_decrease', 'max_depth', 'AUC'))
# result_tree_test.sort_values('AUC', ascending=False)

In [None]:
# One row per decision-tree prediction file: parameters parsed from the file
# name plus the ROC AUC of (column 0 / column 1) against `df_dec`.
# Non-matching file names are skipped; a literal "None" depth becomes None.
tree_name_re = re.compile(
    r"_dataset_(\w+)"
    r"_leave_one_process_out_k_(\d+).*_min_impurity_decrease_([.\d]+).*_max_depth_(\d+|None).*\.csv"
)
tree_rows = []
for csv_path in (TMP_DIR / "paper_tree_2_").glob("*.csv"):
    match = tree_name_re.search(csv_path.name)
    if match is None:
        continue
    dataset, k, min_impurity_decrease, depth_text = match.groups()
    if depth_text == "None":
        max_depth = None
    else:
        max_depth = int(depth_text)
    frame = pd.read_csv(csv_path, sep=";")
    scores = np.nan_to_num(frame.iloc[:, 0] / frame.iloc[:, 1])
    tree_rows.append(
        [
            dataset,
            int(k),
            float(min_impurity_decrease),
            max_depth,
            roc_auc_score(df_dec, scores),
        ]
    )
result_tree = pd.DataFrame(
    tree_rows, columns=("dataset", "k", "min_impurity_decrease", "max_depth", "AUC")
)

In [None]:
# One row per bireduct prediction file: parameters parsed from the file name,
# "AUC" from (column 0 / column 1) and "AUC2" from (column 0 / column 2),
# both scored against `df_dec`. Non-matching file names are skipped.
bireducts_name_re = re.compile(
    r"_dataset_(\w+)"
    r"_leave_one_process_out_k_(\d+).*_n_bireducts_(\d+).*_candidate_n_attrs_(\d+).*_allowed_randomness_([.\d]+).*_max_n_attrs_(\d+|None).*_iteration_(\d+).*\.csv"
)
bireducts_rows = []
for csv_path in (TMP_DIR / "paper_bireducts_2_").glob("*.csv"):
    match = bireducts_name_re.search(csv_path.name)
    if match is None:
        continue
    (
        dataset,
        k,
        n_bireducts,
        candidate_n_attrs,
        allowed_randomness,
        max_n_attrs_text,
        iteration,
    ) = match.groups()
    # A literal "None" in the file name is kept as a missing value.
    max_n_attrs = None if max_n_attrs_text == "None" else int(max_n_attrs_text)
    frame = pd.read_csv(csv_path, sep=";")
    numerator = frame.iloc[:, 0]
    auc = roc_auc_score(df_dec, np.nan_to_num(numerator / frame.iloc[:, 1]))
    auc2 = roc_auc_score(df_dec, np.nan_to_num(numerator / frame.iloc[:, 2]))
    bireducts_rows.append(
        [
            dataset,
            int(k),
            int(n_bireducts),
            int(candidate_n_attrs),
            float(allowed_randomness),
            max_n_attrs,
            int(iteration),
            auc,
            auc2,
        ]
    )
result_bireducts = pd.DataFrame(
    bireducts_rows,
    columns=(
        "dataset",
        "k",
        "n_bireducts",
        "candidate_n_attrs",
        "allowed_randomness",
        "max_n_attrs",
        "iteration",
        "AUC",
        "AUC2",
    ),
)

In [None]:
# Results of the older bireduct runs stored directly under TMP_DIR / "paper".
# Both AUC columns are stored as `1 - roc_auc_score(...)`: "AUC" uses
# (column 0 / column 1) and "AUC2" uses (column 0 / column 2).
old_name_re = re.compile(
    r"k_(\d+).*_bireducts_(\d+).*_candidate_attrs_(\d+).*_allowed_randomness_([.\d]+).*_rep_(\d+).*\.csv"
)
old_rows = []
for csv_path in (TMP_DIR / "paper").glob("*.csv"):
    match = old_name_re.search(csv_path.name)
    if match is None:
        continue
    k, bireducts, candidate_attrs, allowed_randomness, iteration = match.groups()
    frame = pd.read_csv(csv_path, sep=";")
    numerator = frame.iloc[:, 0]
    auc = 1 - roc_auc_score(df_dec, np.nan_to_num(numerator / frame.iloc[:, 1]))
    auc2 = 1 - roc_auc_score(df_dec, np.nan_to_num(numerator / frame.iloc[:, 2]))
    old_rows.append(
        [
            int(k),
            int(bireducts),
            int(candidate_attrs),
            float(allowed_randomness),
            int(iteration),
            auc,
            auc2,
        ]
    )
result_bireducts_old = pd.DataFrame(
    old_rows,
    columns=(
        "k",
        "bireducts",
        "candidate_attrs",
        "allowed_randomness",
        "iteration",
        "AUC",
        "AUC2",
    ),
)

In [None]:
# Older bireduct runs whose file names also carry a `_max_attrs_` value, read
# from TMP_DIR / "paper_max_3_attrs". As in the other "old" table, both AUC
# columns are stored as `1 - roc_auc_score(...)`.
old_3_name_re = re.compile(
    r"k_(\d+).*_bireducts_(\d+).*_candidate_attrs_(\d+).*_allowed_randomness_([.\d]+).*_max_attrs_(\d+).*_rep_(\d+).*\.csv"
)
old_3_rows = []
for csv_path in (TMP_DIR / "paper_max_3_attrs").glob("*.csv"):
    match = old_3_name_re.search(csv_path.name)
    if match is None:
        continue
    (
        k,
        bireducts,
        candidate_attrs,
        allowed_randomness,
        max_attrs,
        iteration,
    ) = match.groups()
    frame = pd.read_csv(csv_path, sep=";")
    numerator = frame.iloc[:, 0]
    auc = 1 - roc_auc_score(df_dec, np.nan_to_num(numerator / frame.iloc[:, 1]))
    auc2 = 1 - roc_auc_score(df_dec, np.nan_to_num(numerator / frame.iloc[:, 2]))
    old_3_rows.append(
        [
            int(k),
            int(bireducts),
            int(candidate_attrs),
            float(allowed_randomness),
            int(max_attrs),
            int(iteration),
            auc,
            auc2,
        ]
    )
result_bireducts_old_3 = pd.DataFrame(
    old_3_rows,
    columns=(
        "k",
        "bireducts",
        "candidate_attrs",
        "allowed_randomness",
        "max_attrs",
        "iteration",
        "AUC",
        "AUC2",
    ),
)

In [None]:
# x2 = result_xgboost[(result_xgboost['num_boost_round'] == 1000) & (result_xgboost['learning_rate'] == 0.001) & (result_xgboost['max_depth'] == 2) & (result_xgboost['base_score'] == 0.0696)]
# x3 = result_xgboost[(result_xgboost['num_boost_round'] == 1000) & (result_xgboost['learning_rate'] == 0.001) & (result_xgboost['max_depth'] == 3) & (result_xgboost['base_score'] == 0.0696)]
# x4 = result_xgboost[(result_xgboost['num_boost_round'] == 1000) & (result_xgboost['learning_rate'] == 0.001) & (result_xgboost['max_depth'] == 4) & (result_xgboost['base_score'] == 0.0696)]
# x5 = result_xgboost[(result_xgboost['num_boost_round'] == 1000) & (result_xgboost['learning_rate'] == 0.001) & (result_xgboost['max_depth'] == 5) & (result_xgboost['base_score'] == 0.0696)]
# x10 = result_xgboost[(result_xgboost['num_boost_round'] == 1000) & (result_xgboost['learning_rate'] == 0.001) & (result_xgboost['max_depth'] == 10) & (result_xgboost['base_score'] == 0.0696)]

# --- xgboost: fixed num_boost_round / learning_rate / base_score -------------
xgb_fixed_params = (
    (result_xgboost["num_boost_round"] == 1000)
    & (result_xgboost["learning_rate"] == 0.001)
    & (result_xgboost["base_score"] == 0.0696)
)
# Depth 3 on the non-discretized dataset.
x3 = result_xgboost[
    (result_xgboost["dataset"] == "toolbox_tabular_data_annonymized")
    & xgb_fixed_params
    & (result_xgboost["max_depth"] == 3)
]
# Depth 2 on the quantile-discretized dataset.
x2_discretized = result_xgboost[
    (
        result_xgboost["dataset"]
        == "toolbox_tabular_data_annonymized_discretized_quantile"
    )
    & xgb_fixed_params
    & (result_xgboost["max_depth"] == 2)
]

# --- decision trees on the non-discretized dataset ---------------------------
# t1..t8 walk max_depth 2, 3, 4, 5; within each depth min_impurity_decrease is
# first 0.0 (odd index) and then 0.001 (even index).
tree_on_raw_dataset = result_tree["dataset"] == "toolbox_tabular_data_annonymized"
t1, t2, t3, t4, t5, t6, t7, t8 = [
    result_tree[
        tree_on_raw_dataset
        & (result_tree["min_impurity_decrease"] == decrease)
        & (result_tree["max_depth"] == depth)
    ]
    for depth in (2, 3, 4, 5)
    for decrease in (0.0, 0.001)
]

# --- bireducts ---------------------------------------------------------------
# b1: no attribute limit, iteration 0; b2: at most 3 attributes, iteration 1.
no_attr_limit = result_bireducts["max_n_attrs"].isnull()
b1 = result_bireducts[no_attr_limit & (result_bireducts["iteration"] == 0)]
b2 = result_bireducts[
    (result_bireducts["max_n_attrs"] == 3) & (result_bireducts["iteration"] == 1)
]

# Older bireduct runs, one frame per iteration 0..4.
bo1, bo2, bo3, bo4, bo5 = [
    result_bireducts_old[result_bireducts_old["iteration"] == iteration]
    for iteration in range(5)
]

# Older bireduct runs with the `max_attrs` column, one frame per iteration 0..4.
bo31, bo32, bo33, bo34, bo35 = [
    result_bireducts_old_3[result_bireducts_old_3["iteration"] == iteration]
    for iteration in range(5)
]

In [None]:
# Decision-tree results on the non-discretized dataset for k == 40, best AUC first.
result_tree.loc[
    lambda trees: (trees["dataset"] == "toolbox_tabular_data_annonymized")
    & (trees["k"] == 40)
].sort_values(by="AUC", ascending=False)

In [None]:
# AUC as a function of k for the selected configuration of each algorithm.
# The commented-out calls are alternative curves that can be toggled on.
sns.set_theme()
plt.figure(figsize=(15, 8))
plt.xlim(8, 200)

sns.lineplot(data=x3, x="k", y="AUC", markers=True, label="xgboost_depth_3")
# sns.lineplot(data=x2_discretized, x='k', y='AUC', markers=True, label='xgboost_depth_2_discretized')
# sns.lineplot(data=x2, x='k', y='AUC', markers=True, lw=1, label='xgboost')
# sns.lineplot(data=x3, x='k', y='AUC', markers=True, lw=1, label='xgboost')
# sns.lineplot(data=x5, x='k', y='AUC', markers=True, lw=1, label='xgboost_depth_5')
# sns.lineplot(data=x5, x='k', y='AUC', markers=True, lw=1, label='xgboost')

sns.lineplot(data=b1, x="k", y="AUC", markers=True, lw=1, label="bireducts")
# sns.lineplot(data=b2, x='k', y='AUC', markers=True, lw=1, label='bireducts_attrs_3')

# sns.lineplot(data=bo1, x='k', y='AUC', markers=True, lw=1, color='red', label='bireducts_old')
# sns.lineplot(data=bo2, x='k', y='AUC', markers=True, lw=1, color='red')
# sns.lineplot(data=bo3, x='k', y='AUC', markers=True, lw=1, color='red')
# sns.lineplot(data=bo4, x='k', y='AUC', markers=True, lw=1, color='red')
# sns.lineplot(data=bo5, x='k', y='AUC', markers=True, lw=1, color='red')

# sns.lineplot(data=result_bireducts_old, x='k', y='AUC', markers=True, lw=1, label='bireducts_old')

# sns.lineplot(data=bo31, x='k', y='AUC', markers=True, lw=1, color='plum', label='bireducts_old_3')
# sns.lineplot(data=bo32, x='k', y='AUC', markers=True, lw=1, color='plum')
# sns.lineplot(data=bo33, x='k', y='AUC', markers=True, lw=1, color='plum')
# sns.lineplot(data=bo34, x='k', y='AUC', markers=True, lw=1, color='plum')
# sns.lineplot(data=bo35, x='k', y='AUC', markers=True, lw=1, color='plum')

# sns.lineplot(data=result_bireducts_old_3, x='k', y='AUC', markers=True, lw=1, label='bireducts_old_3')

# sns.lineplot(data=t1, x='k', y='AUC', markers=True, lw=1, label='tree')
# sns.lineplot(data=t2, x='k', y='AUC', markers=True, lw=1)
# sns.lineplot(data=t3, x='k', y='AUC', markers=True, lw=1, label='3')
# sns.lineplot(data=t4, x='k', y='AUC', markers=True, lw=1)
# sns.lineplot(data=t5, x='k', y='AUC', markers=True, lw=1, label='5')
# sns.lineplot(data=t6, x='k', y='AUC', markers=True, lw=1)
# `t7` is the tree subset with max_depth == 5 and min_impurity_decrease == 0.0;
# the legend entry names that configuration instead of the bare variable index "7".
sns.lineplot(
    data=t7, x="k", y="AUC", markers=True, lw=1, label="decision_tree_depth_5"
)
# sns.lineplot(data=t8, x='k', y='AUC', markers=True, lw=1)

In [None]:
# Display the selected xgboost subset (max_depth == 3, non-discretized dataset).
x3

In [None]:
# (ggplot()
#  + stat_smooth(data=x3, mapping=aes('k', 'AUC'), method='loess', color='royalblue', size=2, span=0.3)
#  + stat_smooth(data=b1, mapping=aes('k', 'AUC'), method='loess', color='limegreen', size=2, span=0.3)
#  + stat_smooth(data=t7, mapping=aes('k', 'AUC'), method='loess', color='tomato', size=2, span=0.3)
# )

In [None]:
# Only the last bare expression of a cell is rendered as its output, so this
# cell shows `t7`; the `x3` and `b1` lines have no visible effect.
x3
b1
t7

In [None]:
# Long-format table of (k, AUC, algorithm) for the three selected configurations.
auc_columns = ["k", "AUC"]
valx = x3[auc_columns].assign(algorithm="xgboost")
valb = b1[auc_columns].assign(algorithm="bireducts")
valt = t7[auc_columns].assign(algorithm="decision tree")

# Rows are stacked in the order xgboost, bireducts, decision tree.
vals = pd.concat([valx, valb, valt])

In [None]:
# Persist the combined AUC table to the current working directory
# (default to_csv settings, so the index is written as the first column).
vals.to_csv("auc_vals.csv")

In [None]:
# Smoothed AUC-vs-k curve per algorithm.
# Colour is mapped to the `algorithm` column rather than to a hand-built list
# of 3 x 20 colour names: the list only lined up when every algorithm
# contributed exactly 20 rows, and it put the placeholder names
# ("red"/"blue"/"green") in the legend instead of the algorithm names.
plot = (
    ggplot(vals, aes(x="k"))
    + stat_smooth(
        aes(
            y="AUC",
            color="algorithm",
            group="algorithm",
        ),
        span=0.4,
        size=2,
        level=0.95,
    )
    + theme(
        legend_direction="horizontal",
        legend_position="bottom",
        legend_key=element_rect(alpha=0),
        legend_title=element_blank(),
        legend_entry_spacing_x=50,
        legend_box_margin=-3,
        legend_box_spacing=0.45,
        #         rect=element_rect(color='black', size=3, fill='#EEBB0050'),
    )
)
plot

In [None]:
# Write the smoothed AUC plot to a PDF in the current working directory.
plot.save("kresults_span_04.pdf")

# Predictions

In [None]:
def get_predictions(file_path, norm_column=1):
    """Load a ';'-separated prediction file and return normalised scores.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read.
    norm_column : int, default 1
        Positional index of the column that column 0 is divided by.

    Returns
    -------
    pandas.Series
        Column 0 divided element-wise by column ``norm_column``.
    """
    df = pd.read_csv(file_path, sep=";")
    # Honour `norm_column`; it used to be ignored in favour of a fixed column 1.
    return df.iloc[:, 0] / df.iloc[:, norm_column]


def get_predictions_metrics(df_dec, preds, round_decimals=3):
    """Summarise predictions at the threshold with the highest F1 score.

    Precision and recall come from ``precision_recall_curve``; F1 is computed
    per curve point (0 where precision + recall is 0) and the point with the
    largest F1 is reported, together with the ROC AUC of ``preds``.

    Returns a dict with keys "precision", "recall", "f1_score" and "AUC",
    each rounded to ``round_decimals`` decimals.
    """
    precision, recall, _thresholds = precision_recall_curve(df_dec, preds)
    denominator = recall + precision
    f1_score = np.divide(
        2 * recall * precision,
        denominator,
        out=np.zeros(len(precision)),
        where=(denominator > 0),
    )
    best = np.argmax(f1_score)
    unrounded = {
        "precision": precision[best],
        "recall": recall[best],
        "f1_score": f1_score[best],
        "AUC": roc_auc_score(df_dec, preds),
    }
    return {
        metric: np.round(value, decimals=round_decimals)
        for metric, value in unrounded.items()
    }


def get_predictions_table(df_dec, preds_all, preds_same, preds_40, round_decimal=3):
    """Build a metrics table with one column per training-set variant.

    Parameters
    ----------
    df_dec : array-like
        True decision values passed on to ``get_predictions_metrics``.
    preds_all, preds_same, preds_40 : array-like
        Predictions reported under the columns "all", "same" and "top 40".
    round_decimal : int, default 3
        Number of decimals every metric is rounded to.

    Returns
    -------
    pandas.DataFrame
        Rows are the metrics returned by ``get_predictions_metrics``;
        columns are "all", "same" and "top 40".
    """
    variants = {"all": preds_all, "same": preds_same, "top 40": preds_40}
    table = pd.DataFrame.from_dict(
        {
            # Forward the rounding precision; it used to be silently ignored.
            label: get_predictions_metrics(df_dec, preds, round_decimals=round_decimal)
            for label, preds in variants.items()
        },
        orient="index",
    )
    # Transpose so that the metrics become rows and the variants columns.
    return table.T

In [None]:
# pred_xgboost_discretized_all = get_predictions(
#     './tmp/paper_xgboost_hyper_native_2_/xgboost_native_dataset_toolbox_tabular_data_annonymized_discretized_quantile'
#     '_leave_one_process_out_k_None_num_boost_round_1000_learning_rate_0.001_max_depth_2_base_score_0.0696.csv')
# pred_xgboost_discretized_same = get_predictions(
#     './tmp/paper_xgboost_hyper_native_2_/xgboost_native_dataset_toolbox_tabular_data_annonymized_discretized_quantile'
#     '_leave_one_process_out_same_category_num_boost_round_1000_learning_rate_0.001_max_depth_2_base_score_0.0696.csv')
# pred_xgboost_discretized_40 = get_predictions(
#     './tmp/paper_xgboost_hyper_native_2_/xgboost_native_dataset_toolbox_tabular_data_annonymized_discretized_quantile'
#     '_leave_one_process_out_k_40_num_boost_round_1000_learning_rate_0.001_max_depth_2_base_score_0.0696.csv')


# xgboost (max_depth 3) predictions on the non-discretized dataset for the
# "k_None", "same_category" and "k_40" leave-one-process-out variants.
# The directory is resolved through TMP_DIR, like the AUC-collection cell that
# globs the same folder, instead of a "./tmp" path relative to the kernel's
# working directory.
xgboost_pred_dir = TMP_DIR / "paper_xgboost_hyper_native_2_"
pred_xgboost_all = get_predictions(
    xgboost_pred_dir
    / (
        "xgboost_native_dataset_toolbox_tabular_data_annonymized"
        "_leave_one_process_out_k_None_num_boost_round_1000_learning_rate_0.001_max_depth_3_base_score_0.0696.csv"
    )
)
pred_xgboost_same = get_predictions(
    xgboost_pred_dir
    / (
        "xgboost_native_dataset_toolbox_tabular_data_annonymized"
        "_leave_one_process_out_same_category_num_boost_round_1000_learning_rate_0.001_max_depth_3_base_score_0.0696.csv"
    )
)
pred_xgboost_40 = get_predictions(
    xgboost_pred_dir
    / (
        "xgboost_native_dataset_toolbox_tabular_data_annonymized"
        "_leave_one_process_out_k_40_num_boost_round_1000_learning_rate_0.001_max_depth_3_base_score_0.0696.csv"
    )
)


# pred_tree_discretized_all = get_predictions(
#     './tmp/paper_tree_2_/decision_tree_dataset_toolbox_tabular_data_annonymized_discretized_quantile'
#     '_leave_one_process_out_k_None_min_impurity_decrease_0.0_max_depth_3.csv')
# pred_tree_discretized_same = get_predictions(
#     './tmp/paper_tree_2_/decision_tree_dataset_toolbox_tabular_data_annonymized_discretized_quantile'
#     '_leave_one_process_out_same_category_min_impurity_decrease_0.0_max_depth_3.csv')
# pred_tree_discretized_40 = get_predictions(
#     './tmp/paper_tree_2_/decision_tree_dataset_toolbox_tabular_data_annonymized_discretized_quantile'
#     '_leave_one_process_out_k_40_min_impurity_decrease_0.0_max_depth_3.csv')


# Decision-tree (max_depth 5, min_impurity_decrease 0.0) predictions on the
# non-discretized dataset for the "k_None", "same_category" and "k_40" variants.
# The directory is resolved through TMP_DIR, like the AUC-collection cell that
# globs the same folder, instead of a "./tmp" path relative to the kernel's
# working directory.
tree_pred_dir = TMP_DIR / "paper_tree_2_"
pred_tree_all = get_predictions(
    tree_pred_dir
    / (
        "decision_tree_dataset_toolbox_tabular_data_annonymized"
        "_leave_one_process_out_k_None_min_impurity_decrease_0.0_max_depth_5.csv"
    )
)
pred_tree_same = get_predictions(
    tree_pred_dir
    / (
        "decision_tree_dataset_toolbox_tabular_data_annonymized"
        "_leave_one_process_out_same_category_min_impurity_decrease_0.0_max_depth_5.csv"
    )
)
pred_tree_40 = get_predictions(
    tree_pred_dir
    / (
        "decision_tree_dataset_toolbox_tabular_data_annonymized"
        "_leave_one_process_out_k_40_min_impurity_decrease_0.0_max_depth_5.csv"
    )
)


# pred_tree_all_4 = get_predictions(
#     './tmp/paper_tree_2_/decision_tree_dataset_toolbox_tabular_data_annonymized'
#     '_leave_one_process_out_k_None_min_impurity_decrease_0.0_max_depth_4.csv')
# pred_tree_same_4 = get_predictions(
#     './tmp/paper_tree_2_/decision_tree_dataset_toolbox_tabular_data_annonymized'
#     '_leave_one_process_out_same_category_min_impurity_decrease_0.0_max_depth_4.csv')
# pred_tree_40_4 = get_predictions(
#     './tmp/paper_tree_2_/decision_tree_dataset_toolbox_tabular_data_annonymized'
#     '_leave_one_process_out_k_40_min_impurity_decrease_0.0_max_depth_4.csv')


# pred_tree_all_3 = get_predictions(
#     './tmp/paper_tree_2_/decision_tree_dataset_toolbox_tabular_data_annonymized'
#     '_leave_one_process_out_k_None_min_impurity_decrease_0.0_max_depth_3.csv')
# pred_tree_same_3 = get_predictions(
#     './tmp/paper_tree_2_/decision_tree_dataset_toolbox_tabular_data_annonymized'
#     '_leave_one_process_out_same_category_min_impurity_decrease_0.0_max_depth_3.csv')
# pred_tree_40_3 = get_predictions(
#     './tmp/paper_tree_2_/decision_tree_dataset_toolbox_tabular_data_annonymized'
#     '_leave_one_process_out_k_40_min_impurity_decrease_0.0_max_depth_3.csv')


# Bireduct predictions (1000 bireducts, 100 candidate attributes, allowed
# randomness 0.05, no attribute limit, iteration 0) on the quantile-discretized
# dataset for the "k_None", "same_category" and "k_40" variants.
# The directory is resolved through TMP_DIR, like the AUC-collection cell that
# globs the same folder, instead of a "./tmp" path relative to the kernel's
# working directory.
bireducts_pred_dir = TMP_DIR / "paper_bireducts_2_"
pred_bireducts_all = get_predictions(
    bireducts_pred_dir
    / (
        "bireducts_dataset_toolbox_tabular_data_annonymized_discretized_quantile"
        "_leave_one_process_out_k_None_n_bireducts_1000_candidate_n_attrs_100_allowed_randomness_0.05_max_n_attrs_None_iteration_0.csv"
    )
)
pred_bireducts_same = get_predictions(
    bireducts_pred_dir
    / (
        "bireducts_dataset_toolbox_tabular_data_annonymized_discretized_quantile"
        "_leave_one_process_out_same_category_n_bireducts_1000_candidate_n_attrs_100_allowed_randomness_0.05_max_n_attrs_None_iteration_0.csv"
    )
)
pred_bireducts_40 = get_predictions(
    bireducts_pred_dir
    / (
        "bireducts_dataset_toolbox_tabular_data_annonymized_discretized_quantile"
        "_leave_one_process_out_k_40_n_bireducts_1000_candidate_n_attrs_100_allowed_randomness_0.05_max_n_attrs_None_iteration_0.csv"
    )
)

In [None]:
# Best-F1 metrics for xgboost, "k_None" variant.
get_predictions_metrics(df_dec=df_dec, preds=pred_xgboost_all)

In [None]:
# Best-F1 metrics for xgboost, "same_category" variant.
get_predictions_metrics(df_dec=df_dec, preds=pred_xgboost_same)

In [None]:
# Best-F1 metrics for xgboost, "k_40" variant.
get_predictions_metrics(df_dec=df_dec, preds=pred_xgboost_40)

In [None]:
# LaTeX source of the xgboost metrics table (columns: all / same / top 40).
xgboost_metrics_table = get_predictions_table(
    df_dec, pred_xgboost_all, pred_xgboost_same, pred_xgboost_40
)
print(xgboost_metrics_table.to_latex())

In [None]:
# Best-F1 metrics for the decision tree, "k_None" variant.
get_predictions_metrics(df_dec=df_dec, preds=pred_tree_all)

In [None]:
# Best-F1 metrics for the decision tree, "same_category" variant.
get_predictions_metrics(df_dec=df_dec, preds=pred_tree_same)

In [None]:
# Best-F1 metrics for the decision tree, "k_40" variant.
get_predictions_metrics(df_dec=df_dec, preds=pred_tree_40)

In [None]:
# LaTeX source of the decision-tree metrics table (columns: all / same / top 40).
tree_metrics_table = get_predictions_table(
    df_dec, pred_tree_all, pred_tree_same, pred_tree_40
)
print(tree_metrics_table.to_latex())

In [None]:
# Best-F1 metrics for bireducts, "k_None" variant.
get_predictions_metrics(df_dec=df_dec, preds=pred_bireducts_all)

In [None]:
# Best-F1 metrics for bireducts, "same_category" variant.
get_predictions_metrics(df_dec=df_dec, preds=pred_bireducts_same)

In [None]:
# Best-F1 metrics for bireducts, "k_40" variant.
get_predictions_metrics(df_dec=df_dec, preds=pred_bireducts_40)

In [None]:
# LaTeX source of the bireducts metrics table (columns: all / same / top 40).
bireducts_metrics_table = get_predictions_table(
    df_dec, pred_bireducts_all, pred_bireducts_same, pred_bireducts_40
)
print(bireducts_metrics_table.to_latex())