In [21]:
import logging
import pathlib
import pickle
import warnings

import attr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
import xgboost as xgb
from joblib import Parallel, delayed
from sklearn import tree
from sklearn.metrics import auc, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors

from skrough.base import Bireduct
from skrough.bireducts.dynamically_adapted_approximate_bireduct import (
    DynamicallyAdaptedApproximateBireduct,
)

# NumPy 2.0 removed the top-level ``np.VisibleDeprecationWarning`` alias; the
# class is available from ``numpy.exceptions`` (NumPy >= 1.25). Resolve it from
# whichever location exists so this cell runs on both old and new NumPy.
try:
    from numpy.exceptions import VisibleDeprecationWarning
except ImportError:  # older NumPy without the ``numpy.exceptions`` module
    VisibleDeprecationWarning = np.VisibleDeprecationWarning

warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)

import config

In [28]:
# Base directories are taken from the project-local `config` module.
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)

# Field separators of the raw and the discretized input tables.
SEP, DISCRETIZED_SEP = ",", ";"
# Number of parallel joblib workers used when drawing bireducts.
N_JOBS = 7

# Raw (non-discretized) input table.
FILEPATH_IN = DATA_DIR.joinpath("toolbox_tabular_data_annonymized.csv")

# Discretized input table. Alternative discretizations that can be swapped in:
#   'toolbox_tabular_data_annonymized_discretized_uniform.csv'
#   'toolbox_tabular_data_annonymized_discretized_kmeans.csv'
DISCRETIZED_FILEPATH_IN = DATA_DIR.joinpath(
    "toolbox_tabular_data_annonymized_discretized_quantile.csv"
)

# Per-process embeddings (also carries the `process_category` column).
EMBEDDINGS_FILEPATH_IN = DATA_DIR.joinpath("process_embeddings_d50.csv")

# Output location (relative to TMP_DIR) and file-name templates for the XGBoost
# runs. The `{...}` placeholders are filled later with `str.format`.
XGBOOST_EXPERIMENT_DIR = "paper_xgboost_hyper_native_2_"

# Fragments shared by both XGBoost templates.
_XGBOOST_STEM = f"{XGBOOST_EXPERIMENT_DIR}/xgboost_native_dataset_{FILEPATH_IN.stem}"
_XGBOOST_HYPERPARAMS = (
    "_num_boost_round_{num_boost_round}"
    "_learning_rate_{learning_rate}"
    "_max_depth_{max_depth}"
    "_base_score_{base_score}"
)

# Leave-one-process-out with the training set refined to k nearest processes.
XGBOOST_FILENAME_OUT_TEMPLATE = (
    _XGBOOST_STEM + "_leave_one_process_out_k_{k}" + _XGBOOST_HYPERPARAMS + ".csv"
)

# Leave-one-process-out with training restricted to the same process category.
XGBOOST_FILENAME_OUT_TEMPLATE_SAME_CATEGORY = (
    _XGBOOST_STEM
    + "_leave_one_process_out_same_category"
    + _XGBOOST_HYPERPARAMS
    + ".csv"
)


# Output location (relative to TMP_DIR) and file-name templates for the
# decision-tree runs. The `{...}` placeholders are filled with `str.format`.
TREE_EXPERIMENT_DIR = "paper_tree_2_"

# Fragments shared by both decision-tree templates.
_TREE_STEM = f"{TREE_EXPERIMENT_DIR}/decision_tree_dataset_{FILEPATH_IN.stem}"
_TREE_HYPERPARAMS = (
    "_min_impurity_decrease_{min_impurity_decrease}" "_max_depth_{max_depth}"
)

# Leave-one-process-out with the training set refined to k nearest processes.
TREE_FILENAME_OUT_TEMPLATE = (
    _TREE_STEM + "_leave_one_process_out_k_{k}" + _TREE_HYPERPARAMS + ".csv"
)

# Leave-one-process-out with training restricted to the same process category.
TREE_FILENAME_OUT_TEMPLATE_SAME_CATEGORY = (
    _TREE_STEM + "_leave_one_process_out_same_category" + _TREE_HYPERPARAMS + ".csv"
)


# Output location (relative to TMP_DIR) and file-name templates for the
# bireduct runs. These names embed the stem of the *discretized* input file.
BIREDUCTS_EXPERIMENT_DIR = "paper_bireducts_2_"

# Fragments shared by both bireduct templates.
_BIREDUCTS_STEM = (
    f"{BIREDUCTS_EXPERIMENT_DIR}/bireducts_dataset_{DISCRETIZED_FILEPATH_IN.stem}"
)
_BIREDUCTS_HYPERPARAMS = (
    "_n_bireducts_{n_bireducts}"
    "_candidate_n_attrs_{candidate_n_attrs}"
    "_allowed_randomness_{allowed_randomness}"
    "_max_n_attrs_{max_n_attrs}"
    "_iteration_{iteration}"
)

# Leave-one-process-out with the training set refined to k nearest processes.
BIREDUCTS_FILENAME_OUT_TEMPLATE = (
    _BIREDUCTS_STEM + "_leave_one_process_out_k_{k}" + _BIREDUCTS_HYPERPARAMS + ".csv"
)

# Leave-one-process-out with training restricted to the same process category.
BIREDUCTS_FILENAME_OUT_TEMPLATE_SAME_CATEGORY = (
    _BIREDUCTS_STEM
    + "_leave_one_process_out_same_category"
    + _BIREDUCTS_HYPERPARAMS
    + ".csv"
)

In [23]:
# Send INFO-level (and more severe) log records to a file inside TMP_DIR; the
# experiment helpers below report their progress through `logging.info`.
logging.basicConfig(
    level=logging.INFO,
    filename=TMP_DIR / "b.log",
)

In [48]:
# --- Raw table: split off the process identifiers and the decision column ---
df = pd.read_csv(FILEPATH_IN, sep=SEP)
process_ids = df.pop("process_ids")
# The target is turned into category codes and then replaced by `1 - code`.
df_dec = 1 - df.pop("target").astype("category").cat.codes

# --- Discretized table: every remaining column becomes integer category codes ---
discretized_df = pd.read_csv(DISCRETIZED_FILEPATH_IN, sep=DISCRETIZED_SEP)
discretized_process_ids = discretized_df.pop("process_ids")
discretized_df = discretized_df.astype("category").apply(lambda col: col.cat.codes)
discretized_df_dec = 1 - discretized_df.pop("target")

# --- Process embeddings: keep the category mapping, index vectors by process id ---
embeddings = pd.read_csv(EMBEDDINGS_FILEPATH_IN)
process_category = embeddings[["process_ids", "process_category"]]
embeddings = embeddings.drop(columns=["process_category"]).set_index("process_ids")
emb_nbrs = NearestNeighbors().fit(embeddings)

# Both tables must describe the same rows in the same order.
assert (process_ids == discretized_process_ids).all()
assert (df_dec == discretized_df_dec).all()

In [5]:
# Sum of the raw table's decision values divided by the number of rows
# (presumably the share of rows with decision 1, given the `1 - code` encoding).
sum(df_dec) / len(df_dec)

0.06968954656801479

In [33]:
# Same ratio for the discretized table's decision column; the load cell asserts
# that both decision columns are equal, so the two ratios should match.
sum(discretized_df_dec) / len(discretized_df_dec)

0.06968954656801479

In [6]:
def refine_train(train, test, process_ids, embeddings, emb_nbrs, k):
    """Keep only training rows of processes near the held-out process.

    The held-out process is read from the first row of ``test``. Its embedding
    is used to query ``emb_nbrs`` for ``k + 1`` neighbours (one more than ``k``
    because the queried process is its own best match). Rows of ``train`` whose
    process id is among those neighbours are returned.

    Args:
        train: Training rows (DataFrame) indexed like ``process_ids``.
        test: Held-out rows; only ``test.index[0]`` is used.
        process_ids: Series mapping row labels to process ids.
        embeddings: DataFrame of embeddings indexed by process id.
        emb_nbrs: Fitted neighbour index exposing ``kneighbors``; positions it
            returns are looked up in ``embeddings.index``.
        k: Number of neighbouring processes to keep (besides the query itself).

    Returns:
        The subset of ``train`` belonging to the neighbouring processes.
    """
    held_out_process = process_ids.loc[test.index[0]]
    query = [embeddings.loc[held_out_process]]
    neighbour_positions = emb_nbrs.kneighbors(
        query, n_neighbors=k + 1, return_distance=False
    )[0]
    neighbour_process_ids = embeddings.index[neighbour_positions]
    keep_mask = process_ids.loc[train.index].isin(neighbour_process_ids)
    return train[keep_mask]


def fit_predict_xgboost_native(train, test, df_dec, **fit_predict_params):
    """Train a native XGBoost booster on ``train`` and score ``test``.

    Args:
        train: Training rows (DataFrame); labels are looked up in ``df_dec``
            by ``train.index``.
        test: Rows to score (DataFrame).
        df_dec: Series of decisions indexed like ``train``.
        **fit_predict_params: Booster parameters. Must contain
            ``num_boost_round``, which is removed from the parameter dict and
            passed to ``xgb.train`` separately. ``objective`` and
            ``eval_metric`` are always set to ``binary:logistic`` / ``logloss``.

    Returns:
        A 3-tuple of Series indexed by ``test.index``: the predicted scores and
        two constant-1 series (placeholders matching the ``counts`` /
        ``counts2`` outputs of the bireduct predictor).
    """
    booster_params = {
        **fit_predict_params,
        "objective": "binary:logistic",
        "eval_metric": "logloss",
    }
    num_boost_round = booster_params.pop("num_boost_round")

    train_matrix = xgb.DMatrix(train.values, label=df_dec.loc[train.index])
    test_matrix = xgb.DMatrix(test.values)
    booster = xgb.train(booster_params, train_matrix, num_boost_round=num_boost_round)
    logging.info(f"num_boost_rounds == {num_boost_round}")

    predicted = pd.Series(booster.predict(test_matrix), index=test.index)
    ones_a = pd.Series(1, index=test.index)
    ones_b = pd.Series(1, index=test.index)
    return predicted, ones_a, ones_b


def fit_predict_decision_tree(train, test, df_dec, **fit_predict_params):
    """Fit a decision tree on ``train`` and score ``test``.

    Args:
        train: Training rows (DataFrame); labels are looked up in ``df_dec``
            by ``train.index``.
        test: Rows to score (DataFrame).
        df_dec: Series of decisions indexed like ``train``.
        **fit_predict_params: Passed to ``tree.DecisionTreeClassifier``.

    Returns:
        A 3-tuple of Series indexed by ``test.index``: the scores (second
        column of ``predict_proba``) and two constant-1 series (placeholders
        matching the ``counts`` / ``counts2`` outputs of the bireduct
        predictor).
    """
    cl = tree.DecisionTreeClassifier(**fit_predict_params)
    cl = cl.fit(train.values, df_dec.loc[train.index])
    # Predict on a plain array, exactly like the fit call, so sklearn does not
    # warn about feature names that were absent at fit time.
    proba = cl.predict_proba(test.values)
    if proba.shape[1] > 1:
        scores = proba[:, 1]
    else:
        # A (refined) training fold may contain a single class; predict_proba
        # then has one column and `[:, 1]` would raise IndexError. Treat label
        # 1 as the scored class: score 1.0 if the fold was all 1, else 0.0.
        scores = np.full(len(test), float(cl.classes_[0] == 1))
    return (
        pd.Series(scores, index=test.index),
        pd.Series(1, index=test.index),
        pd.Series(1, index=test.index),
    )


def get_bireducts(
    train,
    df_dec,
    n_bireducts,
    n_of_probes,
    allowed_randomness,
    candidate_n_attrs,
    max_n_attrs,
):
    """Fit a bireduct generator on ``train`` and draw ``n_bireducts`` bireducts.

    Args:
        train: Training rows (DataFrame); decisions are looked up in
            ``df_dec`` by ``train.index``.
        df_dec: Series of decisions indexed like ``train``.
        n_bireducts: How many times ``get_bireduct`` is invoked.
        n_of_probes, allowed_randomness, candidate_n_attrs, max_n_attrs:
            Forwarded unchanged to ``DynamicallyAdaptedApproximateBireduct``.

    Returns:
        The list of results of the ``get_bireduct`` calls, computed in
        parallel with ``N_JOBS`` joblib workers.
    """
    generator = DynamicallyAdaptedApproximateBireduct(
        n_of_probes=n_of_probes,
        allowed_randomness=allowed_randomness,
        candidate_n_attrs=candidate_n_attrs,
        max_n_attrs=max_n_attrs,
    )
    generator.fit(train, df_dec.loc[train.index], check_data_consistency=False)
    jobs = (delayed(generator.get_bireduct)() for _ in range(n_bireducts))
    return Parallel(n_jobs=N_JOBS)(jobs)


def predict(train, test, bireduct, df_dec):
    """Predict decisions for ``test`` rows by exact matching against a bireduct.

    The training data is restricted (positionally) to the bireduct's objects
    and attributes and de-duplicated. Each test row, projected onto the same
    attributes, gets the decision of its nearest reference row, but only when
    that row is at distance exactly 0.

    Args:
        train: Training rows (DataFrame).
        test: Rows to predict (DataFrame) with the same column layout.
        bireduct: Object exposing ``objects`` (row positions) and
            ``attributes`` (column positions).
        df_dec: Series of decisions indexed like ``train``.

    Returns:
        A list with one entry per test row: the matched decision, or
        ``np.nan`` when no reference row matches exactly.
    """
    attrs = bireduct.attributes
    reference = train.iloc[bireduct.objects, attrs].drop_duplicates()
    matcher = NearestNeighbors().fit(reference)
    distances, positions = matcher.kneighbors(
        test.iloc[:, attrs], n_neighbors=1, return_distance=True
    )
    decisions = []
    for distance_row, position_row in zip(distances, positions):
        if distance_row[0] == 0:
            matched_label = reference.index[position_row[0]]
            decisions.append(df_dec.loc[matched_label])
        else:
            decisions.append(np.nan)
    return decisions


def fit_predict_bireducts(train, test, df_dec, **fit_predict_params):
    """Score ``test`` rows by voting over an ensemble of bireducts.

    Bireducts are drawn with ``get_bireducts`` (``n_of_probes`` is always set
    to 100). Each bireduct that yields a non-NaN decision for a test row adds
    that decision to the row's score, 1 to its count, and the bireduct's share
    of training objects (``len(bireduct.objects) / len(train)``) to its second
    count.

    Args:
        train: Training rows (DataFrame).
        test: Rows to score (DataFrame).
        df_dec: Series of decisions indexed like ``train``.
        **fit_predict_params: Forwarded to ``get_bireducts``.

    Returns:
        A 3-tuple of Series indexed by ``test.index``: summed decisions,
        number of contributing bireducts, and summed object shares.
    """
    generator_params = dict(fit_predict_params, n_of_probes=100)
    ensemble = get_bireducts(train, df_dec, **generator_params)

    n_test = len(test)
    vote_sum = np.zeros(n_test)
    vote_count = np.zeros(n_test)
    coverage_sum = np.zeros(n_test)

    for bireduct in ensemble:
        decisions = predict(train, test, bireduct, df_dec)
        for position, decision in enumerate(decisions):
            if np.isnan(decision):
                # This bireduct has no exact match for the row: no vote.
                continue
            vote_sum[position] += int(decision)
            vote_count[position] += 1
            coverage_sum[position] += len(bireduct.objects) / len(train)

    return tuple(
        pd.Series(values, index=test.index)
        for values in (vote_sum, vote_count, coverage_sum)
    )


def do_leave_one_process_out_k(
    fit_predict_fun, df, df_dec, process_ids, k, **fit_predict_params
):
    """Run leave-one-process-out evaluation, optionally on a refined train set.

    For every process (group in ``process_ids``) the rows of that process form
    the test fold and all other rows the training fold. When ``k`` is not
    ``None`` the training fold is narrowed with ``refine_train`` using the
    notebook-level ``embeddings`` and ``emb_nbrs`` objects.

    Args:
        fit_predict_fun: Callable ``(train, test, df_dec, **params)`` returning
            three Series indexed by ``test.index``.
        df: Feature table (DataFrame).
        df_dec: Series of decisions indexed like ``df``.
        process_ids: Series of group labels, one per row of ``df``.
        k: Number of neighbouring processes to train on, or ``None`` to use
            the whole training fold.
        **fit_predict_params: Forwarded to ``fit_predict_fun``.

    Returns:
        A 3-tuple of float Series (scores, counts, counts2) covering all folds,
        each sorted by index.
    """

    def _combine(parts):
        # `Series.append` was removed in pandas 2.0 (and re-copied the whole
        # accumulator on every fold); gather the pieces and concatenate once.
        # The cast keeps the float dtype of the original accumulators.
        if not parts:
            return pd.Series(dtype="float")
        return pd.concat(parts).astype("float").sort_index()

    logging.info({"filename": FILEPATH_IN})
    logging.info({"k": k})
    logging.info(fit_predict_params)
    score_parts, count_parts, count2_parts = [], [], []
    splits = LeaveOneGroupOut().split(df, df_dec, groups=process_ids)
    for fold_no, (train_index, test_index) in enumerate(tqdm.tqdm(splits), start=1):
        logging.info(str(fold_no))
        train = df.iloc[train_index]
        test = df.iloc[test_index]
        if k is not None:
            train = refine_train(train, test, process_ids, embeddings, emb_nbrs, k)
        s, c, c2 = fit_predict_fun(train, test, df_dec, **fit_predict_params)
        score_parts.append(s)
        count_parts.append(c)
        count2_parts.append(c2)
    return _combine(score_parts), _combine(count_parts), _combine(count2_parts)


def do_leave_one_process_out_same_category(
    fit_predict_fun, df, df_dec, process_ids, process_category, **fit_predict_params
):
    """Run leave-one-process-out evaluation within each process category.

    For every process id listed in ``process_category`` the rows of that
    process form the test fold; the training fold consists of the rows of all
    *other* processes that share its category.

    Args:
        fit_predict_fun: Callable ``(train, test, df_dec, **params)`` returning
            three Series indexed by ``test.index``.
        df: Feature table (DataFrame).
        df_dec: Series of decisions indexed like ``df``.
        process_ids: Series of process ids, one per row of ``df``.
        process_category: DataFrame with ``process_ids`` and
            ``process_category`` columns.
        **fit_predict_params: Forwarded to ``fit_predict_fun``.

    Returns:
        A 3-tuple of float Series (scores, counts, counts2) covering all folds,
        each sorted by index.
    """

    def _combine(parts):
        # `Series.append` was removed in pandas 2.0 (and re-copied the whole
        # accumulator on every fold); gather the pieces and concatenate once.
        # The cast keeps the float dtype of the original accumulators.
        if not parts:
            return pd.Series(dtype="float")
        return pd.concat(parts).astype("float").sort_index()

    logging.info("same category")
    logging.info(fit_predict_params)
    score_parts, count_parts, count2_parts = [], [], []
    held_out_ids = process_category["process_ids"].unique()
    for fold_no, leave_one_out_process_id in enumerate(
        tqdm.tqdm(held_out_ids), start=1
    ):
        logging.info(str(fold_no))
        test = df[process_ids == leave_one_out_process_id]
        # Category of the held-out process (first matching mapping row).
        test_category = process_category[
            process_category["process_ids"] == leave_one_out_process_id
        ]["process_category"].iloc[0]
        test_category_process_ids = process_category[
            process_category["process_category"] == test_category
        ]["process_ids"]
        train = df[
            process_ids.isin(test_category_process_ids)
            & (process_ids != leave_one_out_process_id)
        ]
        s, c, c2 = fit_predict_fun(train, test, df_dec, **fit_predict_params)
        score_parts.append(s)
        count_parts.append(c)
        count2_parts.append(c2)
    return _combine(score_parts), _combine(count_parts), _combine(count2_parts)

In [8]:
# x = do_leave_one_process_out_k(fit_predict_decision_tree,
#                                df, df_dec, process_ids, k=40,
#                                min_impurity_decrease=0.001,
#                                max_depth=3,
#                               )
# roc_auc_score(df_dec, x[0])

# XGBoost

In [None]:
# Hyper-parameter grid for the XGBoost leave-one-process-out sweep.
# k=None means "train on all other processes" (no neighbour refinement).
k_values = list(range(10, 202, 10)) + [None]
learning_rate_values = [0.001]
max_depth_values = [3]
num_boost_round_values = [1000]
more_params_values = [{"base_score": 0.0696}]

# Wider grid used for hyper-parameter exploration:
# k_values = [40]
# learning_rate_values = [0.1, 0.01, 0.001]
# max_depth_values = [2, 3, 4, 5, 10]
# num_boost_round_values = [1000]
# more_params_values = [{'base_score': 0.0696}, {'base_score': 0.5}]

for more_params in more_params_values:
    for k in k_values:
        for num_boost_round in num_boost_round_values:
            for learning_rate in learning_rate_values:
                for max_depth in max_depth_values:
                    x = do_leave_one_process_out_k(
                        fit_predict_xgboost_native,
                        df,
                        df_dec,
                        process_ids,
                        k=k,
                        num_boost_round=num_boost_round,
                        learning_rate=learning_rate,
                        max_depth=max_depth,
                        **more_params,
                    )
                    out_path = TMP_DIR / XGBOOST_FILENAME_OUT_TEMPLATE.format(
                        k=k,
                        num_boost_round=num_boost_round,
                        learning_rate=learning_rate,
                        max_depth=max_depth,
                        **more_params,
                    )
                    # `to_csv` does not create missing directories; make sure
                    # the experiment folder exists so a long run is not lost.
                    out_path.parent.mkdir(parents=True, exist_ok=True)
                    pd.DataFrame(
                        {"scores": x[0], "counts": x[1], "counts2": x[2]}
                    ).to_csv(out_path, sep=";", index=False)

1it [00:02,  2.94s/it]

In [None]:
# XGBoost, leave-one-process-out restricted to the same process category.
learning_rate = 0.001
max_depth = 3
num_boost_round = 1000
more_params = {"base_score": 0.0696}

x = do_leave_one_process_out_same_category(
    fit_predict_xgboost_native,
    df,
    df_dec,
    process_ids,
    process_category,
    num_boost_round=num_boost_round,
    learning_rate=learning_rate,
    max_depth=max_depth,
    **more_params,
)
out_path = TMP_DIR / XGBOOST_FILENAME_OUT_TEMPLATE_SAME_CATEGORY.format(
    num_boost_round=num_boost_round,
    learning_rate=learning_rate,
    max_depth=max_depth,
    **more_params,
)
# `to_csv` does not create missing directories; make sure the experiment
# folder exists before writing.
out_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
    out_path, sep=";", index=False
)

 25%|██▍       | 50/202 [07:05<23:49,  9.40s/it]

In [10]:
# q = pd.read_csv('./tmp/to_remove/xgboost_leave_one_process_out_k_40_annonymized_discretized_quantile_num_boost_round_1_learning_rate_0.01_max_depth_3.csv', sep=';')
# roc_auc_score(df_dec, q['scores'])

In [11]:
# x = do_leave_one_process_out_k(fit_predict_xgboost_native,
#                                            df, df_dec, process_ids, k=40,
#                                            num_boost_round=1,
#                                            learning_rate=0.01,
#                                            max_depth=3)
# roc_auc_score(df_dec, x[0])

# Tree

In [10]:
# Hyper-parameter grid for the decision-tree leave-one-process-out sweep.
# k=None means "train on all other processes" (no neighbour refinement).
k_values = list(range(10, 202, 10)) + [None]
min_impurity_decrease_values = [0.0, 0.001]
max_depth_values = [2, 3, 4, 5]
more_params = {}

# Wider grid used for hyper-parameter exploration:
# k_values = [40]
# max_depth_values = [2, 3, 4, 5]
# min_impurity_decrease_values = [0.0, 0.001, 0.01, 0.1]
# more_params = {}

for k in k_values:
    for min_impurity_decrease in min_impurity_decrease_values:
        for max_depth in max_depth_values:
            x = do_leave_one_process_out_k(
                fit_predict_decision_tree,
                df,
                df_dec,
                process_ids,
                k=k,
                min_impurity_decrease=min_impurity_decrease,
                max_depth=max_depth,
                **more_params,
            )
            out_path = TMP_DIR / TREE_FILENAME_OUT_TEMPLATE.format(
                k=k,
                min_impurity_decrease=min_impurity_decrease,
                max_depth=max_depth,
                **more_params,
            )
            # `to_csv` does not create missing directories; make sure the
            # experiment folder exists before writing.
            out_path.parent.mkdir(parents=True, exist_ok=True)
            pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
                out_path, sep=";", index=False
            )

202it [00:11, 17.55it/s]
202it [00:12, 15.69it/s]
202it [00:13, 15.41it/s]
202it [00:11, 18.12it/s]
202it [00:12, 16.74it/s]
202it [00:12, 16.27it/s]
202it [00:13, 15.52it/s]
202it [00:16, 12.24it/s]
202it [00:17, 11.40it/s]
202it [00:13, 15.00it/s]
202it [00:15, 13.14it/s]
202it [00:15, 12.95it/s]
202it [00:15, 12.76it/s]
202it [00:20,  9.70it/s]
202it [00:23,  8.72it/s]
202it [00:15, 13.16it/s]
202it [00:18, 11.21it/s]
202it [00:17, 11.35it/s]
202it [00:19, 10.49it/s]
202it [00:25,  7.95it/s]
202it [00:29,  6.90it/s]
202it [00:18, 10.81it/s]
202it [00:21,  9.53it/s]
202it [00:21,  9.45it/s]
202it [00:21,  9.30it/s]
202it [00:30,  6.70it/s]
202it [00:34,  5.88it/s]
202it [00:20,  9.63it/s]
202it [00:23,  8.64it/s]
202it [00:23,  8.66it/s]
202it [00:24,  8.18it/s]
202it [00:35,  5.65it/s]
202it [00:40,  5.00it/s]
202it [00:23,  8.45it/s]
202it [00:27,  7.42it/s]
202it [00:27,  7.37it/s]
202it [00:27,  7.32it/s]
202it [00:42,  4.80it/s]
202it [00:48,  4.15it/s]
202it [00:27,  7.30it/s]


In [15]:
# Decision tree, leave-one-process-out restricted to the same process category.
min_impurity_decrease = 0.0
max_depth = 5
more_params = {}

x = do_leave_one_process_out_same_category(
    fit_predict_decision_tree,
    df,
    df_dec,
    process_ids,
    process_category,
    min_impurity_decrease=min_impurity_decrease,
    max_depth=max_depth,
    **more_params,
)
out_path = TMP_DIR / TREE_FILENAME_OUT_TEMPLATE_SAME_CATEGORY.format(
    min_impurity_decrease=min_impurity_decrease, max_depth=max_depth, **more_params
)
# `to_csv` does not create missing directories; make sure the experiment
# folder exists before writing.
out_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
    out_path, sep=";", index=False
)

100%|██████████| 202/202 [00:15<00:00, 13.44it/s]


# Bireducts

In [None]:
# Earlier partial sweeps:
# k_values = [20, 50, 80, 110, 140, 170, 200]
# k_values = [10, 30, 60, 70, 90, 100, 120, 130, 150, 160, 180, 190, None]
# k=None means "train on all other processes" (no neighbour refinement).
k_values = list(range(10, 202, 10)) + [None]
n_bireducts_values = [1000]
candidate_n_attrs_values = [100]
allowed_randomness_values = [0.05]
max_n_attrs_values = [3]
more_params = {}

# Resume thresholds: iterations and k values below these are skipped.
# Set both to 0 for a full run.
resume_from_iteration = 1
resume_from_k = 70

# NOTE(review): this sweep passes the raw `df` / `df_dec`, while the output
# file name embeds the stem of DISCRETIZED_FILEPATH_IN and `discretized_df`
# is loaded but not used here - confirm which table is intended.
for iteration in range(3):
    if iteration < resume_from_iteration:
        continue
    for k in k_values:
        # `None < 70` raises TypeError, so only compare real k values; the
        # k=None (unrefined) configuration is always run.
        if k is not None and k < resume_from_k:
            continue
        for n_bireducts in n_bireducts_values:
            for candidate_n_attrs in candidate_n_attrs_values:
                for allowed_randomness in allowed_randomness_values:
                    for max_n_attrs in max_n_attrs_values:
                        x = do_leave_one_process_out_k(
                            fit_predict_bireducts,
                            df,
                            df_dec,
                            process_ids,
                            k=k,
                            n_bireducts=n_bireducts,
                            candidate_n_attrs=candidate_n_attrs,
                            allowed_randomness=allowed_randomness,
                            max_n_attrs=max_n_attrs,
                            **more_params,
                        )
                        out_path = TMP_DIR / BIREDUCTS_FILENAME_OUT_TEMPLATE.format(
                            k=k,
                            n_bireducts=n_bireducts,
                            candidate_n_attrs=candidate_n_attrs,
                            allowed_randomness=allowed_randomness,
                            max_n_attrs=max_n_attrs,
                            iteration=iteration,
                            **more_params,
                        )
                        # `to_csv` does not create missing directories; make
                        # sure the experiment folder exists before writing.
                        out_path.parent.mkdir(parents=True, exist_ok=True)
                        pd.DataFrame(
                            {"scores": x[0], "counts": x[1], "counts2": x[2]}
                        ).to_csv(out_path, sep=";", index=False)

In [None]:
# Bireducts, leave-one-process-out restricted to the same process category.
n_bireducts = 1000
candidate_n_attrs = 100
allowed_randomness = 0.05
max_n_attrs = 3
iteration = 1
# Defined here so the cell does not depend on whichever earlier cell last
# assigned `more_params` (the XGBoost cells store a `base_score` entry in it,
# which `get_bireducts` does not accept).
more_params = {}

x = do_leave_one_process_out_same_category(
    fit_predict_bireducts,
    df,
    df_dec,
    process_ids,
    process_category,
    n_bireducts=n_bireducts,
    candidate_n_attrs=candidate_n_attrs,
    allowed_randomness=allowed_randomness,
    max_n_attrs=max_n_attrs,
    **more_params,
)
out_path = TMP_DIR / BIREDUCTS_FILENAME_OUT_TEMPLATE_SAME_CATEGORY.format(
    n_bireducts=n_bireducts,
    candidate_n_attrs=candidate_n_attrs,
    allowed_randomness=allowed_randomness,
    max_n_attrs=max_n_attrs,
    iteration=iteration,
    **more_params,
)
# `to_csv` does not create missing directories; make sure the experiment
# folder exists before writing.
out_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
    out_path, sep=";", index=False
)