In [20]:
import logging
import pathlib
import pickle
import warnings

import attr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
import xgboost as xgb
from joblib import Parallel, delayed
from sklearn.metrics import auc, precision_score, recall_score, roc_curve
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors

from skrough.base import Bireduct
from skrough.bireducts.dynamically_adapted_approximate_bireduct import (
    DynamicallyAdaptedApproximateBireduct,
)

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

import config

In [21]:
# Input and scratch directories are taken from the project-local `config` module.
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)
# Field separator used when reading FILEPATH_IN.
SEP = ";"


# Discretized input table. Only the "quantile" variant is active; the other two
# lines are alternative inputs kept for switching by hand.
# FILEPATH_IN = DATA_DIR / 'toolbox_tabular_data_annonymized_discretized_uniform.csv'
FILEPATH_IN = DATA_DIR / "toolbox_tabular_data_annonymized_discretized_quantile.csv"
# FILEPATH_IN = DATA_DIR / 'toolbox_tabular_data_annonymized_discretized_kmeans.csv'
# Per-process embedding table (read with pandas' default separator).
EMBEDDINGS_FILEPATH_IN = DATA_DIR / "process_embeddings_d50.csv"


# Result path template, resolved relative to TMP_DIR; `{k}` and `{rep}` are
# filled in by the experiment loop. The commented line is a previous run's template.
# FILENAME_OUT_TEMPLATE = 'paper_xgboost_100_learning_rate_001/xgboost_leave_one_process_out_k_{k}_annonymized_discretized_quantile_rep_{rep}.csv'
FILENAME_OUT_TEMPLATE = "paper_xgboost_100_learning_rate_001_max_depth_3/xgboost_leave_one_process_out_k_{k}_annonymized_discretized_quantile_rep_{rep}.csv"

In [22]:
# Send INFO-level progress messages to a log file under TMP_DIR instead of the
# notebook output.
logging.basicConfig(filename=TMP_DIR / "b.log", level=logging.INFO)

In [23]:
# --- Tabular data -----------------------------------------------------------
# Read the table, set the process identifier column aside, and turn every
# remaining column (the target included) into integer category codes.
df = pd.read_csv(FILEPATH_IN, sep=SEP)
process_ids = df["process_ids"]
df = (
    df.drop(columns="process_ids")
    .astype("category")
    .apply(lambda column: column.cat.codes)
)
df_dec = df["target"]
df = df.drop(columns="target")


# --- Process embeddings -----------------------------------------------------
# Keep the id -> category mapping separately, then index the embedding vectors
# by process id and fit the neighbour search on them.
embeddings = pd.read_csv(EMBEDDINGS_FILEPATH_IN)
process_category = embeddings[["process_ids", "process_category"]]
embeddings = embeddings.drop(columns=["process_category"]).set_index("process_ids")
emb_nbrs = NearestNeighbors().fit(embeddings)

In [24]:
def refine_train(train, test, process_ids, embeddings, emb_nbrs, k):
    """Keep only the training rows of processes near the test process.

    The process of the first test row is looked up in ``process_ids``, its
    embedding row is used to query ``emb_nbrs``, and ``train`` is filtered to
    rows whose process id is among the returned neighbours.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and test rows; their index labels are used to look up
        ``process_ids``.
    process_ids : pandas.Series
        Process id per row label.
    embeddings : pandas.DataFrame
        Embedding rows indexed by process id.
    emb_nbrs : object
        Neighbour index exposing ``kneighbors(X, n_neighbors=..., return_distance=...)``
        that returns positional indices into ``embeddings``.
    k : int
        Number of neighbouring processes to keep.

    Returns
    -------
    pandas.DataFrame
        The subset of ``train`` belonging to the neighbouring processes.
    """
    held_out_process = process_ids.loc[test.index[0]]
    query = [embeddings.loc[held_out_process]]
    # Ask for k + 1 neighbours: the queried process itself is expected to be the
    # closest match, so one extra slot is needed to end up with k others.
    neighbour_positions = emb_nbrs.kneighbors(
        query, n_neighbors=k + 1, return_distance=False
    )[0]
    neighbour_ids = embeddings.index[neighbour_positions]
    keep = process_ids.loc[train.index].isin(neighbour_ids)
    return train[keep]


def fit_predict(train, test, df_dec):
    """Fit an XGBoost classifier on ``train`` and score the rows of ``test``.

    Labels for fitting are taken from ``df_dec`` at the index labels of
    ``train``. The score of a test row is the second column of
    ``predict_proba`` (presumably the positive-class probability).

    Returns
    -------
    tuple of pandas.Series
        ``(scores, counts, counts2)``, all indexed like ``test``. Both count
        Series are constant 1.
    """
    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.01,
        eval_metric="logloss",
        use_label_encoder=False,
    )
    labels = df_dec.loc[train.index]
    model.fit(train.values, labels)

    # Predict on a copy of the raw array, as a plain ndarray rather than a frame.
    positive_proba = model.predict_proba(test.values.copy())[:, 1]

    scores = pd.Series(positive_proba, index=test.index)
    counts = pd.Series(1, index=test.index)
    counts2 = pd.Series(1, index=test.index)
    return scores, counts, counts2


def _concat_sorted(parts):
    """Concatenate per-fold Series and sort by index (empty float Series if no parts)."""
    if not parts:
        return pd.Series(dtype="float")
    return pd.concat(parts).sort_index()


def do_leave_one_process_out_k(df, df_dec, process_ids, k):
    """Run leave-one-process-out evaluation, optionally on the k nearest processes.

    Each fold holds out all rows of one process (groups given by
    ``process_ids``), fits on the remaining rows via ``fit_predict`` and
    collects the per-row results for the held-out rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table.
    df_dec : pandas.Series
        Target values, aligned with ``df``.
    process_ids : pandas.Series
        Group label (process id) for every row of ``df``.
    k : int or None
        When not ``None``, the training rows of each fold are narrowed with
        ``refine_train`` using the module-level ``embeddings`` and ``emb_nbrs``.
        ``None`` trains on all remaining rows.

    Returns
    -------
    tuple of pandas.Series
        ``(scores, counts, counts2)`` gathered over all folds and sorted by index.
    """
    # Collect the per-fold pieces and concatenate once at the end.
    # Series.append was removed in pandas 2.0, and growing a Series inside the
    # loop copies the accumulated data on every iteration.
    score_parts, count_parts, count2_parts = [], [], []

    splits = LeaveOneGroupOut().split(df, df_dec, groups=process_ids)
    for fold_no, (train_index, test_index) in enumerate(tqdm.tqdm(splits), start=1):
        logging.info(str(fold_no))
        train = df.iloc[train_index]
        test = df.iloc[test_index]
        if k is not None:
            train = refine_train(train, test, process_ids, embeddings, emb_nbrs, k)
        s, c, c2 = fit_predict(train, test, df_dec)
        score_parts.append(s)
        count_parts.append(c)
        count2_parts.append(c2)

    return (
        _concat_sorted(score_parts),
        _concat_sorted(count_parts),
        _concat_sorted(count2_parts),
    )


# def do_leave_one_process_out_same_category(df, df_dec, process_ids, process_category,
#                                            n_bireducts, bireduct_allowed_randomness,
#                                            bireduct_candidate_n_attrs, bireduct_max_n_attrs):
#     scores = pd.Series(dtype='float')
#     counts = pd.Series(dtype='float')
#     counts2 = pd.Series(dtype='float')
#     i = 0
#     for leave_one_out_process_id in tqdm.tqdm(process_category['process_ids'].unique()):
#         i += 1
#         logging.info(str(i))
#         test = df[process_ids == leave_one_out_process_id]
#         test_category = process_category[process_category['process_ids'] == leave_one_out_process_id]['process_category'].iloc[0]
#         test_category_process_ids = process_category[process_category['process_category'] == test_category]['process_ids']
#         train = df[process_ids.isin(test_category_process_ids) & (process_ids != leave_one_out_process_id)]
#         s, c, c2 = fit_predict(train, test, df_dec, n_bireducts, bireduct_allowed_randomness,
#                                bireduct_candidate_n_attrs, bireduct_max_n_attrs)
#         scores = scores.append(s)
#         counts = counts.append(c)
#         counts2 = counts2.append(c2)
#     return scores.sort_index(), counts.sort_index(), counts2.sort_index()

In [None]:
# Run the leave-one-process-out experiment for each neighbourhood size k and
# write one result file per (k, rep) pair under TMP_DIR.
for k in [50, 80, 110, 140, 170, 200]:
    for rep in range(1):
        #         if k == 20 and rep <= 1:
        #             continue
        #         k = None
        out_path = TMP_DIR / FILENAME_OUT_TEMPLATE.format(k=k, rep=rep)
        # The template contains a sub-directory. Create it before the slow
        # computation so a missing directory cannot discard a finished run.
        out_path.parent.mkdir(parents=True, exist_ok=True)

        logging.info(f"k={k} rep={rep}")
        x = do_leave_one_process_out_k(df, df_dec, process_ids, k=k)
        scores, counts, counts2 = x
        pd.DataFrame({"scores": scores, "counts": counts, "counts2": counts2}).to_csv(
            out_path, sep=";", index=False
        )

202it [04:47,  1.42s/it]
202it [07:32,  2.24s/it]
202it [10:12,  3.03s/it]
102it [06:20,  3.59s/it]

In [None]:
# ghr = DynamicallyAdaptedApproximateBireduct(
#                             n_of_probes=100,
#                             allowed_randomness=BIREDUCT_ALLOWED_RANDOMNESS,
#                             candidate_n_attrs=BIREDUCT_CANDIDATE_N_ATTRS,
#                             max_n_attrs=2)
# ghr.fit(df, df_dec, check_data_consistency=False)
# q1 = ghr.get_bireduct()
# q1.attributes

In [42]:
# Fit a classifier on the whole table (no hold-out), leaving every
# hyper-parameter except the two arguments below at the library defaults.
cl = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
# Left as the trailing expression so the notebook displays the fitted estimator.
cl.fit(df, df_dec)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [44]:
# Number of rows of the fitting data whose predicted label equals the target.
# This is measured on the same rows the model was fitted on, not on held-out data.
train_predictions = cl.predict(df.values.copy())
(train_predictions == df_dec.values).sum()

19006