In [3]:
# from sklearn.metrics import roc_curve, auc, recall_score, precision_score
import pathlib
import warnings

import numpy as np
import pandas as pd

# from skrough.base import Bireduct
# from joblib import Parallel, delayed
# import time
import xgboost as xgb

# import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors

# import tqdm.notebook
# import attr
# import pickle
# # from skrough.bireducts.greedy_heuristic_bireduct import GreedyHeuristicBireduct
# from skrough.bireducts.dynamically_adapted_approximate_bireduct import DynamicallyAdaptedApproximateBireduct


warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

import config

In [12]:
# Input/scratch directories come from the project-local `config` module.
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)
# Worker count handed to joblib's Parallel in get_bireducts.
N_JOBS = 7
# Column separator of the tabular input file (FILEPATH_IN).
SEP = ";"

# NOTE(review): presumably the `k` passed to refine_train (number of nearest
# processes kept for training) — not referenced in the visible cells; confirm.
K_NEIGHBORS_PROCESSES = 40
# NOTE(review): presumably the values for the same-named parameters of
# fit_predict / get_bireducts — not referenced in the visible cells; confirm.
N_BIREDUCTS = 500
BIREDUCT_ALLOWED_RANDOMNESS = 0.1
BIREDUCT_CANDIDATE_N_ATTRS = 30

# Tabular input; the commented-out lines are alternative input files
# (file names suggest different discretization strategies).
FILEPATH_IN = DATA_DIR / "toolbox_tabular_data_annonymized_discretized_uniform.csv"
# FILEPATH_IN = DATA_DIR / 'toolbox_tabular_data_annonymized_discretized_quantile.csv'
# FILEPATH_IN = DATA_DIR / 'toolbox_tabular_data_annonymized_discretized_kmeans.csv'
# Per-process embedding table; read with pandas' default separator, not SEP.
EMBEDDINGS_FILEPATH_IN = DATA_DIR / "process_embeddings_d50.csv"

In [13]:
# --- Tabular data -----------------------------------------------------------
# Read the table, split off the process identifiers, integer-encode every
# remaining column through its category codes, then split off the decision
# ("target") column.
df = pd.read_csv(FILEPATH_IN, sep=SEP)
process_ids = df.pop("process_ids")
df = df.astype("category").apply(lambda col: col.cat.codes)
df_dec = df.pop("target")


# --- Process embeddings -----------------------------------------------------
# Keep the id -> category mapping aside, then index the embedding vectors by
# process id and fit the neighbour index on them.
embeddings = pd.read_csv(EMBEDDINGS_FILEPATH_IN)
process_category = embeddings[["process_ids", "process_category"]]
embeddings = embeddings.drop(columns=["process_category"]).set_index("process_ids")
emb_nbrs = NearestNeighbors().fit(embeddings)

In [14]:
# Replace every encoded decision value d with 1 - d (swaps 0 and 1, assuming the
# target is binary — TODO confirm).
# NOTE: this cell is not idempotent — every additional run flips df_dec again,
# so run it exactly once after the data-loading cell.
df_dec = 1 - df_dec

In [7]:
# Mean of df_dec, i.e. the share of rows labelled 1 if the target is binary.
sum(df_dec) / len(df_dec)

0.06968954656801479

In [11]:
# Mean of df_dec (share of rows labelled 1 if the target is binary).
# NOTE(review): same expression as an earlier cell — one of the two can go.
sum(df_dec) / len(df_dec)

0.06968954656801479

In [16]:
# Ratio of the sum of df_dec to the sum of its complement, i.e. the number of
# 1-labelled rows per 0-labelled row if the target is binary.
sum(df_dec) / sum(1 - df_dec)

0.07490998978988661

In [7]:
def refine_train(train, test, process_ids, embeddings, emb_nbrs, k):
    """Keep only the training rows that belong to the processes nearest to the test process.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and test rows; their index labels are looked up in ``process_ids``.
        The process of the first test row is used as the query process.
    process_ids : pandas.Series
        Process identifier of every row, indexed like ``train`` / ``test``.
    embeddings : pandas.DataFrame
        One embedding vector per process, indexed by process identifier.
    emb_nbrs : fitted neighbour index
        Object exposing ``kneighbors(X, n_neighbors=..., return_distance=False)``
        whose returned positions refer to rows of ``embeddings``.
    k : int
        Number of neighbouring processes requested.

    Returns
    -------
    pandas.DataFrame
        The rows of ``train`` whose process is among the returned neighbours.
    """
    test_process_id = process_ids.loc[test.index[0]]

    # Ask for k + 1 neighbours because the query process itself is the best
    # match; never ask for more neighbours than there are embedded processes,
    # which the neighbour search would reject.
    n_neighbors = min(k + 1, len(embeddings))

    # A single-row DataFrame (rather than a list holding a Series) keeps the
    # column names the neighbour index was fitted with.
    query = embeddings.loc[[test_process_id]]
    neighbor_positions = emb_nbrs.kneighbors(
        query, n_neighbors=n_neighbors, return_distance=False
    )[0]

    neighbor_ids = embeddings.index[neighbor_positions]
    keep = process_ids.loc[train.index].isin(neighbor_ids)
    return train[keep]


def get_bireducts(
    train, df_dec, n_bireducts, bireduct_allowed_randomness, bireduct_candidate_n_attrs
):
    """Fit a bireduct generator on ``train`` and draw ``n_bireducts`` bireducts in parallel.

    The generator is fitted on ``train`` with the decisions ``df_dec`` restricted
    to the training index; the draws are dispatched through joblib using the
    module-level ``N_JOBS`` worker count.

    Returns the list produced by the parallel run, one entry per draw.
    """
    # NOTE(review): DynamicallyAdaptedApproximateBireduct, Parallel and delayed
    # are only present as commented-out imports in the visible import cell, so
    # this function needs them restored before it can run on a fresh kernel.
    generator = DynamicallyAdaptedApproximateBireduct(
        n_of_probes=100,
        allowed_randomness=bireduct_allowed_randomness,
        candidate_n_attrs=bireduct_candidate_n_attrs,
    )
    train_decisions = df_dec.loc[train.index]
    generator.fit(train, train_decisions, check_data_consistency=False)

    runner = Parallel(n_jobs=N_JOBS)
    return runner(delayed(generator.get_bireduct)() for _ in range(n_bireducts))


def predict(train, test, bireduct, df_dec):
    """Classify test rows by exact match against the rows selected by a bireduct.

    The bireduct selects training rows (``bireduct.objects``) and columns
    (``bireduct.attributes``), both used positionally. A test row receives the
    decision of the training row with identical values on those columns, or
    ``np.nan`` when no selected training row matches.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and test rows sharing the same column layout.
    bireduct : object
        Exposes ``objects`` and ``attributes`` (positional row / column indexers).
    df_dec : pandas.Series
        Decision values, looked up by the index labels of ``train``.

    Returns
    -------
    list
        One entry per test row, in test order.
    """
    cols = bireduct.attributes
    bireduct_train = train.iloc[bireduct.objects, cols].drop_duplicates()

    # Only exact matches count, so a hash lookup on the value tuples replaces a
    # nearest-neighbour search filtered on distance == 0. It also copes with an
    # empty object set, where every test row is simply left unmatched.
    first_label_by_values = {}
    for label, values in zip(
        bireduct_train.index, bireduct_train.itertuples(index=False, name=None)
    ):
        first_label_by_values.setdefault(values, label)

    unmatched = object()  # sentinel: an index label could itself be falsy or None
    result = []
    for values in test.iloc[:, cols].itertuples(index=False, name=None):
        label = first_label_by_values.get(values, unmatched)
        result.append(np.nan if label is unmatched else df_dec.loc[label])
    return result


def fit_predict(
    train,
    test,
    df_dec,
    n_bireducts,
    bireduct_allowed_randomness,
    bireduct_candidate_n_attrs,
):
    """Build an ensemble of bireducts on ``train`` and aggregate their votes on ``test``.

    Every bireduct either assigns a decision to a test row or abstains (NaN).
    For each test row three totals are accumulated over the non-abstaining
    bireducts: the sum of the decisions, the number of votes, and the sum of
    each voting bireduct's object count divided by the training-set size.

    Returns
    -------
    tuple of pandas.Series
        ``(scores, counts, counts2)``, each indexed like ``test``.
    """
    ensemble = get_bireducts(
        train,
        df_dec,
        n_bireducts,
        bireduct_allowed_randomness,
        bireduct_candidate_n_attrs,
    )

    n_test = len(test)
    vote_sum = np.zeros(n_test)
    vote_count = np.zeros(n_test)
    weighted_count = np.zeros(n_test)

    for member in ensemble:
        decisions = predict(train, test, member, df_dec)
        for pos, decision in enumerate(decisions):
            if np.isnan(decision):
                continue  # this bireduct abstains on the row
            vote_sum[pos] += int(decision)
            vote_count[pos] += 1
            weighted_count[pos] += len(member.objects) / len(train)

    return tuple(
        pd.Series(totals, index=test.index)
        for totals in (vote_sum, vote_count, weighted_count)
    )


def _concat_sorted(parts):
    """Concatenate per-fold Series and sort by index; empty float Series if there are no folds."""
    if not parts:
        return pd.Series(dtype="float")
    return pd.concat(parts).sort_index()


def do_leave_one_process_out_k(
    df,
    df_dec,
    process_ids,
    k,
    n_bireducts,
    bireduct_allowed_randomness,
    bireduct_candidate_n_attrs,
):
    """Run leave-one-process-out evaluation of the bireduct ensemble.

    Each fold holds out all rows of one process (groups given by
    ``process_ids``). When ``k`` is not None, the training part is first
    narrowed with ``refine_train``; that call reads the module-level
    ``embeddings`` and ``emb_nbrs`` objects.

    Returns
    -------
    tuple of pandas.Series
        ``(scores, counts, counts2)`` from ``fit_predict`` over all folds,
        concatenated and sorted by index.
    """
    # NOTE(review): `tqdm` is only present as a commented-out import in the
    # visible import cell; it must be restored for this loop to run.
    fold_scores, fold_counts, fold_counts2 = [], [], []
    for train_index, test_index in tqdm.notebook.tqdm(
        LeaveOneGroupOut().split(df, df_dec, groups=process_ids)
    ):
        train = df.iloc[train_index]
        test = df.iloc[test_index]
        if k is not None:
            train = refine_train(train, test, process_ids, embeddings, emb_nbrs, k)
        s, c, c2 = fit_predict(
            train,
            test,
            df_dec,
            n_bireducts,
            bireduct_allowed_randomness,
            bireduct_candidate_n_attrs,
        )
        # Collect the pieces and concatenate once at the end: Series.append was
        # removed in pandas 2.0 and re-copied the accumulated data on every fold.
        fold_scores.append(s)
        fold_counts.append(c)
        fold_counts2.append(c2)
    return (
        _concat_sorted(fold_scores),
        _concat_sorted(fold_counts),
        _concat_sorted(fold_counts2),
    )

In [8]:
# Pre-compute the leave-one-process-out folds, keyed by the held-out process id.
splits = {}
for train_index, test_index in LeaveOneGroupOut().split(df, df_dec, groups=process_ids):
    # test_index holds row positions, so look the process id up positionally;
    # plain [] indexing is label-based and only agrees on a default RangeIndex.
    splits[process_ids.iloc[test_index[0]]] = {
        "train_index": train_index,
        "test_index": test_index,
    }

In [9]:
def get_data(df, df_dec, process_ids, embeddings, emb_nbrs, k, proc_id):
    """Return the train/test data for the fold that holds out process ``proc_id``.

    The fold's positional row indices are read from the module-level ``splits``
    dict; the training rows are then narrowed with ``refine_train``.

    Returns
    -------
    tuple
        ``(train, train_decisions, test, test_decisions)`` where the decisions
        are ``df_dec`` restricted to the respective index.
    """
    fold = splits[proc_id]
    held_in = df.iloc[fold["train_index"]]
    test = df.iloc[fold["test_index"]]
    train = refine_train(held_in, test, process_ids, embeddings, emb_nbrs, k)
    train_decisions = df_dec.loc[train.index]
    test_decisions = df_dec.loc[test.index]
    return train, train_decisions, test, test_decisions

In [17]:
# Build the fold that holds out "process_102". The neighbour count comes from
# the K_NEIGHBORS_PROCESSES config constant instead of a repeated literal.
train, train_target, test, test_target = get_data(
    df, df_dec, process_ids, embeddings, emb_nbrs, K_NEIGHBORS_PROCESSES, "process_102"
)
# Wrap both parts as XGBoost matrices with their decisions as labels.
dtrain = xgb.DMatrix(train, label=train_target)
dtest = xgb.DMatrix(test, label=test_target)

In [18]:
# Sanity check: list the distinct processes in the test part, then in the train part.
for part in (test, train):
    print(process_ids[part.index].unique())

['process_102']
['process_129' 'process_168' 'process_11' 'process_192' 'process_82'
 'process_30' 'process_15' 'process_71' 'process_60' 'process_148'
 'process_70' 'process_181' 'process_81' 'process_98' 'process_103'
 'process_101' 'process_5' 'process_6' 'process_146' 'process_194'
 'process_133' 'process_100' 'process_48' 'process_166' 'process_161'
 'process_164' 'process_99' 'process_18' 'process_165' 'process_169'
 'process_37' 'process_132' 'process_86' 'process_91' 'process_8'
 'process_121' 'process_83' 'process_39' 'process_90' 'process_123']


In [20]:
# Booster settings: binary logistic objective, reporting AUC and log-loss on
# every evaluation set after each boosting round.
num_boost_round = 50
param = dict(
    learning_rate=0.1,
    max_depth=3,
    objective="binary:logistic",
    eval_metric=["auc", "logloss"],
    #     base_score=0.075,
)
# Evaluate on both the training fold ("train") and the held-out process ("eval").
evallist = [(dtrain, "train"), (dtest, "eval")]

bst = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=evallist)

[0]	train-auc:0.58144	train-logloss:0.61557	eval-auc:0.50000	eval-logloss:0.61735
[1]	train-auc:0.65314	train-logloss:0.55192	eval-auc:0.54792	eval-logloss:0.55598
[2]	train-auc:0.65032	train-logloss:0.49937	eval-auc:0.54792	eval-logloss:0.50503
[3]	train-auc:0.65491	train-logloss:0.45479	eval-auc:0.54583	eval-logloss:0.46304
[4]	train-auc:0.70857	train-logloss:0.41732	eval-auc:0.54583	eval-logloss:0.42855
[5]	train-auc:0.71017	train-logloss:0.38530	eval-auc:0.54583	eval-logloss:0.39892
[6]	train-auc:0.71702	train-logloss:0.35765	eval-auc:0.54583	eval-logloss:0.37412
[7]	train-auc:0.71646	train-logloss:0.33374	eval-auc:0.54583	eval-logloss:0.35178
[8]	train-auc:0.71685	train-logloss:0.31326	eval-auc:0.54583	eval-logloss:0.33368
[9]	train-auc:0.72581	train-logloss:0.29548	eval-auc:0.55833	eval-logloss:0.31821
[10]	train-auc:0.72714	train-logloss:0.28003	eval-auc:0.55833	eval-logloss:0.30473
[11]	train-auc:0.72820	train-logloss:0.26661	eval-auc:0.55833	eval-logloss:0.29347
[12]	train-auc