In [1]:
import logging
import pathlib
import pickle
import warnings

import attr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm.notebook
from joblib import Parallel, delayed
from sklearn.metrics import auc, precision_score, recall_score, roc_curve
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors

from skrough.base import Bireduct
from skrough.bireducts.sampling_heuristic_bireduct import SamplingHeuristicBireduct

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

import config

In [2]:
# Input and scratch directories are taken from the project-local `config` module.
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)
# Worker count passed to joblib.Parallel when drawing bireducts.
N_JOBS = 32

# Number of nearest processes (in embedding space) whose rows are kept for training.
K_NEIGHBORS_PROCESSES = 40
# Number of bireducts drawn for every left-out process.
N_BIREDUCTS = 1000
# Forwarded to SamplingHeuristicBireduct as `initial_sample_n_attrs` / `max_n_attrs`.
BIREDUCT_INITIAL_SAMPLE_N_ATTRS = 100
BIREDUCT_MAX_N_ATTRS = 20
# Alternative single-bireduct configuration, kept for reference:
# N_BIREDUCTS = 1
# BIREDUCT_INITIAL_SAMPLE_N_ATTRS = 100
# BIREDUCT_MAX_N_ATTRS = 100


# Input files: the tabular data set and the per-process embeddings.
FILEPATH_IN = DATA_DIR / "toolbox_tabular_data_annonymized.csv"
EMBEDDINGS_FILEPATH_IN = DATA_DIR / "process_embeddings_d50.csv"
# Result file names encode the run parameters; the files are written under TMP_DIR.
FILENAME_OUT_TEMPLATE = "leave_one_process_out_k_{k}_bireducts_{n_bireducts}_sample_attrs_{initial_sample_n_attrs}_max_attrs_{max_n_attrs}.csv"
FILENAME_2_OUT_TEMPLATE = "leave_one_process_out_same_category_bireducts_{n_bireducts}_sample_attrs_{initial_sample_n_attrs}_max_attrs_{max_n_attrs}.csv"

In [3]:
# Route INFO-level records (e.g. the fold counter logged by the leave-one-out loop) to a file in TMP_DIR.
logging.basicConfig(filename=TMP_DIR / "b.log", level=logging.INFO)

In [4]:
# Tabular data: the process id column is split off before encoding.
df = pd.read_csv(FILEPATH_IN)
process_ids = df.pop("process_ids")
# Encode every remaining column (the target included) as integer category codes.
df = df.astype("category").apply(lambda column: column.cat.codes)
df_dec = df.pop("target")


# Process embeddings: keep the id -> category mapping aside, then index the
# remaining columns by process id and fit a neighbour search on them.
embeddings = pd.read_csv(EMBEDDINGS_FILEPATH_IN)
process_category = embeddings[["process_ids", "process_category"]]
embeddings = embeddings.drop(columns=["process_category"]).set_index("process_ids")
emb_nbrs = NearestNeighbors().fit(embeddings)

In [5]:
def refine_train(train, test, process_ids, embeddings, emb_nbrs, k):
    """Restrict ``train`` to rows of the processes closest to the test process.

    Parameters
    ----------
    train : pandas.DataFrame
        Candidate training rows; its index labels must be present in ``process_ids``.
    test : pandas.DataFrame
        Rows of the left-out process. Only the first row is used to look up the
        process id, so all rows are expected to belong to one process.
    process_ids : pandas.Series
        Process id of every row, addressed by the row labels of ``train``/``test``.
    embeddings : pandas.DataFrame
        One embedding row per process, indexed by process id.
    emb_nbrs
        Fitted neighbour search exposing ``kneighbors``. It is assumed to be
        fitted on ``embeddings`` so that returned positions index ``embeddings``.
    k : int
        Number of neighbouring processes to keep (not counting the test process).

    Returns
    -------
    pandas.DataFrame
        The rows of ``train`` whose process is among the nearest neighbours.
    """
    test_process_id = process_ids.loc[test.index[0]]
    # Ask for k + 1 neighbours because the query process is its own best match.
    # Clamp to the number of embedded processes: requesting more neighbours than
    # fitted samples makes kneighbors raise instead of returning "everything".
    n_neighbors = min(k + 1, len(embeddings))
    # Query with a one-row DataFrame (not a list of Series) so the column names
    # seen at fit time are preserved.
    query = embeddings.loc[[test_process_id]]
    neighbor_positions = emb_nbrs.kneighbors(
        query, n_neighbors=n_neighbors, return_distance=False
    )[0]
    neighbor_ids = embeddings.index[neighbor_positions]
    return train[process_ids.loc[train.index].isin(neighbor_ids)]


def get_bireducts(
    train, df_dec, n_bireducts, bireduct_initial_sample_n_attrs, bireduct_max_n_attrs
):
    """Fit a sampling-heuristic bireduct generator on ``train`` and draw bireducts from it.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows (condition attributes only).
    df_dec : pandas.Series
        Decision values; the entries for ``train.index`` are used for fitting.
    n_bireducts : int
        How many bireducts to draw.
    bireduct_initial_sample_n_attrs, bireduct_max_n_attrs
        Forwarded to ``SamplingHeuristicBireduct`` as ``initial_sample_n_attrs``
        and ``max_n_attrs``.

    Returns
    -------
    list
        One result of ``get_bireduct()`` per requested draw.
    """
    sampler = SamplingHeuristicBireduct(
        initial_sample_n_attrs=bireduct_initial_sample_n_attrs,
        max_n_attrs=bireduct_max_n_attrs,
    )
    train_decisions = df_dec.loc[train.index]
    sampler.fit(train, train_decisions, check_data_consistency=False)
    # The draws are independent calls, so they are farmed out to N_JOBS workers.
    # NOTE(review): with process-based joblib backends every task works on a
    # pickled copy of the fitted sampler - confirm that get_bireduct's
    # randomness is not duplicated across tasks.
    return Parallel(n_jobs=N_JOBS)(
        delayed(sampler.get_bireduct)() for _ in range(n_bireducts)
    )


def predict(train, test, bireduct, df_dec):
    """Predict decisions for ``test`` rows by exact matching against one bireduct.

    The bireduct selects a sub-table of ``train`` (positions ``bireduct.objects``,
    columns ``bireduct.attributes``). For every test row the closest sub-table
    row is looked up; its decision is returned only when the distance is exactly
    zero, otherwise the row gets ``np.nan``.

    Returns
    -------
    list
        One entry per row of ``test``: a value from ``df_dec`` or ``np.nan``.
    """
    object_positions = bireduct.objects
    attribute_positions = bireduct.attributes
    reference = train.iloc[object_positions, attribute_positions]
    matcher = NearestNeighbors().fit(reference)
    distances, positions = matcher.kneighbors(
        test.iloc[:, attribute_positions], n_neighbors=1, return_distance=True
    )
    decisions = []
    for distance_row, position_row in zip(distances, positions):
        if distance_row[0] == 0:
            # Identical on all bireduct attributes: reuse the matched row's decision.
            matched_label = reference.index[position_row[0]]
            decisions.append(df_dec.loc[matched_label])
        else:
            decisions.append(np.nan)
    return decisions


def fit_predict(
    train,
    test,
    df_dec,
    n_bireducts,
    bireduct_initial_sample_n_attrs,
    bireduct_max_n_attrs,
):
    """Draw bireducts from ``train`` and accumulate their votes for every ``test`` row.

    Returns
    -------
    tuple of three pandas.Series, each indexed like ``test``
        * sum of the (integer) decisions predicted by bireducts that matched the row,
        * number of bireducts that matched the row,
        * sum over those bireducts of ``len(bireduct.objects) / len(train)``.
    """
    ensemble = get_bireducts(
        train,
        df_dec,
        n_bireducts,
        bireduct_initial_sample_n_attrs,
        bireduct_max_n_attrs,
    )
    n_test = len(test)
    vote_sum = np.zeros(n_test)
    vote_count = np.zeros(n_test)
    coverage_sum = np.zeros(n_test)
    for member in ensemble:
        member_decisions = predict(train, test, member, df_dec)
        for row, decision in enumerate(member_decisions):
            # NaN means this bireduct had no exact match for the row: no vote.
            if np.isnan(decision):
                continue
            vote_sum[row] += int(decision)
            vote_count[row] += 1
            coverage_sum[row] += len(member.objects) / len(train)
    return tuple(
        pd.Series(values, index=test.index)
        for values in (vote_sum, vote_count, coverage_sum)
    )


def do_leave_one_process_out_k(
    df,
    df_dec,
    process_ids,
    k,
    n_bireducts,
    bireduct_initial_sample_n_attrs,
    bireduct_max_n_attrs,
):
    """Leave-one-process-out evaluation, optionally training on the k nearest processes.

    Each process (group in ``process_ids``) is held out in turn. When ``k`` is
    not ``None`` the training rows are first narrowed with ``refine_train``,
    which reads the module-level ``embeddings`` and ``emb_nbrs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Condition attributes, one row per object.
    df_dec : pandas.Series
        Decision values aligned with ``df``.
    process_ids : pandas.Series
        Process id of every row; defines the leave-one-out groups.
    k : int or None
        Number of neighbouring processes to train on; ``None`` uses all others.
    n_bireducts, bireduct_initial_sample_n_attrs, bireduct_max_n_attrs
        Forwarded to ``fit_predict``.

    Returns
    -------
    tuple of three pandas.Series
        Scores, counts and counts2 from ``fit_predict`` for all rows, sorted by index.
    """

    def _combine(parts):
        # pd.concat rejects an empty list; keep returning an empty float Series then.
        if not parts:
            return pd.Series(dtype="float")
        return pd.concat(parts).sort_index()

    splitter = LeaveOneGroupOut()
    folds = splitter.split(df, df_dec, groups=process_ids)
    # The split generator has no length; pass the fold count so the bar shows progress.
    n_folds = splitter.get_n_splits(groups=process_ids)

    # Collect per-fold pieces and concatenate once: Series.append was removed in
    # pandas 2.0 and copied the whole accumulator on every iteration.
    score_parts, count_parts, count2_parts = [], [], []
    for i, (train_index, test_index) in enumerate(
        tqdm.notebook.tqdm(folds, total=n_folds), start=1
    ):
        logging.info(str(i))
        train = df.iloc[train_index]
        test = df.iloc[test_index]
        if k is not None:
            train = refine_train(train, test, process_ids, embeddings, emb_nbrs, k)
        s, c, c2 = fit_predict(
            train,
            test,
            df_dec,
            n_bireducts,
            bireduct_initial_sample_n_attrs,
            bireduct_max_n_attrs,
        )
        score_parts.append(s)
        count_parts.append(c)
        count2_parts.append(c2)
    return _combine(score_parts), _combine(count_parts), _combine(count2_parts)


def do_leave_one_process_out_same_category(
    df,
    df_dec,
    process_ids,
    process_category,
    n_bireducts,
    bireduct_initial_sample_n_attrs,
    bireduct_max_n_attrs,
):
    """Leave-one-process-out evaluation, training only on processes of the same category.

    For every process id listed in ``process_category`` the rows of that process
    form the test set; the training set consists of the rows of all other
    processes that share its ``process_category`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Condition attributes, one row per object.
    df_dec : pandas.Series
        Decision values aligned with ``df``.
    process_ids : pandas.Series
        Process id of every row of ``df``.
    process_category : pandas.DataFrame
        Must contain the columns ``process_ids`` and ``process_category``.
    n_bireducts, bireduct_initial_sample_n_attrs, bireduct_max_n_attrs
        Forwarded to ``fit_predict``.

    Returns
    -------
    tuple of three pandas.Series
        Scores, counts and counts2 from ``fit_predict`` for all rows, sorted by index.
    """

    def _combine(parts):
        # pd.concat rejects an empty list; keep returning an empty float Series then.
        if not parts:
            return pd.Series(dtype="float")
        return pd.concat(parts).sort_index()

    # Collect per-process pieces and concatenate once: Series.append was removed
    # in pandas 2.0 and copied the whole accumulator on every iteration.
    score_parts, count_parts, count2_parts = [], [], []
    for leave_one_out_process_id in tqdm.notebook.tqdm(
        process_category["process_ids"].unique()
    ):
        test = df[process_ids == leave_one_out_process_id]
        # Category of the held-out process (first matching row).
        test_category = process_category[
            process_category["process_ids"] == leave_one_out_process_id
        ]["process_category"].iloc[0]
        test_category_process_ids = process_category[
            process_category["process_category"] == test_category
        ]["process_ids"]
        # Same category, but never the held-out process itself.
        train = df[
            process_ids.isin(test_category_process_ids)
            & (process_ids != leave_one_out_process_id)
        ]
        s, c, c2 = fit_predict(
            train,
            test,
            df_dec,
            n_bireducts,
            bireduct_initial_sample_n_attrs,
            bireduct_max_n_attrs,
        )
        score_parts.append(s)
        count_parts.append(c)
        count2_parts.append(c2)
    return _combine(score_parts), _combine(count_parts), _combine(count2_parts)

In [None]:
# Run the same-category experiment and store scores/counts/counts2 as a
# ';'-separated CSV in TMP_DIR. index=False drops the row labels, so the saved
# rows are identifiable only by their position (the series are sorted by index).
z = do_leave_one_process_out_same_category(
    df,
    df_dec,
    process_ids,
    process_category,
    n_bireducts=N_BIREDUCTS,
    bireduct_initial_sample_n_attrs=BIREDUCT_INITIAL_SAMPLE_N_ATTRS,
    bireduct_max_n_attrs=BIREDUCT_MAX_N_ATTRS,
)
pd.DataFrame({"scores": z[0], "counts": z[1], "counts2": z[2]}).to_csv(
    TMP_DIR
    / FILENAME_2_OUT_TEMPLATE.format(
        n_bireducts=N_BIREDUCTS,
        initial_sample_n_attrs=BIREDUCT_INITIAL_SAMPLE_N_ATTRS,
        max_n_attrs=BIREDUCT_MAX_N_ATTRS,
    ),
    sep=";",
    index=False,
)

In [6]:
# Run leave-one-process-out with training restricted to the
# K_NEIGHBORS_PROCESSES nearest processes and store the result as a
# ';'-separated CSV in TMP_DIR. index=False drops the row labels, so the saved
# rows are identifiable only by their position (the series are sorted by index).
x = do_leave_one_process_out_k(
    df,
    df_dec,
    process_ids,
    k=K_NEIGHBORS_PROCESSES,
    n_bireducts=N_BIREDUCTS,
    bireduct_initial_sample_n_attrs=BIREDUCT_INITIAL_SAMPLE_N_ATTRS,
    bireduct_max_n_attrs=BIREDUCT_MAX_N_ATTRS,
)
pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
    TMP_DIR
    / FILENAME_OUT_TEMPLATE.format(
        k=K_NEIGHBORS_PROCESSES,
        n_bireducts=N_BIREDUCTS,
        initial_sample_n_attrs=BIREDUCT_INITIAL_SAMPLE_N_ATTRS,
        max_n_attrs=BIREDUCT_MAX_N_ATTRS,
    ),
    sep=";",
    index=False,
)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [7]:
# Run leave-one-process-out without neighbour filtering (k=None: train on all
# other processes) and store the result as a ';'-separated CSV in TMP_DIR.
# The file name then contains the literal "k_None".
y = do_leave_one_process_out_k(
    df,
    df_dec,
    process_ids,
    k=None,
    n_bireducts=N_BIREDUCTS,
    bireduct_initial_sample_n_attrs=BIREDUCT_INITIAL_SAMPLE_N_ATTRS,
    bireduct_max_n_attrs=BIREDUCT_MAX_N_ATTRS,
)
pd.DataFrame({"scores": y[0], "counts": y[1], "counts2": y[2]}).to_csv(
    TMP_DIR
    / FILENAME_OUT_TEMPLATE.format(
        k=None,
        n_bireducts=N_BIREDUCTS,
        initial_sample_n_attrs=BIREDUCT_INITIAL_SAMPLE_N_ATTRS,
        max_n_attrs=BIREDUCT_MAX_N_ATTRS,
    ),
    sep=";",
    index=False,
)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [None]:
# Result file to evaluate. Previously inspected alternatives:
#   ./tmp/leave_one_process_out_k_10_m_1000_n_20.csv
#   ./tmp/leave_one_process_out_k_10_nb_100_bna_10_draw_objects_mk2.csv
#   ./tmp/leave_one_process_out_k_40_bireducts_1000_sample_attrs_100_max_attrs_10.csv
x = pd.read_csv(
    "./tmp/leave_one_process_out_same_category_bireducts_1_sample_attrs_100_max_attrs_100.csv",
    sep=";",
)

# Ranking score = first column / second column (scores / counts in the export
# cells of this notebook); rows where the ratio is NaN are scored 0.
# Rows are matched to df_dec by position.
vote_share = x.iloc[:, 0] / x.iloc[:, 1]
fpr, tpr, _ = roc_curve(df_dec, np.nan_to_num(vote_share), pos_label=1)

lw = 2
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % auc(fpr, tpr),
)
# Diagonal = chance level.
ax.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver operating characteristic")
ax.legend(loc="lower right")
plt.show()

In [None]:
# Area under the ROC curve computed from the most recent fpr/tpr arrays.
auc(fpr, tpr)

In [None]:
# recall_score expects true labels and predicted labels, not ROC coordinates.
# Predicted label: 1 when at least half of the voting bireducts voted 1
# (assumes binary 0/1 target codes - TODO confirm); rows without any vote
# (0/0 -> NaN -> 0) are predicted as 0. Rows are matched to df_dec by position.
vote_share = np.nan_to_num(x.iloc[:, 0] / x.iloc[:, 1])
recall_score(df_dec, (vote_share >= 0.5).astype(int), pos_label=1)

In [None]:
# Result file to evaluate. Previously inspected alternatives:
#   ./tmp/leave_one_process_out_k_10_m_1000_n_20.csv
#   ./tmp/leave_one_process_out_k_10_nb_100_bna_10_draw_objects_mk2.csv
x = pd.read_csv(
    "./tmp/leave_one_process_out_k_None_bireducts_1000_sample_attrs_100_max_attrs_10.csv",
    sep=";",
)

# Ranking score = first column / second column (scores / counts in the export
# cells of this notebook); rows where the ratio is NaN are scored 0.
# Rows are matched to df_dec by position.
vote_share = x.iloc[:, 0] / x.iloc[:, 1]
fpr, tpr, _ = roc_curve(df_dec, np.nan_to_num(vote_share), pos_label=1)

lw = 2
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % auc(fpr, tpr),
)
# Diagonal = chance level.
ax.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver operating characteristic")
ax.legend(loc="lower right")
plt.show()

In [None]:
# Area under the ROC curve computed from the most recent fpr/tpr arrays.
auc(fpr, tpr)