In [1]:
import logging
import pathlib
import pickle
import warnings

import attr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
import xgboost as xgb
from joblib import Parallel, delayed
from sklearn import tree
from sklearn.metrics import auc, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.neighbors import NearestNeighbors

from skrough.base import Bireduct
from skrough.bireducts.dynamically_adapted_approximate_bireduct import (
    DynamicallyAdaptedApproximateBireduct,
)

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

import config

In [2]:
DATA_DIR = pathlib.Path(config.DATA_DIR)
TMP_DIR = pathlib.Path(config.TMP_DIR)
SEP = ","
DISCRETIZED_SEP = ";"
N_JOBS = 7

# DISCRETIZED_FILEPATH_IN = DATA_DIR / 'toolbox_tabular_data_annonymized_discretized_uniform.csv'
DISCRETIZED_FILEPATH_IN = (
    DATA_DIR / "toolbox_tabular_data_annonymized_discretized_quantile.csv"
)
# DISCRETIZED_FILEPATH_IN = DATA_DIR / 'toolbox_tabular_data_annonymized_discretized_kmeans.csv'
FILEPATH_IN = DATA_DIR / "toolbox_tabular_data_annonymized.csv"

EMBEDDINGS_FILEPATH_IN = DATA_DIR / "process_embeddings_d50.csv"

FEATURE_GROUPS_FILEPATH_IN = DATA_DIR / "feature_groups.csv"

XGBOOST_EXPERIMENT_DIR = "paper_xgboost_3_"
XGBOOST_FILENAME_OUT_TEMPLATE = (
    f"{XGBOOST_EXPERIMENT_DIR}/"
    "xgboost_native"
    "_dataset_{dataset}"
    "_leave_one_process_out_k_{k}"
    "_num_boost_round_{num_boost_round}"
    "_learning_rate_{learning_rate}"
    "_max_depth_{max_depth}"
    "_base_score_{base_score}"
    ".csv"
)

XGBOOST_FILENAME_OUT_TEMPLATE_SAME_CATEGORY = (
    f"{XGBOOST_EXPERIMENT_DIR}/"
    "xgboost_native"
    "_dataset_{dataset}"
    "_leave_one_process_out_same_category"
    "_num_boost_round_{num_boost_round}"
    "_learning_rate_{learning_rate}"
    "_max_depth_{max_depth}"
    "_base_score_{base_score}"
    ".csv"
)
XGBOOST_WO_FEATURES_FILENAME_OUT_TEMPLATE = (
    f"{XGBOOST_EXPERIMENT_DIR}/"
    "xgboost_native"
    "_dataset_{dataset}"
    "_leave_one_process_out_k_{k}"
    "_num_boost_round_{num_boost_round}"
    "_learning_rate_{learning_rate}"
    "_max_depth_{max_depth}"
    "_base_score_{base_score}"
    "_without_feature_group_{group_name}"
    ".csv"
)

TREE_EXPERIMENT_DIR = "paper_tree_3_"
TREE_FILENAME_OUT_TEMPLATE = (
    f"{TREE_EXPERIMENT_DIR}/"
    "decision_tree"
    "_dataset_{dataset}"
    "_leave_one_process_out_k_{k}"
    "_min_impurity_decrease_{min_impurity_decrease}"
    "_max_depth_{max_depth}"
    ".csv"
)
TREE_FILENAME_OUT_TEMPLATE_SAME_CATEGORY = (
    f"{TREE_EXPERIMENT_DIR}/"
    "decision_tree"
    "_dataset_{dataset}"
    "_leave_one_process_out_same_category"
    "_min_impurity_decrease_{min_impurity_decrease}"
    "_max_depth_{max_depth}"
    ".csv"
)

TREE_WO_FEATURES_FILENAME_OUT_TEMPLATE = (
    f"{TREE_EXPERIMENT_DIR}/"
    "decision_tree"
    "_dataset_{dataset}"
    "_leave_one_process_out_k_{k}"
    "_min_impurity_decrease_{min_impurity_decrease}"
    "_max_depth_{max_depth}"
    "_without_feature_group_{group_name}"
    ".csv"
)


BIREDUCTS_EXPERIMENT_DIR = "paper_bireducts_3_"
BIREDUCTS_FILENAME_OUT_TEMPLATE = (
    f"{BIREDUCTS_EXPERIMENT_DIR}/"
    "bireducts"
    "_dataset_{dataset}"
    "_leave_one_process_out_k_{k}"
    "_n_bireducts_{n_bireducts}"
    "_candidate_n_attrs_{candidate_n_attrs}"
    "_allowed_randomness_{allowed_randomness}"
    "_max_n_attrs_{max_n_attrs}"
    "_iteration_{iteration}"
    ".csv"
)
BIREDUCTS_FILENAME_OUT_TEMPLATE_SAME_CATEGORY = (
    f"{BIREDUCTS_EXPERIMENT_DIR}/"
    "bireducts"
    "_dataset_{dataset}"
    "_leave_one_process_out_same_category"
    "_n_bireducts_{n_bireducts}"
    "_candidate_n_attrs_{candidate_n_attrs}"
    "_allowed_randomness_{allowed_randomness}"
    "_max_n_attrs_{max_n_attrs}"
    "_iteration_{iteration}"
    ".csv"
)
BIREDUCTS_WO_FEATURES_FILENAME_OUT_TEMPLATE = (
    f"{BIREDUCTS_EXPERIMENT_DIR}/"
    "bireducts"
    "_dataset_{dataset}"
    "_leave_one_process_out_k_{k}"
    "_n_bireducts_{n_bireducts}"
    "_candidate_n_attrs_{candidate_n_attrs}"
    "_allowed_randomness_{allowed_randomness}"
    "_max_n_attrs_{max_n_attrs}"
    "_iteration_{iteration}"
    "_without_feature_group_{group_name}"
    ".csv"
)

In [3]:
logging.basicConfig(filename=TMP_DIR / "b.log", level=logging.INFO)

In [4]:
embeddings = pd.read_csv(EMBEDDINGS_FILEPATH_IN)
process_category = embeddings[["process_ids", "process_category"]]
embeddings.drop(["process_category"], axis=1, inplace=True)
embeddings.set_index("process_ids", inplace=True)
emb_nbrs = NearestNeighbors().fit(embeddings)

In [5]:
feature_groups = pd.read_csv(FEATURE_GROUPS_FILEPATH_IN)

group_names = [
    "employment history",
    "skills",
    "education",
    "place of residence",
    "current status",
    "job offer",
    "person-offer relation",
]

In [4]:
# df = pd.read_csv(FILEPATH_IN, sep=SEP)
# process_ids = df.pop('process_ids')
# df_dec = df.pop('target').astype('category').cat.codes
# df_dec = 1 - df_dec

# discretized_df = pd.read_csv(DISCRETIZED_FILEPATH_IN, sep=DISCRETIZED_SEP)
# discretized_process_ids = discretized_df.pop('process_ids')
# discretized_df = discretized_df.astype('category')
# discretized_df = discretized_df.apply(lambda x: x.cat.codes)
# discretized_df_dec = discretized_df.pop('target')
# discretized_df_dec = 1 - discretized_df_dec

# assert (process_ids == discretized_process_ids).all()
# assert (df_dec == discretized_df_dec).all()

In [6]:
def refine_train(train, test, process_ids, embeddings, emb_nbrs, k):
    # search for k+1 neighbors because the one we search for is obviously the best match (thus +1)
    test_process_id = process_ids.loc[test.index[0]]
    nbrs = emb_nbrs.kneighbors(
        [embeddings.loc[test_process_id]], n_neighbors=k + 1, return_distance=False
    )[0]
    return train[process_ids.loc[train.index].isin(embeddings.index[nbrs])]


def fit_predict_xgboost_native(train, test, df_dec, **fit_predict_params):
    params = dict(
        fit_predict_params, objective="binary:logistic", eval_metric="logloss"
    )
    num_boost_round = params.pop("num_boost_round")
    dtrain = xgb.DMatrix(train.values, label=df_dec.loc[train.index])
    dtest = xgb.DMatrix(test.values)
    cl = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    scores = cl.predict(dtest)
    return (
        pd.Series(scores, index=test.index),
        pd.Series(1, index=test.index),
        pd.Series(1, index=test.index),
    )


def fit_predict_decision_tree(train, test, df_dec, **fit_predict_params):
    cl = tree.DecisionTreeClassifier(**fit_predict_params)
    cl = cl.fit(train.values, df_dec.loc[train.index])
    scores = cl.predict_proba(test)[:, 1]
    return (
        pd.Series(scores, index=test.index),
        pd.Series(1, index=test.index),
        pd.Series(1, index=test.index),
    )


def get_bireducts(
    train,
    df_dec,
    n_bireducts,
    n_of_probes,
    allowed_randomness,
    candidate_n_attrs,
    max_n_attrs,
):
    ghr = DynamicallyAdaptedApproximateBireduct(
        n_of_probes=n_of_probes,
        allowed_randomness=allowed_randomness,
        candidate_n_attrs=candidate_n_attrs,
        max_n_attrs=max_n_attrs,
    )
    ghr.fit(train, df_dec.loc[train.index], check_data_consistency=False)
    bireducts = Parallel(n_jobs=N_JOBS)(
        delayed(ghr.get_bireduct)() for i in range(n_bireducts)
    )
    return bireducts


def predict(train, test, bireduct, df_dec):
    objs = bireduct.objects
    cols = bireduct.attributes
    bireduct_train = train.iloc[objs, cols]
    bireduct_train = bireduct_train.drop_duplicates()
    nn = NearestNeighbors().fit(bireduct_train)
    dist, neighbors = nn.kneighbors(
        test.iloc[:, cols], n_neighbors=1, return_distance=True
    )
    result = [
        df_dec.loc[bireduct_train.index[n[0]]] if d[0] == 0 else np.nan
        for d, n in zip(dist, neighbors)
    ]
    return result


def fit_predict_bireducts(train, test, df_dec, **fit_predict_params):
    params = dict(fit_predict_params, n_of_probes=100)
    bireducts = get_bireducts(train, df_dec, **params)
    scores = np.zeros(len(test))
    counts = np.zeros(len(test))
    counts2 = np.zeros(len(test))
    for bireduct in bireducts:
        for i, dec in enumerate(predict(train, test, bireduct, df_dec)):
            if not np.isnan(dec):
                scores[i] += int(dec)
                counts[i] += 1
                counts2[i] += len(bireduct.objects) / len(train)
    return (
        pd.Series(scores, index=test.index),
        pd.Series(counts, index=test.index),
        pd.Series(counts2, index=test.index),
    )


def do_leave_one_process_out_k(
    fit_predict_fun, df, df_dec, process_ids, k, **fit_predict_params
):
    logging.info({"filename": FILEPATH_IN})
    logging.info({"k": k})
    logging.info(fit_predict_params)
    scores = pd.Series(dtype="float")
    counts = pd.Series(dtype="float")
    counts2 = pd.Series(dtype="float")
    i = 0
    for train_index, test_index in tqdm.tqdm(
        LeaveOneGroupOut().split(df, df_dec, groups=process_ids)
    ):
        i += 1
        logging.info(str(i))
        train = df.iloc[train_index]
        test = df.iloc[test_index]
        if k is not None:
            train = refine_train(train, test, process_ids, embeddings, emb_nbrs, k)
        s, c, c2 = fit_predict_fun(train, test, df_dec, **fit_predict_params)
        scores = scores.append(s)
        counts = counts.append(c)
        counts2 = counts2.append(c2)
    return scores.sort_index(), counts.sort_index(), counts2.sort_index()


def do_leave_one_process_out_same_category(
    fit_predict_fun, df, df_dec, process_ids, process_category, **fit_predict_params
):
    logging.info("same category")
    logging.info(fit_predict_params)
    scores = pd.Series(dtype="float")
    counts = pd.Series(dtype="float")
    counts2 = pd.Series(dtype="float")
    i = 0
    for leave_one_out_process_id in tqdm.tqdm(process_category["process_ids"].unique()):
        i += 1
        logging.info(str(i))
        test = df[process_ids == leave_one_out_process_id]
        test_category = process_category[
            process_category["process_ids"] == leave_one_out_process_id
        ]["process_category"].iloc[0]
        test_category_process_ids = process_category[
            process_category["process_category"] == test_category
        ]["process_ids"]
        train = df[
            process_ids.isin(test_category_process_ids)
            & (process_ids != leave_one_out_process_id)
        ]
        s, c, c2 = fit_predict_fun(train, test, df_dec, **fit_predict_params)
        scores = scores.append(s)
        counts = counts.append(c)
        counts2 = counts2.append(c2)
    return scores.sort_index(), counts.sort_index(), counts2.sort_index()

# XGBoost

In [7]:
data_filepath = FILEPATH_IN
df = pd.read_csv(data_filepath, sep=SEP)
process_ids = df.pop("process_ids")
df_dec = df.pop("target").astype("category").cat.codes
df_dec = 1 - df_dec

In [7]:
# k_values = list(range(10, 202, 10)) + [None]
# learning_rate_values = [0.001]
# max_depth_values = [3]
# num_boost_round_values = [1000]
# more_params_values = [{'base_score': 0.0696}]

k_values = [40]
learning_rate_values = [0.1, 0.01, 0.001]
max_depth_values = [2, 3, 4, 5, 10]
num_boost_round_values = [1000]
more_params_values = [{"base_score": 0.0696}]

for more_params in more_params_values:
    for k in k_values:
        for num_boost_round in num_boost_round_values:
            for learning_rate in learning_rate_values:
                for max_depth in max_depth_values:
                    x = do_leave_one_process_out_k(
                        fit_predict_xgboost_native,
                        df,
                        df_dec,
                        process_ids,
                        k=k,
                        num_boost_round=num_boost_round,
                        learning_rate=learning_rate,
                        max_depth=max_depth,
                        **more_params,
                    )
                    pd.DataFrame(
                        {"scores": x[0], "counts": x[1], "counts2": x[2]}
                    ).to_csv(
                        TMP_DIR
                        / XGBOOST_FILENAME_OUT_TEMPLATE.format(
                            dataset=pathlib.Path(data_filepath).stem,
                            k=k,
                            num_boost_round=num_boost_round,
                            learning_rate=learning_rate,
                            max_depth=max_depth,
                            **more_params,
                        ),
                        sep=";",
                        index=False,
                    )

18it [08:33, 28.52s/it]


KeyboardInterrupt: 

In [None]:
learning_rate = 0.001
max_depth = 3
num_boost_round = 1000
more_params = {"base_score": 0.0696}

x = do_leave_one_process_out_same_category(
    fit_predict_xgboost_native,
    df,
    df_dec,
    process_ids,
    process_category,
    num_boost_round=num_boost_round,
    learning_rate=learning_rate,
    max_depth=max_depth,
    **more_params,
)
pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
    TMP_DIR
    / XGBOOST_FILENAME_OUT_TEMPLATE_SAME_CATEGORY.format(
        dataset=pathlib.Path(data_filepath).stem,
        num_boost_round=num_boost_round,
        learning_rate=learning_rate,
        max_depth=max_depth,
        **more_params,
    ),
    sep=";",
    index=False,
)

In [8]:
del data_filepath
del df
del df_dec
del process_ids

# XGBoost without features

In [9]:
data_filepath = FILEPATH_IN
df = pd.read_csv(data_filepath, sep=SEP)
process_ids = df.pop("process_ids")
df_dec = df.pop("target").astype("category").cat.codes
df_dec = 1 - df_dec

In [None]:
k = 40
learning_rate = 0.001
max_depth = 3
num_boost_round = 1000
more_params = {"base_score": 0.0696}

for group_name in group_names:
    logging.info(f"feature group {group_name}")
    columns_to_remove = feature_groups[feature_groups["group_name"] == group_name][
        "annonymized_name"
    ].to_list()
    df_without_features = df.drop(columns=columns_to_remove)
    x = do_leave_one_process_out_k(
        fit_predict_xgboost_native,
        df_without_features,
        df_dec,
        process_ids,
        k=k,
        num_boost_round=num_boost_round,
        learning_rate=learning_rate,
        max_depth=max_depth,
        **more_params,
    )
    pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
        TMP_DIR
        / XGBOOST_WO_FEATURES_FILENAME_OUT_TEMPLATE.format(
            dataset=pathlib.Path(data_filepath).stem,
            k=k,
            num_boost_round=num_boost_round,
            learning_rate=learning_rate,
            max_depth=max_depth,
            group_name=group_name.replace(" ", "_"),
            **more_params,
        ),
        sep=";",
        index=False,
    )

202it [33:44, 10.02s/it]
68it [03:44,  3.75s/it]

In [None]:
del data_filepath
del df
del df_dec
del process_ids

In [32]:
for group_name in group_names:
    columns_to_remove = feature_groups[feature_groups["group_name"] == group_name][
        "column_name"
    ].to_list()
    print(group_name)
    print(columns_to_remove)

employment history
['number_of_honors_and_awards', 'number_of_honors_and_awards_in_last_but_1_calendar_year', 'person_number_of_honors_and_awards_in_last_but_02_10', 'number_of_honors_and_awards_within_12_recent_months', 'number_of_honors_and_awards_within_24_recent_months', 'number_of_jobs', 'number_of_jobs_in_last_but_1_calendar_year', 'person_number_of_jobs_in_last_but_02_10', 'number_of_jobs_within_6_recent_months', 'number_of_jobs_within_12_recent_months', 'number_of_jobs_within_24_recent_months', 'person_jobs_country_count', 'person_jobs_city_count', 'jobs_start_timestamp', 'jobs_end_timestamp', 'shortest_employment_length_in_months', 'avg_employment_length_in_months', 'number_of_projects', 'number_of_projects_finished_in_last_but_1_calendar_year', 'person_number_of_projects_finished_in_last_but_02_10', 'number_of_projects_finished_within_1_recent_months', 'number_of_projects_finished_within_3_recent_months', 'number_of_projects_finished_within_6_recent_months', 'number_of_projec

# Tree

In [31]:
data_filepath = FILEPATH_IN
df = pd.read_csv(data_filepath, sep=SEP)
process_ids = df.pop("process_ids")
df_dec = df.pop("target").astype("category").cat.codes
df_dec = 1 - df_dec

In [None]:
k_values = list(range(10, 202, 10)) + [None]
min_impurity_decrease_values = [0.0, 0.001]
max_depth_values = [2, 3, 4, 5]
more_params = {}

# k_values = [40]
# max_depth_values = [2, 3, 4, 5]
# min_impurity_decrease_values = [0.0, 0.001, 0.01, 0.1]
# more_params = {}

for k in k_values:
    for min_impurity_decrease in min_impurity_decrease_values:
        for max_depth in max_depth_values:
            x = do_leave_one_process_out_k(
                fit_predict_decision_tree,
                df,
                df_dec,
                process_ids,
                k=k,
                min_impurity_decrease=min_impurity_decrease,
                max_depth=max_depth,
                **more_params,
            )
            pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
                TMP_DIR
                / TREE_FILENAME_OUT_TEMPLATE.format(
                    dataset=pathlib.Path(data_filepath).stem,
                    k=k,
                    min_impurity_decrease=min_impurity_decrease,
                    max_depth=max_depth,
                    **more_params,
                ),
                sep=";",
                index=False,
            )

202it [00:20,  9.72it/s]
202it [00:21,  9.43it/s]
202it [00:22,  9.11it/s]
202it [00:22,  8.82it/s]
202it [00:20,  9.76it/s]
202it [00:21,  9.39it/s]
179it [00:19,  6.90it/s]

In [None]:
min_impurity_decrease = 0.0
max_depth = 5
more_params = {}

x = do_leave_one_process_out_same_category(
    fit_predict_decision_tree,
    df,
    df_dec,
    process_ids,
    process_category,
    min_impurity_decrease=min_impurity_decrease,
    max_depth=max_depth,
    **more_params,
)
pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
    TMP_DIR
    / TREE_FILENAME_OUT_TEMPLATE_SAME_CATEGORY.format(
        dataset=pathlib.Path(data_filepath).stem,
        min_impurity_decrease=min_impurity_decrease,
        max_depth=max_depth,
        **more_params,
    ),
    sep=";",
    index=False,
)

In [None]:
del data_filepath
del df
del df_dec
del process_ids

# Tree without features

In [None]:
data_filepath = FILEPATH_IN
df = pd.read_csv(data_filepath, sep=SEP)
process_ids = df.pop("process_ids")
df_dec = df.pop("target").astype("category").cat.codes
df_dec = 1 - df_dec

In [None]:
k = 40
min_impurity_decrease = 0.0
max_depth = 5
more_params = {}

for group_name in group_names:
    logging.info(f"feature group {group_name}")
    columns_to_remove = feature_groups[feature_groups["group_name"] == group_name][
        "annonymized_name"
    ].to_list()
    df_without_features = df.drop(columns=columns_to_remove)
    x = do_leave_one_process_out_k(
        fit_predict_decision_tree,
        df_without_features,
        df_dec,
        process_ids,
        k=k,
        min_impurity_decrease=min_impurity_decrease,
        max_depth=max_depth,
        **more_params,
    )
    pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
        TMP_DIR
        / TREE_WO_FEATURES_FILENAME_OUT_TEMPLATE.format(
            dataset=pathlib.Path(data_filepath).stem,
            k=k,
            min_impurity_decrease=min_impurity_decrease,
            max_depth=max_depth,
            group_name=group_name.replace(" ", "_"),
            **more_params,
        ),
        sep=";",
        index=False,
    )

In [None]:
del data_filepath
del df
del df_dec
del process_ids

# Bireducts

In [None]:
data_filepath = DISCRETIZED_FILEPATH_IN
df = pd.read_csv(data_filepath, sep=DISCRETIZED_SEP)
process_ids = df.pop("process_ids")
df = df.astype("category")
df = df.apply(lambda x: x.cat.codes)
df_dec = df.pop("target")
df_dec = 1 - df_dec

In [None]:
# k_values = [20, 50, 80, 110, 140, 170, 200]
# k_values = [10, 30, 60, 70, 90, 100, 120, 130, 150, 160, 180, 190, None]
k_values = list(range(10, 202, 10)) + [None]
n_bireducts_values = [1000]
candidate_n_attrs_values = [100]
allowed_randomness_values = [0.05]
max_n_attrs_values = [None]
# max_n_attrs_values = [3]
more_params = {}

for iteration in range(1):
    #     if iteration < 1:
    #         continue
    for k in k_values:
        #         if k < 70:
        #             continue
        for n_bireducts in n_bireducts_values:
            for candidate_n_attrs in candidate_n_attrs_values:
                for allowed_randomness in allowed_randomness_values:
                    for max_n_attrs in max_n_attrs_values:
                        x = do_leave_one_process_out_k(
                            fit_predict_bireducts,
                            df,
                            df_dec,
                            process_ids,
                            k=k,
                            n_bireducts=n_bireducts,
                            candidate_n_attrs=candidate_n_attrs,
                            allowed_randomness=allowed_randomness,
                            max_n_attrs=max_n_attrs,
                            **more_params,
                        )
                        pd.DataFrame(
                            {"scores": x[0], "counts": x[1], "counts2": x[2]}
                        ).to_csv(
                            TMP_DIR
                            / BIREDUCTS_FILENAME_OUT_TEMPLATE.format(
                                dataset=pathlib.Path(data_filepath).stem,
                                k=k,
                                n_bireducts=n_bireducts,
                                candidate_n_attrs=candidate_n_attrs,
                                allowed_randomness=allowed_randomness,
                                max_n_attrs=max_n_attrs,
                                iteration=iteration,
                                **more_params,
                            ),
                            sep=";",
                            index=False,
                        )

In [None]:
n_bireducts = 1000
candidate_n_attrs = 100
allowed_randomness = 0.05
max_n_attrs = 3
iteration = 1
x = do_leave_one_process_out_same_category(
    fit_predict_bireducts,
    df,
    df_dec,
    process_ids,
    process_category,
    n_bireducts=n_bireducts,
    candidate_n_attrs=candidate_n_attrs,
    allowed_randomness=allowed_randomness,
    max_n_attrs=max_n_attrs,
    **more_params,
)
pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
    TMP_DIR
    / BIREDUCTS_FILENAME_OUT_TEMPLATE_SAME_CATEGORY.format(
        dataset=pathlib.Path(data_filepath).stem,
        n_bireducts=n_bireducts,
        candidate_n_attrs=candidate_n_attrs,
        allowed_randomness=allowed_randomness,
        max_n_attrs=max_n_attrs,
        iteration=iteration,
        **more_params,
    ),
    sep=";",
    index=False,
)

In [None]:
del data_filepath
del df
del df_dec
del process_ids

# Bireducts without features

In [29]:
data_filepath = DISCRETIZED_FILEPATH_IN
df = pd.read_csv(data_filepath, sep=DISCRETIZED_SEP)
process_ids = df.pop("process_ids")
df = df.astype("category")
df = df.apply(lambda x: x.cat.codes)
df_dec = df.pop("target")
df_dec = 1 - df_dec

In [None]:
k = 40
n_bireducts = 1000
candidate_n_attrs = 100
allowed_randomness = 0.05
max_n_attrs = None
iteration = 1
more_params = {}

for group_name in group_names:
    logging.info(f"feature group {group_name}")
    columns_to_remove = feature_groups[feature_groups["group_name"] == group_name][
        "annonymized_name"
    ].to_list()
    df_without_features = df.drop(columns=columns_to_remove)
    x = do_leave_one_process_out_k(
        fit_predict_bireducts,
        df_without_features,
        df_dec,
        process_ids,
        k=k,
        n_bireducts=n_bireducts,
        candidate_n_attrs=candidate_n_attrs,
        allowed_randomness=allowed_randomness,
        max_n_attrs=max_n_attrs,
        **more_params,
    )
    pd.DataFrame({"scores": x[0], "counts": x[1], "counts2": x[2]}).to_csv(
        TMP_DIR
        / BIREDUCTS_WO_FEATURES_FILENAME_OUT_TEMPLATE.format(
            dataset=pathlib.Path(data_filepath).stem,
            k=k,
            n_bireducts=n_bireducts,
            candidate_n_attrs=candidate_n_attrs,
            allowed_randomness=allowed_randomness,
            max_n_attrs=max_n_attrs,
            iteration=iteration,
            group_name=group_name.replace(" ", "_"),
            **more_params,
        ),
        sep=";",
        index=False,
    )

8it [06:05, 44.05s/it]

In [28]:
del data_filepath
del df
del df_dec
del process_ids

NameError: name 'data_filepath' is not defined