# Imports

In [43]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, fbeta_score

from skopt.space import Integer, Categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

from skopt import BayesSearchCV

from typing import Dict, List, Literal
from skopt.space import Real, Integer, Categorical

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")

## Read dataset

In [44]:
data = pd.read_csv("../data/processed/20. FINAL_mean_delta_multi_output.csv")
print(data.shape)
data.head()

(6091, 210)


Unnamed: 0,Hba1c,Hba1c Time,Hba1c FM,Hba1c FM Time,BMI,BMI Time,Cancer,Cancer Time,Carotid Disease,Carotid Disease Time,...,y_stk_or_aemb_3_months,y_stk_or_aemb_6_months,y_stk_or_aemb_12_months,y_stk_or_aemb_24_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,-501,6.26,-501,20.7,-19,0,10000,0,10000,...,0,0,0,0,0,0,1,0,0,0
1,6.26,-501,6.26,-501,26.7,-780,1,-4846,0,10000,...,0,0,0,0,0,0,1,0,0,0
2,5.8,-287,6.3,-2701,31.1,-35,0,10000,0,10000,...,0,0,0,0,0,0,1,1,1,1
3,6.26,-501,6.26,-501,21.3,-207,0,10000,0,10000,...,0,0,0,0,0,1,1,0,0,1
4,5.9,-162,5.4,-5209,37.8,-554,1,-86,0,10000,...,0,0,0,0,0,1,1,1,1,0


## Drop columns

In [45]:


cols_to_drop = [col for col in data.columns if col.endswith('_t')]
data = data.drop(columns=cols_to_drop)

targets = [col for col in data.columns if col.startswith('y_')]
print(targets)

cols_to_drop = [col for col in data.columns if col.endswith('Time')]
data = data.drop(columns=cols_to_drop)

cols_to_drop = [col for col in data.columns if col.startswith('Δ')]
data = data.drop(columns=cols_to_drop)

cols_to_drop = [col for col in data.columns if "FM" in col]
data = data.drop(columns=cols_to_drop)

print(data.shape)
data.head()

['y_acs', 'y_aemb', 'y_cvdeath', 'y_death', 'y_hf', 'y_inp', 'y_stk', 'y_acs_1_month', 'y_acs_3_months', 'y_acs_6_months', 'y_acs_12_months', 'y_acs_24_months', 'y_aemb_1_month', 'y_aemb_3_months', 'y_aemb_6_months', 'y_aemb_12_months', 'y_aemb_24_months', 'y_cvdeath_1_month', 'y_cvdeath_3_months', 'y_cvdeath_6_months', 'y_cvdeath_12_months', 'y_cvdeath_24_months', 'y_death_1_month', 'y_death_3_months', 'y_death_6_months', 'y_death_12_months', 'y_death_24_months', 'y_hf_1_month', 'y_hf_3_months', 'y_hf_6_months', 'y_hf_12_months', 'y_hf_24_months', 'y_inp_1_month', 'y_inp_3_months', 'y_inp_6_months', 'y_inp_12_months', 'y_inp_24_months', 'y_stk_1_month', 'y_stk_3_months', 'y_stk_6_months', 'y_stk_12_months', 'y_stk_24_months', 'y_stk_or_aemb_1_month', 'y_stk_or_aemb_3_months', 'y_stk_or_aemb_6_months', 'y_stk_or_aemb_12_months', 'y_stk_or_aemb_24_months', 'y_stk_or_aemb']
(6091, 114)


Unnamed: 0,Hba1c,BMI,Cancer,Carotid Disease,Coronary Disease,COPD,Creatinine,DBP,Dyslipidemia,eGFR,...,y_stk_or_aemb_3_months,y_stk_or_aemb_6_months,y_stk_or_aemb_12_months,y_stk_or_aemb_24_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,20.7,0,0,0,0,0.99,87.0,1,64.15,...,0,0,0,0,0,0,1,0,0,0
1,6.26,26.7,1,0,0,0,0.53,81.0,0,88.5,...,0,0,0,0,0,0,1,0,0,0
2,5.8,31.1,0,0,0,0,0.88,109.0,1,46.8,...,0,0,0,0,0,0,1,1,1,1
3,6.26,21.3,0,0,0,0,1.56,63.0,1,9.0,...,0,0,0,0,0,1,1,0,0,1
4,5.9,37.8,1,0,1,0,0.64,98.0,1,76.6,...,0,0,0,0,0,1,1,1,1,0


In [46]:
data.columns

Index(['Hba1c', 'BMI', 'Cancer', 'Carotid Disease', 'Coronary Disease', 'COPD',
       'Creatinine', 'DBP', 'Dyslipidemia', 'eGFR',
       ...
       'y_stk_or_aemb_3_months', 'y_stk_or_aemb_6_months',
       'y_stk_or_aemb_12_months', 'y_stk_or_aemb_24_months', 'y_stk_or_aemb',
       'History of Vascular Disease', 'Antihypertensive Medication',
       'Diabetes Mellitus', 'Diabetes Medication', 'Abnormal Kidney Function'],
      dtype='object', length=114)

## Models settings

In [47]:
# Define 5-fold stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create F2 scorer
f2_scorer = make_scorer(fbeta_score, beta=2)

# ===================== Naive Bayes =====================
search_space_nb = {
    "var_smoothing": (1e-12, 1e-1, "log-uniform")
}

nb_opt = BayesSearchCV(
    estimator=GaussianNB(),
    search_spaces=search_space_nb,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Logistic Regression =====================
search_space_lr = {
    'C': (1e-4, 1e+3, 'log-uniform'),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr_opt = BayesSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    search_spaces=search_space_lr,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Decision Tree =====================
search_space_dt = {
    "max_depth": Integer(3, 20),
    "min_samples_split": Integer(2, 20),
    "min_samples_leaf": Integer(1, 10),
    "criterion": Categorical(["gini", "entropy"])
}

dt_opt = BayesSearchCV(
    DecisionTreeClassifier(random_state=42),
    search_spaces=search_space_dt,
    n_iter=30,
    scoring=f2_scorer,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# ===================== Random Forest =====================
search_space_rf = {
    'n_estimators': (50, 300),
    'max_depth': (1, 50),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf_opt = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    search_spaces=search_space_rf,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== XGBoost =====================
search_space_xgb = {
    'n_estimators': (10, 500),
    'max_depth': (1, 50),
    'learning_rate': (0.001, 0.1, 'uniform'),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0, 5),
    'min_child_weight': (1, 20)
}

xgb_opt = BayesSearchCV(
    XGBClassifier(random_state=42),
    search_spaces=search_space_xgb,
    scoring=f2_scorer,
    n_iter=30,
    n_jobs=-1,
    random_state=42
)

# ===================== MLP ===============================

mlp_opt = MLPClassifier(max_iter=2000, solver="adam", activation="relu", hidden_layer_sizes=(200,100), random_state=42)

# ===================== Optimizer Map =====================
optimizer_map = {
    "nb": nb_opt,
    "lr": lr_opt,
    "dt": dt_opt,
    "rf": rf_opt,
    "xgb": xgb_opt,
    "mlp": mlp_opt
}


## Define function

In [48]:
SamplingName = Literal["baseline", "undersample", "oversample", "smote", "all"]
ModelName = Literal["nb", "lr", "xgb", "dt", "rf", "mlp"]


def _make_resampler(name: SamplingName, random_state: int, y_train: pd.Series = None):
    if name == "baseline":
        return None
    if name == "undersample":
        if y_train is None:
            raise ValueError("y_train must be provided for undersampling strategy")
        n_minority = y_train.sum()
        n_required = max(int(0.1 * len(y_train)), n_minority * 2)
        n_majority = n_required - n_minority
        n_majority = min(n_majority, (y_train == 0).sum())
        sampling_strategy = {0: n_majority, 1: n_minority}
        return RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=random_state)
    if name == "oversample":
        return RandomOverSampler(random_state=random_state)
    if name == "smote":
        return SMOTE(random_state=random_state)
    raise ValueError(f"Unknown sampling technique: {name}")


def run_cv_with_sampling(
    X_full: pd.DataFrame,
    target_cols: List[str],
    target_name: str,
    optimizer_map: Dict[ModelName, object],
    model_name: ModelName = "nb",
    sampling: SamplingName = "all",
    n_splits: int = 5,
    random_state: int = 42,
    xgb_04 = False,
) -> Dict[str, dict]:
    if target_name not in target_cols:
        raise ValueError("target_name must be inside target_cols")
    missing_targets = [c for c in target_cols if c not in X_full.columns]
    if missing_targets:
        raise ValueError(f"Target columns not in X_full: {missing_targets}")

    y = X_full[target_name].copy()
    X = X_full.drop(columns=target_cols).copy()

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    if sampling == "all":
        techniques = ["baseline", "undersample", "oversample", "smote"]
    else:
        techniques = [sampling]

    results: Dict[str, dict] = {}

    for tech in techniques:
        print(f"\n=== Technique: {tech.upper()} ===")

        fold_results: List[dict] = []
        best_params_per_fold: List[dict] = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if model_name in ["nb", "lr", "mlp"]:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_val = scaler.transform(X_val)

            resampler = _make_resampler(tech, random_state, y_train)
            if resampler is not None:
                X_train_rs, y_train_rs = resampler.fit_resample(X_train, y_train)
            else:
                X_train_rs, y_train_rs = X_train, y_train

            opt = optimizer_map[model_name]
            opt.fit(X_train_rs, y_train_rs)

            if model_name == "mlp":
                best_model = opt
                best_params_per_fold = {}
            else:
                best_model = opt.best_estimator_
                best_params_per_fold.append(getattr(opt, "best_params_", {}))

            y_pred = best_model.predict(X_val)

            if hasattr(best_model, "predict_proba"):
                y_pred_proba = best_model.predict_proba(X_val)[:, 1]
            else:
                scores = best_model.decision_function(X_val)
                smin, smax = scores.min(), scores.max()
                y_pred_proba = (scores - smin) / (smax - smin + 1e-9)

            if model_name == "xgb" and xgb_04:
                y_pred = (y_pred_proba >= 0.4).astype(int)

            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)
            f1 = f1_score(y_val, y_pred, zero_division=0)
            fbeta2 = fbeta_score(y_val, y_pred, beta=2, zero_division=0)
            roc = roc_auc_score(y_val, y_pred_proba)

            fold_metrics = {
                "fold": fold,
                "accuracy": accuracy_score(y_val, y_pred),
                "precision": prec,
                "sensitivity": rec,
                "f1_score": f1,
                "fbeta_2": fbeta2,
                "roc_auc": roc,
                "NNS": (1 / prec) if prec > 0 else np.inf,
                "best_params": getattr(opt, "best_params_", {}),
            }
            fold_results.append(fold_metrics)

            print({k: (round(v, 3) if isinstance(v, float) else v) for k, v in fold_metrics.items()})

        metrics = [
            "accuracy",
            "precision",
            "sensitivity",
            "f1_score",
            "fbeta_2",
            "roc_auc",
            "NNS",
        ]
        mean_std = {
            m: (np.mean([fr[m] for fr in fold_results]), np.std([fr[m] for fr in fold_results]))
            for m in metrics
        }

        print("\nMean scores across folds (", tech, "):")
        for m in metrics:
            mu, sd = mean_std[m]
            print(f"{m}: {mu:.3f} \u00B1 {sd:.3f}")

        summary_df = pd.DataFrame(fold_results)
        results[tech] = {
            "fold_results": fold_results,
            "summary_metrics": summary_df,
            "best_params_per_fold": best_params_per_fold,
        }

    return results

## y_acs

In [49]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===


{'fold': 1, 'accuracy': 0.823, 'precision': 0.041, 'sensitivity': 0.6, 'f1_score': 0.077, 'fbeta_2': 0.161, 'roc_auc': np.float64(0.775), 'NNS': 24.333, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 2, 'accuracy': 0.833, 'precision': 0.043, 'sensitivity': 0.643, 'f1_score': 0.081, 'fbeta_2': 0.171, 'roc_auc': np.float64(0.756), 'NNS': 23.0, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.859, 'precision': 0.041, 'sensitivity': 0.5, 'f1_score': 0.075, 'fbeta_2': 0.154, 'roc_auc': np.float64(0.76), 'NNS': 24.571, 'best_params': OrderedDict([('var_smoothing', 0.09912740922751373)])}
{'fold': 4, 'accuracy': 0.842, 'precision': 0.027, 'sensitivity': 0.357, 'f1_score': 0.05, 'fbeta_2': 0.102, 'roc_auc': np.float64(0.725), 'NNS': 37.6, 'best_params': OrderedDict([('var_smoothing', 0.0412947837564389)])}
{'fold': 5, 'accuracy': 0.868, 'precision': 0.026, 'sensitivity': 0.267, 'f1_score': 0.047, 'fbeta_2': 0.093, 'roc_auc': np.float64(0.668), 'NN

In [50]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.836, 'precision': 0.035, 'sensitivity': 0.467, 'f1_score': 0.065, 'fbeta_2': 0.135, 'roc_auc': np.float64(0.708), 'NNS': 28.429, 'best_params': OrderedDict([('C', 0.00022908307093759842), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.811, 'precision': 0.034, 'sensitivity': 0.571, 'f1_score': 0.065, 'fbeta_2': 0.139, 'roc_auc': np.float64(0.748), 'NNS': 29.0, 'best_params': OrderedDict([('C', 0.00046001210476106976), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.835, 'precision': 0.044, 'sensitivity': 0.643, 'f1_score': 0.082, 'fbeta_2': 0.172, 'roc_auc': np.float64(0.836), 'NNS': 22.778, 'best_params': OrderedDict([('C', 0.0005238499102541385), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.763, 'precision': 0.024, 'sensitivity': 0.5, 'f1_score': 0.046, 'fbeta_2': 0.101, 'roc_auc': np.float64(0.688), 'NNS': 41.286, 'best_params': OrderedDict([('C', 0.00

{'fold': 5, 'accuracy': 0.816, 'precision': 0.023, 'sensitivity': 0.333, 'f1_score': 0.043, 'fbeta_2': 0.09, 'roc_auc': np.float64(0.691), 'NNS': 43.8, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 'l2'), ('solver', 'liblinear')])}

Mean scores across folds ( undersample ):
accuracy: 0.812 ± 0.027
precision: 0.032 ± 0.008
sensitivity: 0.503 ± 0.104
f1_score: 0.060 ± 0.014
fbeta_2: 0.127 ± 0.029
roc_auc: 0.734 ± 0.055
NNS: 33.058 ± 8.083


In [51]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.751, 'precision': 0.026, 'sensitivity': 0.533, 'f1_score': 0.05, 'fbeta_2': 0.11, 'roc_auc': np.float64(0.678), 'NNS': 38.0, 'best_params': OrderedDict([('C', 0.07425534359037013), ('penalty', 'l2'), ('solver', 'liblinear')])}


{'fold': 2, 'accuracy': 0.705, 'precision': 0.025, 'sensitivity': 0.643, 'f1_score': 0.048, 'fbeta_2': 0.107, 'roc_auc': np.float64(0.701), 'NNS': 40.333, 'best_params': OrderedDict([('C', 0.0012107742518059712), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.718, 'precision': 0.029, 'sensitivity': 0.714, 'f1_score': 0.055, 'fbeta_2': 0.123, 'roc_auc': np.float64(0.729), 'NNS': 35.0, 'best_params': OrderedDict([('C', 0.0010465833079868225), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.779, 'precision': 0.03, 'sensitivity': 0.571, 'f1_score': 0.056, 'fbeta_2': 0.122, 'roc_auc': np.float64(0.724), 'NNS': 33.875, 'best_params': OrderedDict([('C', 2.1532192747313754), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 5, 'accuracy': 0.792, 'precision': 0.016, 'sensitivity': 0.267, 'f1_score': 0.031, 'fbeta_2': 0.065, 'roc_auc': np.float64(0.569), 'NNS': 61.5, 'best_params': OrderedDict([('C', 0.6365338060906855), ('penalty', 'l2'), ('

In [52]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.671, 'precision': 0.025, 'sensitivity': 0.667, 'f1_score': 0.048, 'fbeta_2': 0.107, 'roc_auc': np.float64(0.711), 'NNS': 40.6, 'best_params': OrderedDict([('C', 0.000739351416918972), ('penalty', 'l2'), ('solver', 'liblinear')])}


{'fold': 2, 'accuracy': 0.745, 'precision': 0.019, 'sensitivity': 0.429, 'f1_score': 0.037, 'fbeta_2': 0.082, 'roc_auc': np.float64(0.632), 'NNS': 51.333, 'best_params': OrderedDict([('C', 0.0036653968001029696), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.723, 'precision': 0.026, 'sensitivity': 0.643, 'f1_score': 0.051, 'fbeta_2': 0.113, 'roc_auc': np.float64(0.716), 'NNS': 37.889, 'best_params': OrderedDict([('C', 0.002070368918342941), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.718, 'precision': 0.026, 'sensitivity': 0.643, 'f1_score': 0.05, 'fbeta_2': 0.111, 'roc_auc': np.float64(0.76), 'NNS': 38.667, 'best_params': OrderedDict([('C', 0.001891244533164794), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 5, 'accuracy': 0.792, 'precision': 0.016, 'sensitivity': 0.267, 'f1_score': 0.031, 'fbeta_2': 0.065, 'roc_auc': np.float64(0.6), 'NNS': 61.5, 'best_params': OrderedDict([('C', 12.879803387472974), ('penalty', 'l1'), ('

In [53]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.94, 'precision': 0.032, 'sensitivity': 0.133, 'f1_score': 0.052, 'fbeta_2': 0.082, 'roc_auc': np.float64(0.619), 'NNS': 31.0, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 10), ('min_samples_split', 2)])}
{'fold': 2, 'accuracy': 0.878, 'precision': 0.021, 'sensitivity': 0.214, 'f1_score': 0.039, 'fbeta_2': 0.076, 'roc_auc': np.float64(0.551), 'NNS': 47.0, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 1), ('min_samples_split', 5)])}
{'fold': 3, 'accuracy': 0.955, 'precision': 0.044, 'sensitivity': 0.143, 'f1_score': 0.068, 'fbeta_2': 0.099, 'roc_auc': np.float64(0.74), 'NNS': 22.5, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 3), ('min_samples_leaf', 10), ('min_samples_split', 20)])}
{'fold': 4, 'accuracy': 0.903, 'precision': 0.019, 'sensitivity': 0.143, 'f1_score': 0.033, 'fbeta_2': 0.061, 'roc_auc': np.float64(0.

In [54]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.938, 'precision': 0.059, 'sensitivity': 0.267, 'f1_score': 0.096, 'fbeta_2': 0.156, 'roc_auc': np.float64(0.661), 'NNS': 17.0, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 20), ('min_samples_split', 2), ('n_estimators', 50)])}
{'fold': 2, 'accuracy': 0.912, 'precision': 0.04, 'sensitivity': 0.286, 'f1_score': 0.07, 'fbeta_2': 0.127, 'roc_auc': np.float64(0.651), 'NNS': 25.25, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 9), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 295)])}
{'fold': 3, 'accuracy': 0.938, 'precision': 0.0, 'sensitivity': 0.0, 'f1_score': 0.0, 'fbeta_2': 0.0, 'roc_auc': np.float64(0.555), 'NNS': inf, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 39), ('max_features', None), ('min_samples_leaf', 4), ('min_samples_split', 2), ('n_estimators', 83)])}
{'fold': 4, 'a

In [55]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.961, 'precision': 0.054, 'sensitivity': 0.133, 'f1_score': 0.077, 'fbeta_2': 0.103, 'roc_auc': np.float64(0.641), 'NNS': 18.5, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.05829204337672862), ('max_depth', 50), ('min_child_weight', 5), ('n_estimators', 500), ('subsample', 0.6978573848597611)])}
{'fold': 2, 'accuracy': 0.939, 'precision': 0.059, 'sensitivity': 0.286, 'f1_score': 0.098, 'fbeta_2': 0.161, 'roc_auc': np.float64(0.757), 'NNS': 17.0, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 23), ('min_child_weight', 3), ('n_estimators', 500), ('subsample', 0.5306763447990716)])}
{'fold': 3, 'accuracy': 0.978, 'precision': 0.118, 'sensitivity': 0.143, 'f1_score': 0.129, 'fbeta_2': 0.137, 'roc_auc': np.float64(0.826), 'NNS': 8.5, 'best_params': OrderedDict([('colsample_bytree', 0.5333785561042034), ('gamma', 1), ('learning_rate'

In [57]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.955, 'precision': 0.083, 'sensitivity': 0.267, 'f1_score': 0.127, 'fbeta_2': 0.185, 'roc_auc': np.float64(0.685), 'NNS': 12.0, 'best_params': {}}
{'fold': 2, 'accuracy': 0.933, 'precision': 0.076, 'sensitivity': 0.429, 'f1_score': 0.129, 'fbeta_2': 0.222, 'roc_auc': np.float64(0.715), 'NNS': 13.167, 'best_params': {}}
{'fold': 3, 'accuracy': 0.947, 'precision': 0.037, 'sensitivity': 0.143, 'f1_score': 0.059, 'fbeta_2': 0.091, 'roc_auc': np.float64(0.67), 'NNS': 27.0, 'best_params': {}}
{'fold': 4, 'accuracy': 0.94, 'precision': 0.032, 'sensitivity': 0.143, 'f1_score': 0.052, 'fbeta_2': 0.084, 'roc_auc': np.float64(0.699), 'NNS': 31.5, 'best_params': {}}
{'fold': 5, 'accuracy': 0.948, 'precision': 0.056, 'sensitivity': 0.2, 'f1_score': 0.087, 'fbeta_2': 0.132, 'roc_auc': np.float64(0.619), 'NNS': 18.0, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.945 ± 0.007
precision: 0.057 ± 0.020
sensitivity: 0.236

## y_cvdeath_6_months

In [58]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.652, 'precision': 0.062, 'sensitivity': 0.6, 'f1_score': 0.113, 'fbeta_2': 0.22, 'roc_auc': np.float64(0.676), 'NNS': 16.037, 'best_params': OrderedDict([('var_smoothing', 0.03237858380675109)])}
{'fold': 2, 'accuracy': 0.707, 'precision': 0.078, 'sensitivity': 0.659, 'f1_score': 0.14, 'fbeta_2': 0.265, 'roc_auc': np.float64(0.739), 'NNS': 12.793, 'best_params': OrderedDict([('var_smoothing', 0.019788179299376048)])}
{'fold': 3, 'accuracy': 0.733, 'precision': 0.068, 'sensitivity': 0.5, 'f1_score': 0.119, 'fbeta_2': 0.22, 'roc_auc': np.float64(0.734), 'NNS': 14.773, 'best_params': OrderedDict([('var_smoothing', 0.09706226232971742)])}
{'fold': 4, 'accuracy': 0.718, 'precision': 0.062, 'sensitivity': 0.467, 'f1_score': 0.109, 'fbeta_2': 0.202, 'roc_auc': np.float64(0.687), 'NNS': 16.238, 'best_params': OrderedDict([('var_smoothing', 0.01568097675832564)])}
{'fold': 5, 'accuracy': 0.777, 'precision': 0.078, 'sensitivity': 0.467, 'f1_

In [59]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.66, 'precision': 0.078, 'sensitivity': 0.756, 'f1_score': 0.141, 'fbeta_2': 0.275, 'roc_auc': np.float64(0.759), 'NNS': 12.882, 'best_params': OrderedDict([('C', 0.0004963064902269936), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.631, 'precision': 0.072, 'sensitivity': 0.773, 'f1_score': 0.132, 'fbeta_2': 0.262, 'roc_auc': np.float64(0.776), 'NNS': 13.912, 'best_params': OrderedDict([('C', 0.0003285526296960481), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.603, 'precision': 0.077, 'sensitivity': 0.909, 'f1_score': 0.142, 'fbeta_2': 0.288, 'roc_auc': np.float64(0.793), 'NNS': 12.975, 'best_params': OrderedDict([('C', 0.00019661894528858297), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.627, 'precision': 0.077, 'sensitivity': 0.822, 'f1_score': 0.14, 'fbeta_2': 0.279, 'roc_auc': np.float64(0.759), 'NNS': 13.054, 'best_params': OrderedDict([('C', 0.00

In [60]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.652, 'precision': 0.08, 'sensitivity': 0.8, 'f1_score': 0.145, 'fbeta_2': 0.285, 'roc_auc': np.float64(0.768), 'NNS': 12.528, 'best_params': OrderedDict([('C', 0.0005491559094000247), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.684, 'precision': 0.079, 'sensitivity': 0.727, 'f1_score': 0.143, 'fbeta_2': 0.275, 'roc_auc': np.float64(0.768), 'NNS': 12.656, 'best_params': OrderedDict([('C', 0.0009769708347079002), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.613, 'precision': 0.077, 'sensitivity': 0.886, 'f1_score': 0.142, 'fbeta_2': 0.286, 'roc_auc': np.float64(0.783), 'NNS': 12.949, 'best_params': OrderedDict([('C', 0.0003265810249839299), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.673, 'precision': 0.077, 'sensitivity': 0.711, 'f1_score': 0.139, 'fbeta_2': 0.268, 'roc_auc': np.float64(0.748), 'NNS': 13.031, 'best_params': OrderedDict([('C', 0.000777195

In [61]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.692, 'precision': 0.068, 'sensitivity': 0.578, 'f1_score': 0.122, 'fbeta_2': 0.231, 'roc_auc': np.float64(0.666), 'NNS': 14.692, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 19), ('min_samples_leaf', 5), ('min_samples_split', 17)])}
{'fold': 2, 'accuracy': 0.709, 'precision': 0.065, 'sensitivity': 0.523, 'f1_score': 0.115, 'fbeta_2': 0.216, 'roc_auc': np.float64(0.655), 'NNS': 15.478, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 18), ('min_samples_leaf', 3), ('min_samples_split', 2)])}
{'fold': 3, 'accuracy': 0.718, 'precision': 0.076, 'sensitivity': 0.614, 'f1_score': 0.136, 'fbeta_2': 0.255, 'roc_auc': np.float64(0.668), 'NNS': 13.074, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 4, 'accuracy': 0.782, 'precision': 0.07, 'sensitivity': 0.4, 'f1_score': 0.12, 'fbeta_2': 0.206, 'roc_auc': np.float64

In [62]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.829, 'precision': 0.113, 'sensitivity': 0.533, 'f1_score': 0.187, 'fbeta_2': 0.306, 'roc_auc': np.float64(0.73), 'NNS': 8.833, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 28), ('max_features', None), ('min_samples_leaf', 20), ('min_samples_split', 12), ('n_estimators', 257)])}
{'fold': 2, 'accuracy': 0.666, 'precision': 0.056, 'sensitivity': 0.523, 'f1_score': 0.102, 'fbeta_2': 0.197, 'roc_auc': np.float64(0.644), 'NNS': 17.783, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 50)])}
{'fold': 3, 'accuracy': 0.778, 'precision': 0.075, 'sensitivity': 0.455, 'f1_score': 0.129, 'fbeta_2': 0.226, 'roc_auc': np.float64(0.758), 'NNS': 13.3, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 14), ('max_features', None), ('min_samples_leaf', 20), ('min_samples_split', 12), ('n_estimators', 112)])

In [63]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.812, 'precision': 0.089, 'sensitivity': 0.444, 'f1_score': 0.149, 'fbeta_2': 0.248, 'roc_auc': np.float64(0.73), 'NNS': 11.2, 'best_params': OrderedDict([('colsample_bytree', 0.6348408450025294), ('gamma', 4), ('learning_rate', 0.09705940378212671), ('max_depth', 20), ('min_child_weight', 7), ('n_estimators', 306), ('subsample', 0.5169040002531909)])}
{'fold': 2, 'accuracy': 0.807, 'precision': 0.11, 'sensitivity': 0.614, 'f1_score': 0.187, 'fbeta_2': 0.321, 'roc_auc': np.float64(0.782), 'NNS': 9.074, 'best_params': OrderedDict([('colsample_bytree', 0.705051979426657), ('gamma', 4), ('learning_rate', 0.09335393188593556), ('max_depth', 16), ('min_child_weight', 14), ('n_estimators', 213), ('subsample', 0.675465667449572)])}
{'fold': 3, 'accuracy': 0.808, 'precision': 0.091, 'sensitivity': 0.477, 'f1_score': 0.152, 'fbeta_2': 0.257, 'roc_auc': np.float64(0.783), 'NNS': 11.048, 'best_params': OrderedDict([('colsample_bytree', 0.66

In [64]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.76, 'precision': 0.072, 'sensitivity': 0.467, 'f1_score': 0.125, 'fbeta_2': 0.223, 'roc_auc': np.float64(0.723), 'NNS': 13.81, 'best_params': {}}
{'fold': 2, 'accuracy': 0.761, 'precision': 0.078, 'sensitivity': 0.523, 'f1_score': 0.136, 'fbeta_2': 0.245, 'roc_auc': np.float64(0.691), 'NNS': 12.739, 'best_params': {}}
{'fold': 3, 'accuracy': 0.745, 'precision': 0.071, 'sensitivity': 0.5, 'f1_score': 0.124, 'fbeta_2': 0.226, 'roc_auc': np.float64(0.673), 'NNS': 14.091, 'best_params': {}}
{'fold': 4, 'accuracy': 0.76, 'precision': 0.081, 'sensitivity': 0.533, 'f1_score': 0.141, 'fbeta_2': 0.253, 'roc_auc': np.float64(0.713), 'NNS': 12.292, 'best_params': {}}
{'fold': 5, 'accuracy': 0.755, 'precision': 0.093, 'sensitivity': 0.644, 'f1_score': 0.163, 'fbeta_2': 0.295, 'roc_auc': np.float64(0.76), 'NNS': 10.724, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.756 ± 0.006
precision: 0.079 ± 0.008
sensitivity:

## y_death_6_months

In [65]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.659, 'precision': 0.084, 'sensitivity': 0.6, 'f1_score': 0.148, 'fbeta_2': 0.269, 'roc_auc': np.float64(0.689), 'NNS': 11.889, 'best_params': OrderedDict([('var_smoothing', 0.006742101252182173)])}
{'fold': 2, 'accuracy': 0.638, 'precision': 0.085, 'sensitivity': 0.661, 'f1_score': 0.15, 'fbeta_2': 0.28, 'roc_auc': np.float64(0.692), 'NNS': 11.795, 'best_params': OrderedDict([('var_smoothing', 0.04203632037755307)])}
{'fold': 3, 'accuracy': 0.671, 'precision': 0.089, 'sensitivity': 0.627, 'f1_score': 0.156, 'fbeta_2': 0.284, 'roc_auc': np.float64(0.696), 'NNS': 11.243, 'best_params': OrderedDict([('var_smoothing', 0.020502935232420492)])}
{'fold': 4, 'accuracy': 0.67, 'precision': 0.095, 'sensitivity': 0.678, 'f1_score': 0.166, 'fbeta_2': 0.303, 'roc_auc': np.float64(0.694), 'NNS': 10.575, 'best_params': OrderedDict([('var_smoothing', 0.07270515208687399)])}
{'fold': 5, 'accuracy': 0.675, 'precision': 0.106, 'sensitivity': 0.75, 'f

In [66]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.642, 'precision': 0.095, 'sensitivity': 0.733, 'f1_score': 0.168, 'fbeta_2': 0.312, 'roc_auc': np.float64(0.747), 'NNS': 10.545, 'best_params': OrderedDict([('C', 0.0005879792726575813), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.571, 'precision': 0.09, 'sensitivity': 0.864, 'f1_score': 0.163, 'fbeta_2': 0.318, 'roc_auc': np.float64(0.797), 'NNS': 11.098, 'best_params': OrderedDict([('C', 0.00018023154050241845), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.598, 'precision': 0.094, 'sensitivity': 0.847, 'f1_score': 0.169, 'fbeta_2': 0.326, 'roc_auc': np.float64(0.778), 'NNS': 10.62, 'best_params': OrderedDict([('C', 0.0001929442173127865), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.66, 'precision': 0.099, 'sensitivity': 0.746, 'f1_score': 0.175, 'fbeta_2': 0.324, 'roc_auc': np.float64(0.756), 'NNS': 10.068, 'best_params': OrderedDict([('C', 0.00055613

In [None]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.588, 'precision': 0.083, 'sensitivity': 0.733, 'f1_score': 0.149, 'fbeta_2': 0.286, 'roc_auc': np.float64(0.635), 'NNS': 12.045, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 2, 'accuracy': 0.659, 'precision': 0.084, 'sensitivity': 0.61, 'f1_score': 0.148, 'fbeta_2': 0.271, 'roc_auc': np.float64(0.679), 'NNS': 11.889, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 5), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 3, 'accuracy': 0.617, 'precision': 0.095, 'sensitivity': 0.814, 'f1_score': 0.171, 'fbeta_2': 0.324, 'roc_auc': np.float64(0.735), 'NNS': 10.5, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 5), ('min_samples_split', 20)])}
{'fold': 4, 'accuracy': 0.628, 'precision': 0.084, 'sensitivity': 0.678, 'f1_score': 0.15, 'fbeta_2': 0.282, 'roc_auc': np.float64(0.

In [67]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.679, 'precision': 0.099, 'sensitivity': 0.683, 'f1_score': 0.173, 'fbeta_2': 0.314, 'roc_auc': np.float64(0.741), 'NNS': 10.073, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 47), ('max_features', 'sqrt'), ('min_samples_leaf', 5), ('min_samples_split', 16), ('n_estimators', 143)])}
{'fold': 2, 'accuracy': 0.676, 'precision': 0.104, 'sensitivity': 0.746, 'f1_score': 0.182, 'fbeta_2': 0.333, 'roc_auc': np.float64(0.768), 'NNS': 9.636, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 12), ('min_samples_split', 11), ('n_estimators', 254)])}
{'fold': 3, 'accuracy': 0.691, 'precision': 0.114, 'sensitivity': 0.797, 'f1_score': 0.2, 'fbeta_2': 0.363, 'roc_auc': np.float64(0.784), 'NNS': 8.745, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 34), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 3

In [68]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.723, 'precision': 0.118, 'sensitivity': 0.717, 'f1_score': 0.203, 'fbeta_2': 0.356, 'roc_auc': np.float64(0.763), 'NNS': 8.465, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 5), ('learning_rate', 0.0931181691675355), ('max_depth', 15), ('min_child_weight', 12), ('n_estimators', 136), ('subsample', 0.6785957846690904)])}
{'fold': 2, 'accuracy': 0.697, 'precision': 0.107, 'sensitivity': 0.712, 'f1_score': 0.185, 'fbeta_2': 0.333, 'roc_auc': np.float64(0.777), 'NNS': 9.381, 'best_params': OrderedDict([('colsample_bytree', 0.7717015338451563), ('gamma', 5), ('learning_rate', 0.05015020365607496), ('max_depth', 42), ('min_child_weight', 7), ('n_estimators', 35), ('subsample', 0.785388901339449)])}
{'fold': 3, 'accuracy': 0.608, 'precision': 0.084, 'sensitivity': 0.712, 'f1_score': 0.15, 'fbeta_2': 0.285, 'roc_auc': np.float64(0.741), 'NNS': 11.952, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 5)

In [69]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.669, 'precision': 0.089, 'sensitivity': 0.617, 'f1_score': 0.155, 'fbeta_2': 0.282, 'roc_auc': np.float64(0.727), 'NNS': 11.27, 'best_params': {}}
{'fold': 2, 'accuracy': 0.652, 'precision': 0.094, 'sensitivity': 0.712, 'f1_score': 0.165, 'fbeta_2': 0.307, 'roc_auc': np.float64(0.733), 'NNS': 10.69, 'best_params': {}}
{'fold': 3, 'accuracy': 0.668, 'precision': 0.092, 'sensitivity': 0.661, 'f1_score': 0.162, 'fbeta_2': 0.296, 'roc_auc': np.float64(0.696), 'NNS': 10.846, 'best_params': {}}
{'fold': 4, 'accuracy': 0.69, 'precision': 0.098, 'sensitivity': 0.661, 'f1_score': 0.171, 'fbeta_2': 0.309, 'roc_auc': np.float64(0.72), 'NNS': 10.154, 'best_params': {}}
{'fold': 5, 'accuracy': 0.676, 'precision': 0.106, 'sensitivity': 0.75, 'f1_score': 0.186, 'fbeta_2': 0.338, 'roc_auc': np.float64(0.759), 'NNS': 9.444, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.671 ± 0.012
precision: 0.096 ± 0.006
sensitivity:

## y_hf_6_months

In [70]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.56, 'precision': 0.185, 'sensitivity': 0.748, 'f1_score': 0.297, 'fbeta_2': 0.465, 'roc_auc': np.float64(0.686), 'NNS': 5.407, 'best_params': OrderedDict([('var_smoothing', 9.493661540765977e-07)])}
{'fold': 2, 'accuracy': 0.75, 'precision': 0.257, 'sensitivity': 0.547, 'f1_score': 0.35, 'fbeta_2': 0.446, 'roc_auc': np.float64(0.734), 'NNS': 3.89, 'best_params': OrderedDict([('var_smoothing', 1.175426508124534e-06)])}
{'fold': 3, 'accuracy': 0.765, 'precision': 0.258, 'sensitivity': 0.477, 'f1_score': 0.335, 'fbeta_2': 0.408, 'roc_auc': np.float64(0.734), 'NNS': 3.875, 'best_params': OrderedDict([('var_smoothing', 1.3651003883279374e-07)])}
{'fold': 4, 'accuracy': 0.742, 'precision': 0.238, 'sensitivity': 0.49, 'f1_score': 0.32, 'fbeta_2': 0.404, 'roc_auc': np.float64(0.703), 'NNS': 4.203, 'best_params': OrderedDict([('var_smoothing', 0.00012559332962752413)])}
{'fold': 5, 'accuracy': 0.741, 'precision': 0.252, 'sensitivity': 0.556

In [71]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.654, 'precision': 0.219, 'sensitivity': 0.702, 'f1_score': 0.334, 'fbeta_2': 0.488, 'roc_auc': np.float64(0.748), 'NNS': 4.557, 'best_params': OrderedDict([('C', 0.0007484720096698342), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.662, 'precision': 0.235, 'sensitivity': 0.773, 'f1_score': 0.36, 'fbeta_2': 0.53, 'roc_auc': np.float64(0.773), 'NNS': 4.259, 'best_params': OrderedDict([('C', 0.0005601886602362931), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.716, 'precision': 0.273, 'sensitivity': 0.775, 'f1_score': 0.403, 'fbeta_2': 0.566, 'roc_auc': np.float64(0.795), 'NNS': 3.667, 'best_params': OrderedDict([('C', 0.0011612799535074545), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.702, 'precision': 0.25, 'sensitivity': 0.702, 'f1_score': 0.369, 'fbeta_2': 0.516, 'roc_auc': np.float64(0.764), 'NNS': 4.0, 'best_params': OrderedDict([('C', 0.001271953458609

In [72]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.605, 'precision': 0.208, 'sensitivity': 0.781, 'f1_score': 0.329, 'fbeta_2': 0.504, 'roc_auc': np.float64(0.736), 'NNS': 4.805, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 3), ('min_samples_leaf', 10), ('min_samples_split', 3)])}
{'fold': 2, 'accuracy': 0.658, 'precision': 0.213, 'sensitivity': 0.66, 'f1_score': 0.322, 'fbeta_2': 0.465, 'roc_auc': np.float64(0.69), 'NNS': 4.687, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 6), ('min_samples_leaf', 4), ('min_samples_split', 20)])}
{'fold': 3, 'accuracy': 0.645, 'precision': 0.214, 'sensitivity': 0.695, 'f1_score': 0.327, 'fbeta_2': 0.479, 'roc_auc': np.float64(0.717), 'NNS': 4.676, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 5), ('min_samples_leaf', 10), ('min_samples_split', 3)])}
{'fold': 4, 'accuracy': 0.733, 'precision': 0.241, 'sensitivity': 0.536, 'f1_score': 0.333, 'fbeta_2': 0.431, 'roc_auc': np.float64(0.69

In [73]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.678, 'precision': 0.243, 'sensitivity': 0.755, 'f1_score': 0.367, 'fbeta_2': 0.531, 'roc_auc': np.float64(0.755), 'NNS': 4.123, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 2, 'accuracy': 0.564, 'precision': 0.194, 'sensitivity': 0.807, 'f1_score': 0.313, 'fbeta_2': 0.495, 'roc_auc': np.float64(0.668), 'NNS': 5.149, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 17), ('min_samples_split', 16), ('n_estimators', 298)])}
{'fold': 3, 'accuracy': 0.72, 'precision': 0.276, 'sensitivity': 0.775, 'f1_score': 0.407, 'fbeta_2': 0.569, 'roc_auc': np.float64(0.79), 'NNS': 3.624, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'log2'), ('min_samples_leaf', 5), ('min_samples_split', 20), ('n_estimators', 50)]

In [74]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.673, 'precision': 0.243, 'sensitivity': 0.775, 'f1_score': 0.37, 'fbeta_2': 0.539, 'roc_auc': np.float64(0.749), 'NNS': 4.12, 'best_params': OrderedDict([('colsample_bytree', 0.5604633606673581), ('gamma', 5), ('learning_rate', 0.1), ('max_depth', 36), ('min_child_weight', 14), ('n_estimators', 10), ('subsample', 1.0)])}
{'fold': 2, 'accuracy': 0.647, 'precision': 0.224, 'sensitivity': 0.76, 'f1_score': 0.347, 'fbeta_2': 0.514, 'roc_auc': np.float64(0.751), 'NNS': 4.456, 'best_params': OrderedDict([('colsample_bytree', 0.8260367457426929), ('gamma', 5), ('learning_rate', 0.003384429637393834), ('max_depth', 33), ('min_child_weight', 1), ('n_estimators', 10), ('subsample', 0.5370531719943081)])}
{'fold': 3, 'accuracy': 0.725, 'precision': 0.278, 'sensitivity': 0.762, 'f1_score': 0.407, 'fbeta_2': 0.565, 'roc_auc': np.float64(0.795), 'NNS': 3.6, 'best_params': OrderedDict([('colsample_bytree', 0.6091886815818075), ('gamma', 0), ('

In [75]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.639, 'precision': 0.202, 'sensitivity': 0.649, 'f1_score': 0.308, 'fbeta_2': 0.45, 'roc_auc': np.float64(0.689), 'NNS': 4.949, 'best_params': {}}
{'fold': 2, 'accuracy': 0.673, 'precision': 0.221, 'sensitivity': 0.653, 'f1_score': 0.33, 'fbeta_2': 0.469, 'roc_auc': np.float64(0.723), 'NNS': 4.531, 'best_params': {}}
{'fold': 3, 'accuracy': 0.683, 'precision': 0.231, 'sensitivity': 0.669, 'f1_score': 0.344, 'fbeta_2': 0.485, 'roc_auc': np.float64(0.732), 'NNS': 4.327, 'best_params': {}}
{'fold': 4, 'accuracy': 0.68, 'precision': 0.223, 'sensitivity': 0.636, 'f1_score': 0.33, 'fbeta_2': 0.464, 'roc_auc': np.float64(0.708), 'NNS': 4.49, 'best_params': {}}
{'fold': 5, 'accuracy': 0.656, 'precision': 0.217, 'sensitivity': 0.682, 'f1_score': 0.33, 'fbeta_2': 0.478, 'roc_auc': np.float64(0.709), 'NNS': 4.602, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.666 ± 0.016
precision: 0.219 ± 0.010
sensitivity: 0.65

## y_inp_6_months

In [76]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.652, 'precision': 0.305, 'sensitivity': 0.487, 'f1_score': 0.375, 'fbeta_2': 0.435, 'roc_auc': np.float64(0.646), 'NNS': 3.283, 'best_params': OrderedDict([('var_smoothing', 2.2572886601623825e-08)])}
{'fold': 2, 'accuracy': 0.644, 'precision': 0.313, 'sensitivity': 0.558, 'f1_score': 0.401, 'fbeta_2': 0.482, 'roc_auc': np.float64(0.667), 'NNS': 3.193, 'best_params': OrderedDict([('var_smoothing', 3.244468390655083e-08)])}
{'fold': 3, 'accuracy': 0.655, 'precision': 0.311, 'sensitivity': 0.508, 'f1_score': 0.386, 'fbeta_2': 0.451, 'roc_auc': np.float64(0.632), 'NNS': 3.212, 'best_params': OrderedDict([('var_smoothing', 7.819115625274379e-08)])}
{'fold': 4, 'accuracy': 0.359, 'precision': 0.24, 'sensitivity': 0.923, 'f1_score': 0.381, 'fbeta_2': 0.588, 'roc_auc': np.float64(0.637), 'NNS': 4.171, 'best_params': OrderedDict([('var_smoothing', 4.729216546763614e-08)])}
{'fold': 5, 'accuracy': 0.676, 'precision': 0.333, 'sensitivity': 0

In [77]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.608, 'precision': 0.299, 'sensitivity': 0.621, 'f1_score': 0.404, 'fbeta_2': 0.511, 'roc_auc': np.float64(0.667), 'NNS': 3.34, 'best_params': OrderedDict([('C', 0.0002578202087114616), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.589, 'precision': 0.306, 'sensitivity': 0.727, 'f1_score': 0.431, 'fbeta_2': 0.57, 'roc_auc': np.float64(0.687), 'NNS': 3.27, 'best_params': OrderedDict([('C', 0.006222006072009479), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.61, 'precision': 0.295, 'sensitivity': 0.596, 'f1_score': 0.395, 'fbeta_2': 0.495, 'roc_auc': np.float64(0.651), 'NNS': 3.387, 'best_params': OrderedDict([('C', 0.00021448553949850792), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.573, 'precision': 0.284, 'sensitivity': 0.658, 'f1_score': 0.397, 'fbeta_2': 0.521, 'roc_auc': np.float64(0.643), 'NNS': 3.52, 'best_params': OrderedDict([('C', 0.0032780780

In [78]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.578, 'precision': 0.258, 'sensitivity': 0.517, 'f1_score': 0.344, 'fbeta_2': 0.431, 'roc_auc': np.float64(0.569), 'NNS': 3.874, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 3), ('min_samples_split', 2)])}
{'fold': 2, 'accuracy': 0.571, 'precision': 0.259, 'sensitivity': 0.542, 'f1_score': 0.35, 'fbeta_2': 0.445, 'roc_auc': np.float64(0.593), 'NNS': 3.865, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 13), ('min_samples_leaf', 7), ('min_samples_split', 20)])}
{'fold': 3, 'accuracy': 0.643, 'precision': 0.3, 'sensitivity': 0.504, 'f1_score': 0.376, 'fbeta_2': 0.443, 'roc_auc': np.float64(0.619), 'NNS': 3.336, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 4), ('min_samples_leaf', 1), ('min_samples_split', 20)])}
{'fold': 4, 'accuracy': 0.636, 'precision': 0.304, 'sensitivity': 0.546, 'f1_score': 0.391, 'fbeta_2': 0.471, 'roc_auc': np.float64(0.61

In [79]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.609, 'precision': 0.302, 'sensitivity': 0.632, 'f1_score': 0.409, 'fbeta_2': 0.519, 'roc_auc': np.float64(0.67), 'NNS': 3.309, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 24), ('max_features', 'sqrt'), ('min_samples_leaf', 3), ('min_samples_split', 17), ('n_estimators', 261)])}
{'fold': 2, 'accuracy': 0.592, 'precision': 0.302, 'sensitivity': 0.692, 'f1_score': 0.42, 'fbeta_2': 0.55, 'roc_auc': np.float64(0.674), 'NNS': 3.317, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 50)])}
{'fold': 3, 'accuracy': 0.566, 'precision': 0.275, 'sensitivity': 0.631, 'f1_score': 0.383, 'fbeta_2': 0.501, 'roc_auc': np.float64(0.629), 'NNS': 3.64, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 18), ('n_estimators', 90)])}
{

In [80]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.621, 'precision': 0.309, 'sensitivity': 0.621, 'f1_score': 0.412, 'fbeta_2': 0.516, 'roc_auc': np.float64(0.67), 'NNS': 3.241, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 4), ('learning_rate', 0.006830583589036771), ('max_depth', 20), ('min_child_weight', 20), ('n_estimators', 500), ('subsample', 0.5185072972001584)])}
{'fold': 2, 'accuracy': 0.611, 'precision': 0.312, 'sensitivity': 0.685, 'f1_score': 0.429, 'fbeta_2': 0.553, 'roc_auc': np.float64(0.688), 'NNS': 3.202, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 0), ('learning_rate', 0.001), ('max_depth', 43), ('min_child_weight', 12), ('n_estimators', 427), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.626, 'precision': 0.308, 'sensitivity': 0.6, 'f1_score': 0.407, 'fbeta_2': 0.504, 'roc_auc': np.float64(0.655), 'NNS': 3.25, 'best_params': OrderedDict([('colsample_bytree', 0.6262565293747252), ('gamma', 5), ('learning_rate', 0.099308

In [81]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.593, 'precision': 0.286, 'sensitivity': 0.602, 'f1_score': 0.388, 'fbeta_2': 0.493, 'roc_auc': np.float64(0.621), 'NNS': 3.497, 'best_params': {}}
{'fold': 2, 'accuracy': 0.581, 'precision': 0.278, 'sensitivity': 0.604, 'f1_score': 0.381, 'fbeta_2': 0.489, 'roc_auc': np.float64(0.617), 'NNS': 3.592, 'best_params': {}}
{'fold': 3, 'accuracy': 0.576, 'precision': 0.27, 'sensitivity': 0.577, 'f1_score': 0.368, 'fbeta_2': 0.47, 'roc_auc': np.float64(0.6), 'NNS': 3.707, 'best_params': {}}
{'fold': 4, 'accuracy': 0.582, 'precision': 0.264, 'sensitivity': 0.535, 'f1_score': 0.353, 'fbeta_2': 0.444, 'roc_auc': np.float64(0.608), 'NNS': 3.791, 'best_params': {}}
{'fold': 5, 'accuracy': 0.58, 'precision': 0.281, 'sensitivity': 0.617, 'f1_score': 0.386, 'fbeta_2': 0.498, 'roc_auc': np.float64(0.634), 'NNS': 3.559, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.582 ± 0.006
precision: 0.276 ± 0.008
sensitivity: 0.5

## y_stk_or_aemb

In [82]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.13, 'precision': 0.024, 'sensitivity': 0.929, 'f1_score': 0.047, 'fbeta_2': 0.109, 'roc_auc': np.float64(0.608), 'NNS': 41.731, 'best_params': OrderedDict([('var_smoothing', 0.001522083767389707)])}
{'fold': 2, 'accuracy': 0.836, 'precision': 0.027, 'sensitivity': 0.185, 'f1_score': 0.048, 'fbeta_2': 0.086, 'roc_auc': np.float64(0.596), 'NNS': 36.6, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.104, 'precision': 0.023, 'sensitivity': 0.963, 'f1_score': 0.045, 'fbeta_2': 0.106, 'roc_auc': np.float64(0.632), 'NNS': 42.923, 'best_params': OrderedDict([('var_smoothing', 0.00011866105296908167)])}
{'fold': 4, 'accuracy': 0.418, 'precision': 0.024, 'sensitivity': 0.63, 'f1_score': 0.046, 'fbeta_2': 0.103, 'roc_auc': np.float64(0.519), 'NNS': 42.118, 'best_params': OrderedDict([('var_smoothing', 0.0016675286387663815)])}
{'fold': 5, 'accuracy': 0.357, 'precision': 0.022, 'sensitivity': 0.63, 'f1_score': 0

In [83]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.523, 'precision': 0.031, 'sensitivity': 0.643, 'f1_score': 0.058, 'fbeta_2': 0.128, 'roc_auc': np.float64(0.616), 'NNS': 32.778, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.497, 'precision': 0.029, 'sensitivity': 0.667, 'f1_score': 0.055, 'fbeta_2': 0.123, 'roc_auc': np.float64(0.608), 'NNS': 34.556, 'best_params': OrderedDict([('C', 0.0001), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.525, 'precision': 0.031, 'sensitivity': 0.667, 'f1_score': 0.059, 'fbeta_2': 0.129, 'roc_auc': np.float64(0.638), 'NNS': 32.611, 'best_params': OrderedDict([('C', 0.0001), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.516, 'precision': 0.027, 'sensitivity': 0.593, 'f1_score': 0.052, 'fbeta_2': 0.114, 'roc_auc': np.float64(0.584), 'NNS': 37.125, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 

In [84]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.727, 'precision': 0.022, 'sensitivity': 0.25, 'f1_score': 0.04, 'fbeta_2': 0.081, 'roc_auc': np.float64(0.494), 'NNS': 45.571, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 17), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 2, 'accuracy': 0.752, 'precision': 0.018, 'sensitivity': 0.185, 'f1_score': 0.032, 'fbeta_2': 0.064, 'roc_auc': np.float64(0.506), 'NNS': 57.0, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 19), ('min_samples_leaf', 5), ('min_samples_split', 3)])}
{'fold': 3, 'accuracy': 0.824, 'precision': 0.03, 'sensitivity': 0.222, 'f1_score': 0.053, 'fbeta_2': 0.098, 'roc_auc': np.float64(0.555), 'NNS': 33.167, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 10), ('min_samples_leaf', 7), ('min_samples_split', 20)])}
{'fold': 4, 'accuracy': 0.762, 'precision': 0.007, 'sensitivity': 0.074, 'f1_score': 0.014, 'fbeta_2': 0.027, 'roc_auc': np.float64(0.4

In [85]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.757, 'precision': 0.025, 'sensitivity': 0.25, 'f1_score': 0.045, 'fbeta_2': 0.089, 'roc_auc': np.float64(0.491), 'NNS': 40.286, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 46), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 50)])}
{'fold': 2, 'accuracy': 0.792, 'precision': 0.009, 'sensitivity': 0.074, 'f1_score': 0.016, 'fbeta_2': 0.03, 'roc_auc': np.float64(0.445), 'NNS': 115.0, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 15), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 50)])}
{'fold': 3, 'accuracy': 0.749, 'precision': 0.014, 'sensitivity': 0.148, 'f1_score': 0.025, 'fbeta_2': 0.051, 'roc_auc': np.float64(0.493), 'NNS': 71.75, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 37), ('max_features', None), ('min_samples_leaf', 2), ('min_samples_split', 2), ('n_estimators', 50)])}
{'f

In [86]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.897, 'precision': 0.055, 'sensitivity': 0.214, 'f1_score': 0.088, 'fbeta_2': 0.136, 'roc_auc': np.float64(0.623), 'NNS': 18.167, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 35), ('min_child_weight', 7), ('n_estimators', 500), ('subsample', 1.0)])}
{'fold': 2, 'accuracy': 0.892, 'precision': 0.018, 'sensitivity': 0.074, 'f1_score': 0.029, 'fbeta_2': 0.046, 'roc_auc': np.float64(0.542), 'NNS': 54.5, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 1), ('min_child_weight', 2), ('n_estimators', 500), ('subsample', 0.5481557373188092)])}
{'fold': 3, 'accuracy': 0.905, 'precision': 0.022, 'sensitivity': 0.074, 'f1_score': 0.033, 'fbeta_2': 0.05, 'roc_auc': np.float64(0.584), 'NNS': 46.5, 'best_params': OrderedDict([('colsample_bytree', 0.9054836341129655), ('gamma', 0), ('learning_rate', 0.09635062259320298), ('max_

In [87]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.85, 'precision': 0.052, 'sensitivity': 0.321, 'f1_score': 0.09, 'fbeta_2': 0.158, 'roc_auc': np.float64(0.622), 'NNS': 19.222, 'best_params': {}}
{'fold': 2, 'accuracy': 0.831, 'precision': 0.016, 'sensitivity': 0.111, 'f1_score': 0.028, 'fbeta_2': 0.051, 'roc_auc': np.float64(0.574), 'NNS': 61.667, 'best_params': {}}
{'fold': 3, 'accuracy': 0.866, 'precision': 0.047, 'sensitivity': 0.259, 'f1_score': 0.079, 'fbeta_2': 0.136, 'roc_auc': np.float64(0.624), 'NNS': 21.429, 'best_params': {}}
{'fold': 4, 'accuracy': 0.861, 'precision': 0.033, 'sensitivity': 0.185, 'f1_score': 0.056, 'fbeta_2': 0.096, 'roc_auc': np.float64(0.529), 'NNS': 30.4, 'best_params': {}}
{'fold': 5, 'accuracy': 0.885, 'precision': 0.033, 'sensitivity': 0.148, 'f1_score': 0.054, 'fbeta_2': 0.087, 'roc_auc': np.float64(0.524), 'NNS': 30.25, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.859 ± 0.018
precision: 0.036 ± 0.012
sensitivity