# Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, fbeta_score

from skopt.space import Integer, Categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

from skopt import BayesSearchCV

from typing import Dict, List, Literal
from skopt.space import Real, Integer, Categorical

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")

## Read dataset

In [3]:
data = pd.read_csv("../data/processed/20. FINAL_mean_delta_multi_output.csv")
print(data.shape)
data.head()

(6091, 210)


Unnamed: 0,Hba1c,Hba1c Time,Hba1c FM,Hba1c FM Time,BMI,BMI Time,Cancer,Cancer Time,Carotid Disease,Carotid Disease Time,...,y_stk_or_aemb_3_months,y_stk_or_aemb_6_months,y_stk_or_aemb_12_months,y_stk_or_aemb_24_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,-501,6.26,-501,20.7,-19,0,10000,0,10000,...,0,0,0,0,0,0,1,0,0,0
1,6.26,-501,6.26,-501,26.7,-780,1,-4846,0,10000,...,0,0,0,0,0,0,1,0,0,0
2,5.8,-287,6.3,-2701,31.1,-35,0,10000,0,10000,...,0,0,0,0,0,0,1,1,1,1
3,6.26,-501,6.26,-501,21.3,-207,0,10000,0,10000,...,0,0,0,0,0,1,1,0,0,1
4,5.9,-162,5.4,-5209,37.8,-554,1,-86,0,10000,...,0,0,0,0,0,1,1,1,1,0


## Drop columns

In [4]:
targets = ["y_acs_6_months", "y_cvdeath_6_months", "y_death_6_months", "y_hf_6_months", "y_inp_6_months", "y_stk_or_aemb"]
cols_to_drop = [col for col in data.columns if col.startswith('y_') and col not in targets]
data = data.drop(columns=cols_to_drop)

cols_to_drop = [col for col in data.columns if col.endswith('_t')]
data = data.drop(columns=cols_to_drop)

cols_to_drop = [col for col in data.columns if "FM" in col]
data = data.drop(columns=cols_to_drop)

print(data.shape)
data.head()

(6091, 141)


Unnamed: 0,Hba1c,Hba1c Time,BMI,BMI Time,Cancer,Cancer Time,Carotid Disease,Carotid Disease Time,Coronary Disease,Coronary Disease Time,...,y_cvdeath_6_months,y_death_6_months,y_hf_6_months,y_inp_6_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,-501,20.7,-19,0,10000,0,10000,0,10000,...,0,0,0,0,0,0,1,0,0,0
1,6.26,-501,26.7,-780,1,-4846,0,10000,0,10000,...,0,0,1,1,0,0,1,0,0,0
2,5.8,-287,31.1,-35,0,10000,0,10000,0,10000,...,0,0,0,0,0,0,1,1,1,1
3,6.26,-501,21.3,-207,0,10000,0,10000,0,10000,...,0,0,0,0,0,1,1,0,0,1
4,5.9,-162,37.8,-554,1,-86,0,10000,1,0,...,0,0,1,1,0,1,1,1,1,0


## Models settings

In [5]:
# Define 5-fold stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create F2 scorer
f2_scorer = make_scorer(fbeta_score, beta=2)

# ===================== Naive Bayes =====================
search_space_nb = {
    "var_smoothing": (1e-12, 1e-1, "log-uniform")
}

nb_opt = BayesSearchCV(
    estimator=GaussianNB(),
    search_spaces=search_space_nb,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Logistic Regression =====================
search_space_lr = {
    'C': (1e-4, 1e+3, 'log-uniform'),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr_opt = BayesSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    search_spaces=search_space_lr,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Decision Tree =====================
search_space_dt = {
    "max_depth": Integer(3, 20),
    "min_samples_split": Integer(2, 20),
    "min_samples_leaf": Integer(1, 10),
    "criterion": Categorical(["gini", "entropy"])
}

dt_opt = BayesSearchCV(
    DecisionTreeClassifier(random_state=42),
    search_spaces=search_space_dt,
    n_iter=30,
    scoring=f2_scorer,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# ===================== Random Forest =====================
search_space_rf = {
    'n_estimators': (50, 300),
    'max_depth': (1, 50),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf_opt = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    search_spaces=search_space_rf,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== XGBoost =====================
search_space_xgb = {
    'n_estimators': (10, 500),
    'max_depth': (1, 50),
    'learning_rate': (0.001, 0.1, 'uniform'),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0, 5),
    'min_child_weight': (1, 20)
}

xgb_opt = BayesSearchCV(
    XGBClassifier(random_state=42),
    search_spaces=search_space_xgb,
    scoring=f2_scorer,
    n_iter=30,
    n_jobs=-1,
    random_state=42
)

# ===================== MLP ===============================

mlp_opt = MLPClassifier(max_iter=2000, solver="adam", activation="relu", hidden_layer_sizes=(200,100), random_state=42)

# ===================== Optimizer Map =====================
optimizer_map = {
    "nb": nb_opt,
    "lr": lr_opt,
    "dt": dt_opt,
    "rf": rf_opt,
    "xgb": xgb_opt,
    "mlp": mlp_opt
}


## Define function

In [6]:
SamplingName = Literal["baseline", "undersample", "oversample", "smote", "all"]
ModelName = Literal["nb", "lr", "xgb", "dt", "rf", "mlp"]


def _make_resampler(name: SamplingName, random_state: int, y_train: pd.Series = None):
    if name == "baseline":
        return None
    if name == "undersample":
        if y_train is None:
            raise ValueError("y_train must be provided for undersampling strategy")
        n_minority = y_train.sum()
        n_required = max(int(0.1 * len(y_train)), n_minority * 2)
        n_majority = n_required - n_minority
        n_majority = min(n_majority, (y_train == 0).sum())
        sampling_strategy = {0: n_majority, 1: n_minority}
        return RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=random_state)
    if name == "oversample":
        return RandomOverSampler(random_state=random_state)
    if name == "smote":
        return SMOTE(random_state=random_state)
    raise ValueError(f"Unknown sampling technique: {name}")


def run_cv_with_sampling(
    X_full: pd.DataFrame,
    target_cols: List[str],
    target_name: str,
    optimizer_map: Dict[ModelName, object],
    model_name: ModelName = "nb",
    sampling: SamplingName = "all",
    n_splits: int = 5,
    random_state: int = 42,
    xgb_04 = False,
) -> Dict[str, dict]:
    if target_name not in target_cols:
        raise ValueError("target_name must be inside target_cols")
    missing_targets = [c for c in target_cols if c not in X_full.columns]
    if missing_targets:
        raise ValueError(f"Target columns not in X_full: {missing_targets}")

    y = X_full[target_name].copy()
    X = X_full.drop(columns=target_cols).copy()

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    if sampling == "all":
        techniques = ["baseline", "undersample", "oversample", "smote"]
    else:
        techniques = [sampling]

    results: Dict[str, dict] = {}

    for tech in techniques:
        print(f"\n=== Technique: {tech.upper()} ===")

        fold_results: List[dict] = []
        best_params_per_fold: List[dict] = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if model_name in ["nb", "lr", "mlp"]:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_val = scaler.transform(X_val)

            resampler = _make_resampler(tech, random_state, y_train)
            if resampler is not None:
                X_train_rs, y_train_rs = resampler.fit_resample(X_train, y_train)
            else:
                X_train_rs, y_train_rs = X_train, y_train

            opt = optimizer_map[model_name]
            opt.fit(X_train_rs, y_train_rs)

            if model_name == "mlp":
                best_model = opt
                best_params_per_fold = {}
            else:
                best_model = opt.best_estimator_
                best_params_per_fold.append(getattr(opt, "best_params_", {}))

            y_pred = best_model.predict(X_val)

            if hasattr(best_model, "predict_proba"):
                y_pred_proba = best_model.predict_proba(X_val)[:, 1]
            else:
                scores = best_model.decision_function(X_val)
                smin, smax = scores.min(), scores.max()
                y_pred_proba = (scores - smin) / (smax - smin + 1e-9)

            if model_name == "xgb" and xgb_04:
                y_pred = (y_pred_proba >= 0.4).astype(int)

            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)
            f1 = f1_score(y_val, y_pred, zero_division=0)
            fbeta2 = fbeta_score(y_val, y_pred, beta=2, zero_division=0)
            roc = roc_auc_score(y_val, y_pred_proba)

            fold_metrics = {
                "fold": fold,
                "accuracy": accuracy_score(y_val, y_pred),
                "precision": prec,
                "sensitivity": rec,
                "f1_score": f1,
                "fbeta_2": fbeta2,
                "roc_auc": roc,
                "NNS": (1 / prec) if prec > 0 else np.inf,
                "best_params": getattr(opt, "best_params_", {}),
            }
            fold_results.append(fold_metrics)

            print({k: (round(v, 3) if isinstance(v, float) else v) for k, v in fold_metrics.items()})

        metrics = [
            "accuracy",
            "precision",
            "sensitivity",
            "f1_score",
            "fbeta_2",
            "roc_auc",
            "NNS",
        ]
        mean_std = {
            m: (np.mean([fr[m] for fr in fold_results]), np.std([fr[m] for fr in fold_results]))
            for m in metrics
        }

        print("\nMean scores across folds (", tech, "):")
        for m in metrics:
            mu, sd = mean_std[m]
            print(f"{m}: {mu:.3f} \u00B1 {sd:.3f}")

        summary_df = pd.DataFrame(fold_results)
        results[tech] = {
            "fold_results": fold_results,
            "summary_metrics": summary_df,
            "best_params_per_fold": best_params_per_fold,
        }

    return results

## y_acs

In [6]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.737, 'precision': 0.028, 'sensitivity': 0.6, 'f1_score': 0.053, 'fbeta_2': 0.117, 'roc_auc': np.float64(0.754), 'NNS': 35.889, 'best_params': OrderedDict([('var_smoothing', 0.07413670746795804)])}
{'fold': 2, 'accuracy': 0.735, 'precision': 0.033, 'sensitivity': 0.786, 'f1_score': 0.064, 'fbeta_2': 0.142, 'roc_auc': np.float64(0.776), 'NNS': 30.091, 'best_params': OrderedDict([('var_smoothing', 0.09997099570381021)])}
{'fold': 3, 'accuracy': 0.759, 'precision': 0.036, 'sensitivity': 0.786, 'f1_score': 0.07, 'fbeta_2': 0.154, 'roc_auc': np.float64(0.781), 'NNS': 27.455, 'best_params': OrderedDict([('var_smoothing', 0.09997099570381021)])}
{'fold': 4, 'accuracy': 0.786, 'precision': 0.023, 'sensitivity': 0.429, 'f1_score': 0.044, 'fbeta_2': 0.095, 'roc_auc': np.float64(0.722), 'NNS': 43.167, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 5, 'accuracy': 0.791, 'precision': 0.02, 'sensitivity': 0.333, 'f1_score': 0.038,

In [7]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.763, 'precision': 0.031, 'sensitivity': 0.6, 'f1_score': 0.059, 'fbeta_2': 0.128, 'roc_auc': np.float64(0.734), 'NNS': 32.444, 'best_params': OrderedDict([('C', 0.0002024092661971105), ('penalty', 'l2'), ('solver', 'liblinear')])}




{'fold': 2, 'accuracy': 0.8, 'precision': 0.033, 'sensitivity': 0.571, 'f1_score': 0.062, 'fbeta_2': 0.132, 'roc_auc': np.float64(0.766), 'NNS': 30.75, 'best_params': OrderedDict([('C', 0.0009398629982831738), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.758, 'precision': 0.036, 'sensitivity': 0.786, 'f1_score': 0.069, 'fbeta_2': 0.153, 'roc_auc': np.float64(0.839), 'NNS': 27.545, 'best_params': OrderedDict([('C', 0.00012728153334192775), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.774, 'precision': 0.022, 'sensitivity': 0.429, 'f1_score': 0.042, 'fbeta_2': 0.091, 'roc_auc': np.float64(0.682), 'NNS': 45.5, 'best_params': OrderedDict([('C', 0.0008344495266143242), ('penalty', 'l2'), ('solver', 'liblinear')])}




{'fold': 5, 'accuracy': 0.779, 'precision': 0.026, 'sensitivity': 0.467, 'f1_score': 0.049, 'fbeta_2': 0.107, 'roc_auc': np.float64(0.661), 'NNS': 38.286, 'best_params': OrderedDict([('C', 0.0002976974695232891), ('penalty', 'l2'), ('solver', 'liblinear')])}

Mean scores across folds ( undersample ):
accuracy: 0.775 ± 0.015
precision: 0.030 ± 0.005
sensitivity: 0.570 ± 0.125
f1_score: 0.056 ± 0.010
fbeta_2: 0.122 ± 0.021
roc_auc: 0.736 ± 0.064
NNS: 34.905 ± 6.343


In [8]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.86, 'precision': 0.03, 'sensitivity': 0.333, 'f1_score': 0.055, 'fbeta_2': 0.111, 'roc_auc': np.float64(0.531), 'NNS': 33.2, 'best_params': OrderedDict([('C', 209.58791200208904), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.84, 'precision': 0.031, 'sensitivity': 0.429, 'f1_score': 0.058, 'fbeta_2': 0.12, 'roc_auc': np.float64(0.611), 'NNS': 32.167, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.841, 'precision': 0.031, 'sensitivity': 0.429, 'f1_score': 0.058, 'fbeta_2': 0.121, 'roc_auc': np.float64(0.61), 'NNS': 32.0, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.855, 'precision': 0.006, 'sensitivity': 0.071, 'f1_score': 0.011, 'fbeta_2': 0.023, 'roc_auc': np.float64(0.608), 'NNS': 165.0, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2

In [9]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.856, 'precision': 0.029, 'sensitivity': 0.333, 'f1_score': 0.054, 'fbeta_2': 0.109, 'roc_auc': np.float64(0.523), 'NNS': 34.0, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.845, 'precision': 0.027, 'sensitivity': 0.357, 'f1_score': 0.05, 'fbeta_2': 0.104, 'roc_auc': np.float64(0.616), 'NNS': 37.0, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.837, 'precision': 0.021, 'sensitivity': 0.286, 'f1_score': 0.039, 'fbeta_2': 0.081, 'roc_auc': np.float64(0.61), 'NNS': 48.0, 'best_params': OrderedDict([('C', 997.7505599555375), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.863, 'precision': 0.013, 'sensitivity': 0.143, 'f1_score': 0.023, 'fbeta_2': 0.047, 'roc_auc': np.float64(0.583), 'NNS': 78.5, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')]

In [10]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.935, 'precision': 0.029, 'sensitivity': 0.133, 'f1_score': 0.048, 'fbeta_2': 0.078, 'roc_auc': np.float64(0.528), 'NNS': 34.0, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 19), ('min_samples_leaf', 7), ('min_samples_split', 20)])}
{'fold': 2, 'accuracy': 0.912, 'precision': 0.04, 'sensitivity': 0.286, 'f1_score': 0.07, 'fbeta_2': 0.127, 'roc_auc': np.float64(0.592), 'NNS': 25.25, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 13), ('min_samples_leaf', 2), ('min_samples_split', 20)])}
{'fold': 3, 'accuracy': 0.883, 'precision': 0.068, 'sensitivity': 0.714, 'f1_score': 0.123, 'fbeta_2': 0.245, 'roc_auc': np.float64(0.764), 'NNS': 14.8, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 20), ('min_samples_leaf', 1), ('min_samples_split', 3)])}
{'fold': 4, 'accuracy': 0.883, 'precision': 0.029, 'sensitivity': 0.286, 'f1_score': 0.053, 'fbeta_2': 0.104, 'roc_auc': np.float64(0

In [11]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.904, 'precision': 0.019, 'sensitivity': 0.133, 'f1_score': 0.033, 'fbeta_2': 0.06, 'roc_auc': np.float64(0.471), 'NNS': 53.0, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 50)])}
{'fold': 2, 'accuracy': 0.939, 'precision': 0.045, 'sensitivity': 0.214, 'f1_score': 0.075, 'fbeta_2': 0.123, 'roc_auc': np.float64(0.622), 'NNS': 22.0, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 7), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 3, 'accuracy': 0.888, 'precision': 0.058, 'sensitivity': 0.571, 'f1_score': 0.105, 'fbeta_2': 0.205, 'roc_auc': np.float64(0.781), 'NNS': 17.375, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 151)])}
{

In [12]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.946, 'precision': 0.019, 'sensitivity': 0.067, 'f1_score': 0.029, 'fbeta_2': 0.044, 'roc_auc': np.float64(0.629), 'NNS': 53.0, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.961, 'precision': 0.077, 'sensitivity': 0.214, 'f1_score': 0.113, 'fbeta_2': 0.158, 'roc_auc': np.float64(0.736), 'NNS': 13.0, 'best_params': OrderedDict([('colsample_bytree', 0.502942409585224), ('gamma', 0), ('learning_rate', 0.09521638492107078), ('max_depth', 47), ('min_child_weight', 6), ('n_estimators', 500), ('subsample', 1.0)])}
{'fold': 3, 'accuracy': 0.972, 'precision': 0.143, 'sensitivity': 0.286, 'f1_score': 0.19, 'fbeta_2': 0.238, 'roc_auc': np.float64(0.797), 'NNS': 7.0, 'best_params': OrderedDict([('colsample_bytree', 0.8631300334967597), ('gamma', 4), ('learning_rate', 0.08994426820681

In [7]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.928, 'precision': 0.038, 'sensitivity': 0.2, 'f1_score': 0.064, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.629), 'NNS': 26.333, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.944, 'precision': 0.065, 'sensitivity': 0.286, 'f1_score': 0.105, 'fbeta_2': 0.169, 'roc_auc': np.float64(0.736), 'NNS': 15.5, 'best_params': OrderedDict([('colsample_bytree', 0.502942409585224), ('gamma', 0), ('learning_rate', 0.09521638492107078), ('max_depth', 47), ('min_child_weight', 6), ('n_estimators', 500), ('subsample', 1.0)])}
{'fold': 3, 'accuracy': 0.961, 'precision': 0.116, 'sensitivity': 0.357, 'f1_score': 0.175, 'fbeta_2': 0.253, 'roc_auc': np.float64(0.797), 'NNS': 8.6, 'best_params': OrderedDict([('colsample_bytree', 0.8631300334967597), ('gamma', 4), ('learning_rate', 0.0899442682068

In [14]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.95, 'precision': 0.021, 'sensitivity': 0.067, 'f1_score': 0.032, 'fbeta_2': 0.046, 'roc_auc': np.float64(0.681), 'NNS': 48.0, 'best_params': {}}
{'fold': 2, 'accuracy': 0.939, 'precision': 0.095, 'sensitivity': 0.5, 'f1_score': 0.159, 'fbeta_2': 0.269, 'roc_auc': np.float64(0.762), 'NNS': 10.571, 'best_params': {}}
{'fold': 3, 'accuracy': 0.951, 'precision': 0.058, 'sensitivity': 0.214, 'f1_score': 0.091, 'fbeta_2': 0.139, 'roc_auc': np.float64(0.726), 'NNS': 17.333, 'best_params': {}}
{'fold': 4, 'accuracy': 0.96, 'precision': 0.051, 'sensitivity': 0.143, 'f1_score': 0.075, 'fbeta_2': 0.105, 'roc_auc': np.float64(0.631), 'NNS': 19.5, 'best_params': {}}
{'fold': 5, 'accuracy': 0.949, 'precision': 0.039, 'sensitivity': 0.133, 'f1_score': 0.061, 'fbeta_2': 0.09, 'roc_auc': np.float64(0.635), 'NNS': 25.5, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.950 ± 0.007
precision: 0.053 ± 0.024
sensitivity: 0.21

## y_cvdeath_6_months

In [15]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.471, 'precision': 0.056, 'sensitivity': 0.844, 'f1_score': 0.105, 'fbeta_2': 0.222, 'roc_auc': np.float64(0.688), 'NNS': 17.789, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 2, 'accuracy': 0.544, 'precision': 0.057, 'sensitivity': 0.75, 'f1_score': 0.106, 'fbeta_2': 0.219, 'roc_auc': np.float64(0.71), 'NNS': 17.485, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.459, 'precision': 0.056, 'sensitivity': 0.886, 'f1_score': 0.106, 'fbeta_2': 0.224, 'roc_auc': np.float64(0.748), 'NNS': 17.769, 'best_params': OrderedDict([('var_smoothing', 0.09706226232971742)])}
{'fold': 4, 'accuracy': 0.538, 'precision': 0.058, 'sensitivity': 0.756, 'f1_score': 0.108, 'fbeta_2': 0.222, 'roc_auc': np.float64(0.694), 'NNS': 17.235, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 5, 'accuracy': 0.561, 'precision': 0.075, 'sensitivity': 0.956, 'f1_score': 0.138, 'fbeta_2': 0.284, 'roc_auc': 

In [16]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.683, 'precision': 0.083, 'sensitivity': 0.756, 'f1_score': 0.15, 'fbeta_2': 0.289, 'roc_auc': np.float64(0.76), 'NNS': 12.029, 'best_params': OrderedDict([('C', 0.0006045989833825082), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.781, 'precision': 0.089, 'sensitivity': 0.545, 'f1_score': 0.152, 'fbeta_2': 0.268, 'roc_auc': np.float64(0.758), 'NNS': 11.292, 'best_params': OrderedDict([('C', 39.5252013727049), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.758, 'precision': 0.083, 'sensitivity': 0.568, 'f1_score': 0.145, 'fbeta_2': 0.262, 'roc_auc': np.float64(0.725), 'NNS': 12.04, 'best_params': OrderedDict([('C', 14.104866592267824), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.603, 'precision': 0.074, 'sensitivity': 0.844, 'f1_score': 0.136, 'fbeta_2': 0.273, 'roc_auc': np.float64(0.765), 'NNS': 13.553, 'best_params': OrderedDict([('C', 0.000106025944

In [17]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.708, 'precision': 0.081, 'sensitivity': 0.667, 'f1_score': 0.144, 'fbeta_2': 0.272, 'roc_auc': np.float64(0.767), 'NNS': 12.367, 'best_params': OrderedDict([('C', 0.0010448997386369905), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.704, 'precision': 0.084, 'sensitivity': 0.727, 'f1_score': 0.151, 'fbeta_2': 0.288, 'roc_auc': np.float64(0.762), 'NNS': 11.875, 'best_params': OrderedDict([('C', 0.0010143419976671108), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.759, 'precision': 0.072, 'sensitivity': 0.477, 'f1_score': 0.125, 'fbeta_2': 0.224, 'roc_auc': np.float64(0.7), 'NNS': 13.905, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.678, 'precision': 0.076, 'sensitivity': 0.689, 'f1_score': 0.137, 'fbeta_2': 0.263, 'roc_auc': np.float64(0.752), 'NNS': 13.194, 'best_params': OrderedDict([('C', 0.0007207453777636449), (

In [18]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.643, 'precision': 0.065, 'sensitivity': 0.644, 'f1_score': 0.118, 'fbeta_2': 0.231, 'roc_auc': np.float64(0.682), 'NNS': 15.448, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 5), ('min_samples_leaf', 3), ('min_samples_split', 3)])}
{'fold': 2, 'accuracy': 0.8, 'precision': 0.037, 'sensitivity': 0.182, 'f1_score': 0.062, 'fbeta_2': 0.102, 'roc_auc': np.float64(0.54), 'NNS': 26.875, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 7), ('min_samples_leaf', 4), ('min_samples_split', 12)])}
{'fold': 3, 'accuracy': 0.595, 'precision': 0.057, 'sensitivity': 0.659, 'f1_score': 0.105, 'fbeta_2': 0.212, 'roc_auc': np.float64(0.661), 'NNS': 17.483, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 7), ('min_samples_split', 2)])}
{'fold': 4, 'accuracy': 0.704, 'precision': 0.064, 'sensitivity': 0.511, 'f1_score': 0.113, 'fbeta_2': 0.213, 'roc_auc': np.float64(0

In [19]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.822, 'precision': 0.098, 'sensitivity': 0.467, 'f1_score': 0.162, 'fbeta_2': 0.266, 'roc_auc': np.float64(0.749), 'NNS': 10.19, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 46), ('max_features', None), ('min_samples_leaf', 6), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 2, 'accuracy': 0.709, 'precision': 0.055, 'sensitivity': 0.432, 'f1_score': 0.097, 'fbeta_2': 0.181, 'roc_auc': np.float64(0.652), 'NNS': 18.316, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 38), ('max_features', None), ('min_samples_leaf', 9), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 3, 'accuracy': 0.837, 'precision': 0.107, 'sensitivity': 0.477, 'f1_score': 0.174, 'fbeta_2': 0.282, 'roc_auc': np.float64(0.793), 'NNS': 9.381, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', 'sqrt'), ('min_samples_leaf', 4), ('min_samples_split', 3), ('n_estimators', 50)

In [20]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.822, 'precision': 0.106, 'sensitivity': 0.511, 'f1_score': 0.175, 'fbeta_2': 0.289, 'roc_auc': np.float64(0.753), 'NNS': 9.478, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 5), ('learning_rate', 0.07277726751537635), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.837, 'precision': 0.122, 'sensitivity': 0.568, 'f1_score': 0.201, 'fbeta_2': 0.328, 'roc_auc': np.float64(0.764), 'NNS': 8.2, 'best_params': OrderedDict([('colsample_bytree', 0.8058839213244959), ('gamma', 0), ('learning_rate', 0.05138958093803986), ('max_depth', 1), ('min_child_weight', 9), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.788, 'precision': 0.085, 'sensitivity': 0.5, 'f1_score': 0.146, 'fbeta_2': 0.253, 'roc_auc': np.float64(0.784), 'NNS': 11.727, 'best_params': OrderedDict([('colsample_bytree', 0.705051979426657), ('gamma', 4), ('learning_rate',

In [21]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.752, 'precision': 0.095, 'sensitivity': 0.667, 'f1_score': 0.166, 'fbeta_2': 0.302, 'roc_auc': np.float64(0.753), 'NNS': 10.567, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 5), ('learning_rate', 0.07277726751537635), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.764, 'precision': 0.094, 'sensitivity': 0.636, 'f1_score': 0.163, 'fbeta_2': 0.295, 'roc_auc': np.float64(0.764), 'NNS': 10.679, 'best_params': OrderedDict([('colsample_bytree', 0.8058839213244959), ('gamma', 0), ('learning_rate', 0.05138958093803986), ('max_depth', 1), ('min_child_weight', 9), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.709, 'precision': 0.088, 'sensitivity': 0.75, 'f1_score': 0.157, 'fbeta_2': 0.299, 'roc_auc': np.float64(0.784), 'NNS': 11.394, 'best_params': OrderedDict([('colsample_bytree', 0.705051979426657), ('gamma', 4), ('learning_r

In [22]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.783, 'precision': 0.093, 'sensitivity': 0.556, 'f1_score': 0.159, 'fbeta_2': 0.278, 'roc_auc': np.float64(0.74), 'NNS': 10.8, 'best_params': {}}
{'fold': 2, 'accuracy': 0.78, 'precision': 0.073, 'sensitivity': 0.432, 'f1_score': 0.124, 'fbeta_2': 0.217, 'roc_auc': np.float64(0.719), 'NNS': 13.789, 'best_params': {}}
{'fold': 3, 'accuracy': 0.752, 'precision': 0.078, 'sensitivity': 0.545, 'f1_score': 0.137, 'fbeta_2': 0.249, 'roc_auc': np.float64(0.707), 'NNS': 12.75, 'best_params': {}}
{'fold': 4, 'accuracy': 0.794, 'precision': 0.088, 'sensitivity': 0.489, 'f1_score': 0.149, 'fbeta_2': 0.256, 'roc_auc': np.float64(0.697), 'NNS': 11.364, 'best_params': {}}
{'fold': 5, 'accuracy': 0.778, 'precision': 0.099, 'sensitivity': 0.622, 'f1_score': 0.171, 'fbeta_2': 0.303, 'roc_auc': np.float64(0.792), 'NNS': 10.071, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.777 ± 0.014
precision: 0.086 ± 0.010
sensitivity

## y_death_6_months

In [23]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.467, 'precision': 0.077, 'sensitivity': 0.9, 'f1_score': 0.142, 'fbeta_2': 0.288, 'roc_auc': np.float64(0.708), 'NNS': 12.926, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 2, 'accuracy': 0.401, 'precision': 0.064, 'sensitivity': 0.831, 'f1_score': 0.118, 'fbeta_2': 0.244, 'roc_auc': np.float64(0.69), 'NNS': 15.694, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.433, 'precision': 0.074, 'sensitivity': 0.932, 'f1_score': 0.138, 'fbeta_2': 0.281, 'roc_auc': np.float64(0.726), 'NNS': 13.473, 'best_params': OrderedDict([('var_smoothing', 0.09706226232971742)])}
{'fold': 4, 'accuracy': 0.392, 'precision': 0.064, 'sensitivity': 0.847, 'f1_score': 0.119, 'fbeta_2': 0.246, 'roc_auc': np.float64(0.689), 'NNS': 15.62, 'best_params': OrderedDict([('var_smoothing', 0.09706226232971742)])}
{'fold': 5, 'accuracy': 0.457, 'precision': 0.075, 'sensitivity': 0.883, 'f1_score': 0.138, 'fbeta_2': 0.28

In [24]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.627, 'precision': 0.099, 'sensitivity': 0.817, 'f1_score': 0.177, 'fbeta_2': 0.334, 'roc_auc': np.float64(0.766), 'NNS': 10.061, 'best_params': OrderedDict([('C', 0.0002477909077261299), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.613, 'precision': 0.101, 'sensitivity': 0.881, 'f1_score': 0.181, 'fbeta_2': 0.346, 'roc_auc': np.float64(0.802), 'NNS': 9.923, 'best_params': OrderedDict([('C', 0.00023965993216682645), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.649, 'precision': 0.103, 'sensitivity': 0.814, 'f1_score': 0.184, 'fbeta_2': 0.343, 'roc_auc': np.float64(0.786), 'NNS': 9.667, 'best_params': OrderedDict([('C', 0.0004005975387785025), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.602, 'precision': 0.087, 'sensitivity': 0.763, 'f1_score': 0.157, 'fbeta_2': 0.299, 'roc_auc': np.float64(0.752), 'NNS': 11.467, 'best_params': OrderedDict([('C', 0.00

In [25]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.688, 'precision': 0.1, 'sensitivity': 0.667, 'f1_score': 0.174, 'fbeta_2': 0.312, 'roc_auc': np.float64(0.762), 'NNS': 10.0, 'best_params': OrderedDict([('C', 0.0016180906389551323), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.635, 'precision': 0.106, 'sensitivity': 0.881, 'f1_score': 0.189, 'fbeta_2': 0.358, 'roc_auc': np.float64(0.796), 'NNS': 9.423, 'best_params': OrderedDict([('C', 0.0003783559043158893), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.631, 'precision': 0.1, 'sensitivity': 0.831, 'f1_score': 0.179, 'fbeta_2': 0.338, 'roc_auc': np.float64(0.787), 'NNS': 9.959, 'best_params': OrderedDict([('C', 0.0004203599153293223), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.654, 'precision': 0.1, 'sensitivity': 0.763, 'f1_score': 0.176, 'fbeta_2': 0.327, 'roc_auc': np.float64(0.755), 'NNS': 10.044, 'best_params': OrderedDict([('C', 0.0004410040998609

In [26]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.585, 'precision': 0.086, 'sensitivity': 0.767, 'f1_score': 0.154, 'fbeta_2': 0.296, 'roc_auc': np.float64(0.648), 'NNS': 11.696, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 8), ('min_samples_split', 2)])}
{'fold': 2, 'accuracy': 0.516, 'precision': 0.075, 'sensitivity': 0.797, 'f1_score': 0.138, 'fbeta_2': 0.273, 'roc_auc': np.float64(0.698), 'NNS': 13.277, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 3), ('min_samples_leaf', 2), ('min_samples_split', 20)])}
{'fold': 3, 'accuracy': 0.622, 'precision': 0.096, 'sensitivity': 0.814, 'f1_score': 0.172, 'fbeta_2': 0.327, 'roc_auc': np.float64(0.726), 'NNS': 10.375, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 3), ('min_samples_split', 19)])}
{'fold': 4, 'accuracy': 0.628, 'precision': 0.084, 'sensitivity': 0.678, 'f1_score': 0.15, 'fbeta_2': 0.282, 'roc_auc': np.float6

In [27]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.664, 'precision': 0.089, 'sensitivity': 0.633, 'f1_score': 0.156, 'fbeta_2': 0.285, 'roc_auc': np.float64(0.741), 'NNS': 11.211, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 9), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 127)])}
{'fold': 2, 'accuracy': 0.584, 'precision': 0.085, 'sensitivity': 0.78, 'f1_score': 0.154, 'fbeta_2': 0.296, 'roc_auc': np.float64(0.755), 'NNS': 11.739, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 20), ('min_samples_split', 2), ('n_estimators', 50)])}
{'fold': 3, 'accuracy': 0.72, 'precision': 0.133, 'sensitivity': 0.864, 'f1_score': 0.23, 'fbeta_2': 0.411, 'roc_auc': np.float64(0.824), 'NNS': 7.529, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 41), ('max_features', 'sqrt'), ('min_samples_leaf', 7), ('min_samples_split', 19), ('n_estimators', 300)])}

In [28]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.696, 'precision': 0.103, 'sensitivity': 0.667, 'f1_score': 0.178, 'fbeta_2': 0.317, 'roc_auc': np.float64(0.765), 'NNS': 9.75, 'best_params': OrderedDict([('colsample_bytree', 0.7514697780094015), ('gamma', 5), ('learning_rate', 0.01644364625714213), ('max_depth', 24), ('min_child_weight', 4), ('n_estimators', 150), ('subsample', 0.8408090697050196)])}
{'fold': 2, 'accuracy': 0.688, 'precision': 0.113, 'sensitivity': 0.797, 'f1_score': 0.198, 'fbeta_2': 0.361, 'roc_auc': np.float64(0.786), 'NNS': 8.83, 'best_params': OrderedDict([('colsample_bytree', 0.6485213058585517), ('gamma', 3), ('learning_rate', 0.1), ('max_depth', 10), ('min_child_weight', 1), ('n_estimators', 348), ('subsample', 0.9401988356446473)])}
{'fold': 3, 'accuracy': 0.731, 'precision': 0.129, 'sensitivity': 0.797, 'f1_score': 0.223, 'fbeta_2': 0.392, 'roc_auc': np.float64(0.821), 'NNS': 7.723, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 2),

In [7]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.594, 'precision': 0.099, 'sensitivity': 0.9, 'f1_score': 0.179, 'fbeta_2': 0.345, 'roc_auc': np.float64(0.765), 'NNS': 10.056, 'best_params': OrderedDict([('colsample_bytree', 0.7514697780094015), ('gamma', 5), ('learning_rate', 0.01644364625714213), ('max_depth', 24), ('min_child_weight', 4), ('n_estimators', 150), ('subsample', 0.8408090697050196)])}
{'fold': 2, 'accuracy': 0.605, 'precision': 0.099, 'sensitivity': 0.881, 'f1_score': 0.178, 'fbeta_2': 0.341, 'roc_auc': np.float64(0.786), 'NNS': 10.115, 'best_params': OrderedDict([('colsample_bytree', 0.6485213058585517), ('gamma', 3), ('learning_rate', 0.1), ('max_depth', 10), ('min_child_weight', 1), ('n_estimators', 348), ('subsample', 0.9401988356446473)])}
{'fold': 3, 'accuracy': 0.627, 'precision': 0.104, 'sensitivity': 0.881, 'f1_score': 0.186, 'fbeta_2': 0.354, 'roc_auc': np.float64(0.821), 'NNS': 9.596, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 2

In [30]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.7, 'precision': 0.106, 'sensitivity': 0.683, 'f1_score': 0.183, 'fbeta_2': 0.326, 'roc_auc': np.float64(0.727), 'NNS': 9.463, 'best_params': {}}
{'fold': 2, 'accuracy': 0.664, 'precision': 0.099, 'sensitivity': 0.729, 'f1_score': 0.174, 'fbeta_2': 0.32, 'roc_auc': np.float64(0.745), 'NNS': 10.14, 'best_params': {}}
{'fold': 3, 'accuracy': 0.69, 'precision': 0.096, 'sensitivity': 0.644, 'f1_score': 0.167, 'fbeta_2': 0.301, 'roc_auc': np.float64(0.748), 'NNS': 10.395, 'best_params': {}}
{'fold': 4, 'accuracy': 0.705, 'precision': 0.099, 'sensitivity': 0.627, 'f1_score': 0.171, 'fbeta_2': 0.303, 'roc_auc': np.float64(0.7), 'NNS': 10.108, 'best_params': {}}
{'fold': 5, 'accuracy': 0.704, 'precision': 0.095, 'sensitivity': 0.583, 'f1_score': 0.163, 'fbeta_2': 0.287, 'roc_auc': np.float64(0.734), 'NNS': 10.571, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.693 ± 0.015
precision: 0.099 ± 0.004
sensitivity: 0

## y_hf_6_months

In [31]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.583, 'precision': 0.195, 'sensitivity': 0.755, 'f1_score': 0.31, 'fbeta_2': 0.479, 'roc_auc': np.float64(0.691), 'NNS': 5.132, 'best_params': OrderedDict([('var_smoothing', 0.026166382362478202)])}
{'fold': 2, 'accuracy': 0.637, 'precision': 0.219, 'sensitivity': 0.76, 'f1_score': 0.34, 'fbeta_2': 0.509, 'roc_auc': np.float64(0.729), 'NNS': 4.561, 'best_params': OrderedDict([('var_smoothing', 0.03146530290185664)])}
{'fold': 3, 'accuracy': 0.626, 'precision': 0.213, 'sensitivity': 0.748, 'f1_score': 0.331, 'fbeta_2': 0.498, 'roc_auc': np.float64(0.729), 'NNS': 4.699, 'best_params': OrderedDict([('var_smoothing', 0.0024495278585455124)])}
{'fold': 4, 'accuracy': 0.638, 'precision': 0.211, 'sensitivity': 0.702, 'f1_score': 0.325, 'fbeta_2': 0.479, 'roc_auc': np.float64(0.708), 'NNS': 4.736, 'best_params': OrderedDict([('var_smoothing', 0.01963655691543654)])}
{'fold': 5, 'accuracy': 0.572, 'precision': 0.203, 'sensitivity': 0.834, 'f

In [32]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.628, 'precision': 0.226, 'sensitivity': 0.828, 'f1_score': 0.356, 'fbeta_2': 0.541, 'roc_auc': np.float64(0.767), 'NNS': 4.416, 'best_params': OrderedDict([('C', 0.0025366695112235263), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.594, 'precision': 0.208, 'sensitivity': 0.82, 'f1_score': 0.332, 'fbeta_2': 0.516, 'roc_auc': np.float64(0.757), 'NNS': 4.805, 'best_params': OrderedDict([('C', 0.002846791706196798), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.666, 'precision': 0.246, 'sensitivity': 0.821, 'f1_score': 0.379, 'fbeta_2': 0.56, 'roc_auc': np.float64(0.792), 'NNS': 4.065, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.657, 'precision': 0.23, 'sensitivity': 0.755, 'f1_score': 0.353, 'fbeta_2': 0.519, 'roc_auc': np.float64(0.768), 'NNS': 4.342, 'best_params': OrderedDict([('C', 0.0001), (

In [33]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.641, 'precision': 0.224, 'sensitivity': 0.768, 'f1_score': 0.346, 'fbeta_2': 0.516, 'roc_auc': np.float64(0.761), 'NNS': 4.474, 'best_params': OrderedDict([('C', 0.0002886425671572526), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.664, 'precision': 0.236, 'sensitivity': 0.773, 'f1_score': 0.362, 'fbeta_2': 0.532, 'roc_auc': np.float64(0.779), 'NNS': 4.233, 'best_params': OrderedDict([('C', 0.000452346936738668), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.705, 'precision': 0.265, 'sensitivity': 0.775, 'f1_score': 0.395, 'fbeta_2': 0.559, 'roc_auc': np.float64(0.801), 'NNS': 3.778, 'best_params': OrderedDict([('C', 0.0006013724406337032), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.694, 'precision': 0.256, 'sensitivity': 0.768, 'f1_score': 0.383, 'fbeta_2': 0.548, 'roc_auc': np.float64(0.779), 'NNS': 3.914, 'best_params': OrderedDict([('C', 0.00053467511

In [34]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.686, 'precision': 0.246, 'sensitivity': 0.742, 'f1_score': 0.369, 'fbeta_2': 0.528, 'roc_auc': np.float64(0.732), 'NNS': 4.071, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 6), ('min_samples_leaf', 1), ('min_samples_split', 13)])}
{'fold': 2, 'accuracy': 0.739, 'precision': 0.279, 'sensitivity': 0.707, 'f1_score': 0.4, 'fbeta_2': 0.541, 'roc_auc': np.float64(0.739), 'NNS': 3.585, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 10), ('min_samples_split', 2)])}
{'fold': 3, 'accuracy': 0.773, 'precision': 0.306, 'sensitivity': 0.656, 'f1_score': 0.417, 'fbeta_2': 0.533, 'roc_auc': np.float64(0.785), 'NNS': 3.273, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 8), ('min_samples_split', 10)])}
{'fold': 4, 'accuracy': 0.75, 'precision': 0.282, 'sensitivity': 0.662, 'f1_score': 0.396, 'fbeta_2': 0.522, 'roc_auc': np.float64

In [35]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.633, 'precision': 0.226, 'sensitivity': 0.808, 'f1_score': 0.353, 'fbeta_2': 0.533, 'roc_auc': np.float64(0.755), 'NNS': 4.426, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 129)])}
{'fold': 2, 'accuracy': 0.688, 'precision': 0.249, 'sensitivity': 0.76, 'f1_score': 0.375, 'fbeta_2': 0.539, 'roc_auc': np.float64(0.784), 'NNS': 4.018, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 49), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 154)])}
{'fold': 3, 'accuracy': 0.736, 'precision': 0.291, 'sensitivity': 0.788, 'f1_score': 0.425, 'fbeta_2': 0.587, 'roc_auc': np.float64(0.816), 'NNS': 3.437, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 11), ('n_estimators', 300)])

In [36]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.683, 'precision': 0.25, 'sensitivity': 0.781, 'f1_score': 0.379, 'fbeta_2': 0.548, 'roc_auc': np.float64(0.78), 'NNS': 4.0, 'best_params': OrderedDict([('colsample_bytree', 0.5728999910361396), ('gamma', 2), ('learning_rate', 0.00359316963595307), ('max_depth', 1), ('min_child_weight', 11), ('n_estimators', 467), ('subsample', 0.7932956159655994)])}
{'fold': 2, 'accuracy': 0.623, 'precision': 0.213, 'sensitivity': 0.767, 'f1_score': 0.334, 'fbeta_2': 0.505, 'roc_auc': np.float64(0.767), 'NNS': 4.687, 'best_params': OrderedDict([('colsample_bytree', 0.7824066520512818), ('gamma', 5), ('learning_rate', 0.001), ('max_depth', 34), ('min_child_weight', 20), ('n_estimators', 18), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.762, 'precision': 0.316, 'sensitivity': 0.788, 'f1_score': 0.451, 'fbeta_2': 0.607, 'roc_auc': np.float64(0.823), 'NNS': 3.168, 'best_params': OrderedDict([('colsample_bytree', 0.5018151536273716), ('gamma', 4),

In [37]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.752, 'precision': 0.095, 'sensitivity': 0.667, 'f1_score': 0.166, 'fbeta_2': 0.302, 'roc_auc': np.float64(0.753), 'NNS': 10.567, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 5), ('learning_rate', 0.07277726751537635), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.764, 'precision': 0.094, 'sensitivity': 0.636, 'f1_score': 0.163, 'fbeta_2': 0.295, 'roc_auc': np.float64(0.764), 'NNS': 10.679, 'best_params': OrderedDict([('colsample_bytree', 0.8058839213244959), ('gamma', 0), ('learning_rate', 0.05138958093803986), ('max_depth', 1), ('min_child_weight', 9), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.709, 'precision': 0.088, 'sensitivity': 0.75, 'f1_score': 0.157, 'fbeta_2': 0.299, 'roc_auc': np.float64(0.784), 'NNS': 11.394, 'best_params': OrderedDict([('colsample_bytree', 0.705051979426657), ('gamma', 4), ('learning_r

In [38]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.642, 'precision': 0.202, 'sensitivity': 0.642, 'f1_score': 0.307, 'fbeta_2': 0.447, 'roc_auc': np.float64(0.709), 'NNS': 4.948, 'best_params': {}}
{'fold': 2, 'accuracy': 0.681, 'precision': 0.236, 'sensitivity': 0.707, 'f1_score': 0.353, 'fbeta_2': 0.505, 'roc_auc': np.float64(0.748), 'NNS': 4.245, 'best_params': {}}
{'fold': 3, 'accuracy': 0.676, 'precision': 0.229, 'sensitivity': 0.682, 'f1_score': 0.343, 'fbeta_2': 0.489, 'roc_auc': np.float64(0.729), 'NNS': 4.369, 'best_params': {}}
{'fold': 4, 'accuracy': 0.7, 'precision': 0.248, 'sensitivity': 0.695, 'f1_score': 0.365, 'fbeta_2': 0.511, 'roc_auc': np.float64(0.742), 'NNS': 4.038, 'best_params': {}}
{'fold': 5, 'accuracy': 0.664, 'precision': 0.222, 'sensitivity': 0.682, 'f1_score': 0.335, 'fbeta_2': 0.482, 'roc_auc': np.float64(0.726), 'NNS': 4.505, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.673 ± 0.019
precision: 0.227 ± 0.015
sensitivity: 

## y_inp_6_months

In [39]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.55, 'precision': 0.281, 'sensitivity': 0.705, 'f1_score': 0.402, 'fbeta_2': 0.541, 'roc_auc': np.float64(0.643), 'NNS': 3.56, 'best_params': OrderedDict([('var_smoothing', 3.113444056694532e-05)])}
{'fold': 2, 'accuracy': 0.545, 'precision': 0.283, 'sensitivity': 0.738, 'f1_score': 0.409, 'fbeta_2': 0.559, 'roc_auc': np.float64(0.666), 'NNS': 3.531, 'best_params': OrderedDict([('var_smoothing', 2.982994486439461e-05)])}
{'fold': 3, 'accuracy': 0.573, 'precision': 0.282, 'sensitivity': 0.646, 'f1_score': 0.393, 'fbeta_2': 0.513, 'roc_auc': np.float64(0.633), 'NNS': 3.548, 'best_params': OrderedDict([('var_smoothing', 2.014201121331883e-09)])}
{'fold': 4, 'accuracy': 0.308, 'precision': 0.227, 'sensitivity': 0.931, 'f1_score': 0.365, 'fbeta_2': 0.574, 'roc_auc': np.float64(0.638), 'NNS': 4.409, 'best_params': OrderedDict([('var_smoothing', 2.1287504812695306e-08)])}
{'fold': 5, 'accuracy': 0.568, 'precision': 0.304, 'sensitivity': 0.

In [40]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.618, 'precision': 0.308, 'sensitivity': 0.632, 'f1_score': 0.415, 'fbeta_2': 0.522, 'roc_auc': np.float64(0.681), 'NNS': 3.242, 'best_params': OrderedDict([('C', 0.0003054260119367861), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.613, 'precision': 0.321, 'sensitivity': 0.727, 'f1_score': 0.445, 'fbeta_2': 0.58, 'roc_auc': np.float64(0.696), 'NNS': 3.116, 'best_params': OrderedDict([('C', 0.008016892136734904), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.626, 'precision': 0.318, 'sensitivity': 0.658, 'f1_score': 0.429, 'fbeta_2': 0.542, 'roc_auc': np.float64(0.665), 'NNS': 3.14, 'best_params': OrderedDict([('C', 0.00024219012884038265), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.599, 'precision': 0.302, 'sensitivity': 0.673, 'f1_score': 0.417, 'fbeta_2': 0.54, 'roc_auc': np.float64(0.67), 'NNS': 3.309, 'best_params': OrderedDict([('C', 0.000139532

In [41]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.616, 'precision': 0.312, 'sensitivity': 0.659, 'f1_score': 0.424, 'fbeta_2': 0.539, 'roc_auc': np.float64(0.688), 'NNS': 3.203, 'best_params': OrderedDict([('C', 0.0005144932011626379), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.603, 'precision': 0.316, 'sensitivity': 0.735, 'f1_score': 0.442, 'fbeta_2': 0.581, 'roc_auc': np.float64(0.685), 'NNS': 3.168, 'best_params': OrderedDict([('C', 0.009110682973135314), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.599, 'precision': 0.3, 'sensitivity': 0.658, 'f1_score': 0.412, 'fbeta_2': 0.531, 'roc_auc': np.float64(0.659), 'NNS': 3.333, 'best_params': OrderedDict([('C', 0.007725145380971576), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.589, 'precision': 0.292, 'sensitivity': 0.65, 'f1_score': 0.403, 'fbeta_2': 0.522, 'roc_auc': np.float64(0.654), 'NNS': 3.42, 'best_params': OrderedDict([('C', 0.0010325093261771

In [42]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.675, 'precision': 0.32, 'sensitivity': 0.46, 'f1_score': 0.377, 'fbeta_2': 0.423, 'roc_auc': np.float64(0.642), 'NNS': 3.125, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 5), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 2, 'accuracy': 0.607, 'precision': 0.302, 'sensitivity': 0.642, 'f1_score': 0.411, 'fbeta_2': 0.524, 'roc_auc': np.float64(0.661), 'NNS': 3.311, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 4), ('min_samples_leaf', 2), ('min_samples_split', 20)])}
{'fold': 3, 'accuracy': 0.623, 'precision': 0.297, 'sensitivity': 0.558, 'f1_score': 0.387, 'fbeta_2': 0.474, 'roc_auc': np.float64(0.604), 'NNS': 3.372, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 7), ('min_samples_leaf', 1), ('min_samples_split', 20)])}
{'fold': 4, 'accuracy': 0.635, 'precision': 0.304, 'sensitivity': 0.554, 'f1_score': 0.393, 'fbeta_2': 0.476, 'roc_auc': np.float64(0.

In [43]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.533, 'precision': 0.287, 'sensitivity': 0.793, 'f1_score': 0.421, 'fbeta_2': 0.586, 'roc_auc': np.float64(0.67), 'NNS': 3.488, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 300)])}
{'fold': 2, 'accuracy': 0.516, 'precision': 0.28, 'sensitivity': 0.808, 'f1_score': 0.416, 'fbeta_2': 0.587, 'roc_auc': np.float64(0.662), 'NNS': 3.571, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 50)])}
{'fold': 3, 'accuracy': 0.493, 'precision': 0.268, 'sensitivity': 0.796, 'f1_score': 0.401, 'fbeta_2': 0.571, 'roc_auc': np.float64(0.603), 'NNS': 3.729, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 16), ('min_samples_split', 13), ('n_estimators', 299)])}
{'fo

In [44]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.533, 'precision': 0.287, 'sensitivity': 0.793, 'f1_score': 0.421, 'fbeta_2': 0.586, 'roc_auc': np.float64(0.637), 'NNS': 3.488, 'best_params': OrderedDict([('colsample_bytree', 0.7779534892952855), ('gamma', 1), ('learning_rate', 0.001), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 10), ('subsample', 1.0)])}
{'fold': 2, 'accuracy': 0.529, 'precision': 0.286, 'sensitivity': 0.804, 'f1_score': 0.421, 'fbeta_2': 0.59, 'roc_auc': np.float64(0.686), 'NNS': 3.502, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 5), ('learning_rate', 0.001), ('max_depth', 1), ('min_child_weight', 20), ('n_estimators', 500), ('subsample', 1.0)])}
{'fold': 3, 'accuracy': 0.511, 'precision': 0.276, 'sensitivity': 0.796, 'f1_score': 0.41, 'fbeta_2': 0.579, 'roc_auc': np.float64(0.663), 'NNS': 3.618, 'best_params': OrderedDict([('colsample_bytree', 0.6932622991980926), ('gamma', 1), ('learning_rate', 0.001), ('max_depth', 1),

In [45]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.214, 'precision': 0.214, 'sensitivity': 1.0, 'f1_score': 0.353, 'fbeta_2': 0.577, 'roc_auc': np.float64(0.637), 'NNS': 4.67, 'best_params': OrderedDict([('colsample_bytree', 0.7779534892952855), ('gamma', 1), ('learning_rate', 0.001), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 10), ('subsample', 1.0)])}
{'fold': 2, 'accuracy': 0.213, 'precision': 0.213, 'sensitivity': 1.0, 'f1_score': 0.352, 'fbeta_2': 0.576, 'roc_auc': np.float64(0.686), 'NNS': 4.685, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 5), ('learning_rate', 0.001), ('max_depth', 1), ('min_child_weight', 20), ('n_estimators', 500), ('subsample', 1.0)])}
{'fold': 3, 'accuracy': 0.213, 'precision': 0.213, 'sensitivity': 1.0, 'f1_score': 0.352, 'fbeta_2': 0.576, 'roc_auc': np.float64(0.663), 'NNS': 4.685, 'best_params': OrderedDict([('colsample_bytree', 0.6932622991980926), ('gamma', 1), ('learning_rate', 0.001), ('max_depth', 1), ('mi

In [46]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.591, 'precision': 0.28, 'sensitivity': 0.582, 'f1_score': 0.379, 'fbeta_2': 0.479, 'roc_auc': np.float64(0.615), 'NNS': 3.566, 'best_params': {}}
{'fold': 2, 'accuracy': 0.588, 'precision': 0.28, 'sensitivity': 0.592, 'f1_score': 0.38, 'fbeta_2': 0.484, 'roc_auc': np.float64(0.619), 'NNS': 3.571, 'best_params': {}}
{'fold': 3, 'accuracy': 0.594, 'precision': 0.281, 'sensitivity': 0.581, 'f1_score': 0.379, 'fbeta_2': 0.479, 'roc_auc': np.float64(0.607), 'NNS': 3.556, 'best_params': {}}
{'fold': 4, 'accuracy': 0.594, 'precision': 0.28, 'sensitivity': 0.573, 'f1_score': 0.376, 'fbeta_2': 0.474, 'roc_auc': np.float64(0.617), 'NNS': 3.577, 'best_params': {}}
{'fold': 5, 'accuracy': 0.583, 'precision': 0.279, 'sensitivity': 0.598, 'f1_score': 0.38, 'fbeta_2': 0.487, 'roc_auc': np.float64(0.639), 'NNS': 3.583, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.590 ± 0.004
precision: 0.280 ± 0.001
sensitivity: 0.5

## y_stk_or_aemb

In [47]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.235, 'precision': 0.024, 'sensitivity': 0.821, 'f1_score': 0.047, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.558), 'NNS': 41.348, 'best_params': OrderedDict([('var_smoothing', 0.09999650866365037)])}
{'fold': 2, 'accuracy': 0.156, 'precision': 0.024, 'sensitivity': 0.926, 'f1_score': 0.046, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.604), 'NNS': 42.04, 'best_params': OrderedDict([('var_smoothing', 0.0008980026880050335)])}
{'fold': 3, 'accuracy': 0.086, 'precision': 0.023, 'sensitivity': 0.963, 'f1_score': 0.045, 'fbeta_2': 0.104, 'roc_auc': np.float64(0.489), 'NNS': 43.769, 'best_params': OrderedDict([('var_smoothing', 1.3058969720445305e-11)])}
{'fold': 4, 'accuracy': 0.149, 'precision': 0.023, 'sensitivity': 0.889, 'f1_score': 0.044, 'fbeta_2': 0.103, 'roc_auc': np.float64(0.501), 'NNS': 44.083, 'best_params': OrderedDict([('var_smoothing', 0.008555461152760607)])}
{'fold': 5, 'accuracy': 0.236, 'precision': 0.022, 'sensitivity': 

In [48]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.544, 'precision': 0.029, 'sensitivity': 0.571, 'f1_score': 0.054, 'fbeta_2': 0.119, 'roc_auc': np.float64(0.624), 'NNS': 35.0, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.741, 'precision': 0.029, 'sensitivity': 0.333, 'f1_score': 0.054, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.606), 'NNS': 34.111, 'best_params': OrderedDict([('C', 5.81701237575795), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.738, 'precision': 0.047, 'sensitivity': 0.556, 'f1_score': 0.086, 'fbeta_2': 0.174, 'roc_auc': np.float64(0.623), 'NNS': 21.467, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.626, 'precision': 0.037, 'sensitivity': 0.63, 'f1_score': 0.069, 'fbeta_2': 0.149, 'roc_auc': np.float64(0.636), 'NNS': 27.235, 'best_params': OrderedDict([('C', 0.0006556469860348211), (

In [49]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.527, 'precision': 0.028, 'sensitivity': 0.571, 'f1_score': 0.053, 'fbeta_2': 0.115, 'roc_auc': np.float64(0.634), 'NNS': 36.312, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.739, 'precision': 0.035, 'sensitivity': 0.407, 'f1_score': 0.065, 'fbeta_2': 0.131, 'roc_auc': np.float64(0.589), 'NNS': 28.455, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.743, 'precision': 0.047, 'sensitivity': 0.556, 'f1_score': 0.087, 'fbeta_2': 0.177, 'roc_auc': np.float64(0.622), 'NNS': 21.067, 'best_params': OrderedDict([('C', 13.746626277890918), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.721, 'precision': 0.036, 'sensitivity': 0.444, 'f1_score': 0.066, 'fbeta_2': 0.135, 'roc_auc': np.float64(0.589), 'NNS': 28.083, 'best_params': OrderedDict([('C', 2.483491132307

In [50]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.78, 'precision': 0.024, 'sensitivity': 0.214, 'f1_score': 0.043, 'fbeta_2': 0.082, 'roc_auc': np.float64(0.52), 'NNS': 42.0, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 19), ('min_samples_leaf', 2), ('min_samples_split', 5)])}
{'fold': 2, 'accuracy': 0.773, 'precision': 0.016, 'sensitivity': 0.148, 'f1_score': 0.028, 'fbeta_2': 0.055, 'roc_auc': np.float64(0.481), 'NNS': 64.5, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 15), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 3, 'accuracy': 0.807, 'precision': 0.027, 'sensitivity': 0.222, 'f1_score': 0.049, 'fbeta_2': 0.091, 'roc_auc': np.float64(0.537), 'NNS': 36.667, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 15), ('min_samples_leaf', 2), ('min_samples_split', 2)])}
{'fold': 4, 'accuracy': 0.784, 'precision': 0.024, 'sensitivity': 0.222, 'f1_score': 0.044, 'fbeta_2': 0.084, 'roc_auc': np.float64(0.571

In [51]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.756, 'precision': 0.021, 'sensitivity': 0.214, 'f1_score': 0.039, 'fbeta_2': 0.076, 'roc_auc': np.float64(0.478), 'NNS': 47.0, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 40), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 5), ('n_estimators', 50)])}
{'fold': 2, 'accuracy': 0.749, 'precision': 0.021, 'sensitivity': 0.222, 'f1_score': 0.038, 'fbeta_2': 0.075, 'roc_auc': np.float64(0.497), 'NNS': 48.5, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 21), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 192)])}
{'fold': 3, 'accuracy': 0.817, 'precision': 0.024, 'sensitivity': 0.185, 'f1_score': 0.043, 'fbeta_2': 0.08, 'roc_auc': np.float64(0.565), 'NNS': 41.2, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 31), ('max_features', None), ('min_samples_leaf', 15), ('min_samples_split', 2), ('n_estimators', 50)])}
{'fo

In [52]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.938, 'precision': 0.039, 'sensitivity': 0.071, 'f1_score': 0.051, 'fbeta_2': 0.061, 'roc_auc': np.float64(0.597), 'NNS': 25.5, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.0741222318928362), ('max_depth', 43), ('min_child_weight', 1), ('n_estimators', 319), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.892, 'precision': 0.009, 'sensitivity': 0.037, 'f1_score': 0.015, 'fbeta_2': 0.023, 'roc_auc': np.float64(0.568), 'NNS': 107.0, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 1), ('learning_rate', 0.09147401153275281), ('max_depth', 50), ('min_child_weight', 6), ('n_estimators', 173), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.918, 'precision': 0.038, 'sensitivity': 0.111, 'f1_score': 0.057, 'fbeta_2': 0.08, 'roc_auc': np.float64(0.576), 'NNS': 26.333, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 5), (

In [55]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.919, 'precision': 0.027, 'sensitivity': 0.071, 'f1_score': 0.039, 'fbeta_2': 0.053, 'roc_auc': np.float64(0.597), 'NNS': 37.5, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.0741222318928362), ('max_depth', 43), ('min_child_weight', 1), ('n_estimators', 319), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.823, 'precision': 0.01, 'sensitivity': 0.074, 'f1_score': 0.018, 'fbeta_2': 0.033, 'roc_auc': np.float64(0.568), 'NNS': 96.5, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 1), ('learning_rate', 0.09147401153275281), ('max_depth', 50), ('min_child_weight', 6), ('n_estimators', 173), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.885, 'precision': 0.033, 'sensitivity': 0.148, 'f1_score': 0.054, 'fbeta_2': 0.087, 'roc_auc': np.float64(0.576), 'NNS': 30.25, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 5), ('m

In [56]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.881, 'precision': 0.032, 'sensitivity': 0.143, 'f1_score': 0.052, 'fbeta_2': 0.084, 'roc_auc': np.float64(0.582), 'NNS': 31.25, 'best_params': {}}
{'fold': 2, 'accuracy': 0.854, 'precision': 0.019, 'sensitivity': 0.111, 'f1_score': 0.033, 'fbeta_2': 0.057, 'roc_auc': np.float64(0.566), 'NNS': 52.333, 'best_params': {}}
{'fold': 3, 'accuracy': 0.854, 'precision': 0.048, 'sensitivity': 0.296, 'f1_score': 0.082, 'fbeta_2': 0.145, 'roc_auc': np.float64(0.679), 'NNS': 20.875, 'best_params': {}}
{'fold': 4, 'accuracy': 0.859, 'precision': 0.038, 'sensitivity': 0.222, 'f1_score': 0.065, 'fbeta_2': 0.113, 'roc_auc': np.float64(0.554), 'NNS': 26.167, 'best_params': {}}
{'fold': 5, 'accuracy': 0.851, 'precision': 0.041, 'sensitivity': 0.259, 'f1_score': 0.071, 'fbeta_2': 0.126, 'roc_auc': np.float64(0.55), 'NNS': 24.143, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.860 ± 0.011
precision: 0.036 ± 0.010
sensitiv