# Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, fbeta_score

from skopt.space import Integer, Categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

from skopt import BayesSearchCV

from typing import Dict, List, Literal
from skopt.space import Real, Integer, Categorical

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")

## Read dataset

In [2]:
data = pd.read_csv("../data/processed/20. FINAL_mean_delta_multi_output.csv")
print(data.shape)
data.head()

(6091, 210)


Unnamed: 0,Hba1c,Hba1c Time,Hba1c FM,Hba1c FM Time,BMI,BMI Time,Cancer,Cancer Time,Carotid Disease,Carotid Disease Time,...,y_stk_or_aemb_3_months,y_stk_or_aemb_6_months,y_stk_or_aemb_12_months,y_stk_or_aemb_24_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,-501,6.26,-501,20.7,-19,0,10000,0,10000,...,0,0,0,0,0,0,1,0,0,0
1,6.26,-501,6.26,-501,26.7,-780,1,-4846,0,10000,...,0,0,0,0,0,0,1,0,0,0
2,5.8,-287,6.3,-2701,31.1,-35,0,10000,0,10000,...,0,0,0,0,0,0,1,1,1,1
3,6.26,-501,6.26,-501,21.3,-207,0,10000,0,10000,...,0,0,0,0,0,1,1,0,0,1
4,5.9,-162,5.4,-5209,37.8,-554,1,-86,0,10000,...,0,0,0,0,0,1,1,1,1,0


## Drop columns

In [3]:
targets = ["y_acs_6_months", "y_cvdeath_6_months", "y_death_6_months", "y_hf_6_months", "y_inp_6_months", "y_stk_or_aemb"]
cols_to_drop = [col for col in data.columns if col.startswith('y_') and col not in targets]
data = data.drop(columns=cols_to_drop)

print(data.shape)
data.head()

(6091, 161)


Unnamed: 0,Hba1c,Hba1c Time,Hba1c FM,Hba1c FM Time,BMI,BMI Time,Cancer,Cancer Time,Carotid Disease,Carotid Disease Time,...,y_cvdeath_6_months,y_death_6_months,y_hf_6_months,y_inp_6_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,-501,6.26,-501,20.7,-19,0,10000,0,10000,...,0,0,0,0,0,0,1,0,0,0
1,6.26,-501,6.26,-501,26.7,-780,1,-4846,0,10000,...,0,0,1,1,0,0,1,0,0,0
2,5.8,-287,6.3,-2701,31.1,-35,0,10000,0,10000,...,0,0,0,0,0,0,1,1,1,1
3,6.26,-501,6.26,-501,21.3,-207,0,10000,0,10000,...,0,0,0,0,0,1,1,0,0,1
4,5.9,-162,5.4,-5209,37.8,-554,1,-86,0,10000,...,0,0,1,1,0,1,1,1,1,0


## Models settings

In [4]:
# Define 5-fold stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create F2 scorer
f2_scorer = make_scorer(fbeta_score, beta=2)

# ===================== Naive Bayes =====================
search_space_nb = {
    "var_smoothing": (1e-12, 1e-1, "log-uniform")
}

nb_opt = BayesSearchCV(
    estimator=GaussianNB(),
    search_spaces=search_space_nb,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Logistic Regression =====================
search_space_lr = {
    'C': (1e-4, 1e+3, 'log-uniform'),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr_opt = BayesSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    search_spaces=search_space_lr,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Decision Tree =====================
search_space_dt = {
    "max_depth": Integer(3, 20),
    "min_samples_split": Integer(2, 20),
    "min_samples_leaf": Integer(1, 10),
    "criterion": Categorical(["gini", "entropy"])
}

dt_opt = BayesSearchCV(
    DecisionTreeClassifier(random_state=42),
    search_spaces=search_space_dt,
    n_iter=30,
    scoring=f2_scorer,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# ===================== Random Forest =====================
search_space_rf = {
    'n_estimators': (50, 300),
    'max_depth': (1, 50),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf_opt = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    search_spaces=search_space_rf,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== XGBoost =====================
search_space_xgb = {
    'n_estimators': (10, 500),
    'max_depth': (1, 50),
    'learning_rate': (0.001, 0.1, 'uniform'),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0, 5),
    'min_child_weight': (1, 20)
}

xgb_opt = BayesSearchCV(
    XGBClassifier(random_state=42),
    search_spaces=search_space_xgb,
    scoring=f2_scorer,
    n_iter=30,
    n_jobs=-1,
    random_state=42
)

# ===================== MLP ===============================

mlp_opt = MLPClassifier(max_iter=2000, solver="adam", activation="relu", hidden_layer_sizes=(200,100), random_state=42)

# ===================== Optimizer Map =====================
optimizer_map = {
    "nb": nb_opt,
    "lr": lr_opt,
    "dt": dt_opt,
    "rf": rf_opt,
    "xgb": xgb_opt,
    "mlp": mlp_opt
}


## Define function

In [5]:
SamplingName = Literal["baseline", "undersample", "oversample", "smote", "all"]
ModelName = Literal["nb", "lr", "xgb", "dt", "rf", "mlp"]


def _make_resampler(name: SamplingName, random_state: int, y_train: pd.Series = None):
    if name == "baseline":
        return None
    if name == "undersample":
        if y_train is None:
            raise ValueError("y_train must be provided for undersampling strategy")
        n_minority = y_train.sum()
        n_required = max(int(0.1 * len(y_train)), n_minority * 2)
        n_majority = n_required - n_minority
        n_majority = min(n_majority, (y_train == 0).sum())
        sampling_strategy = {0: n_majority, 1: n_minority}
        return RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=random_state)
    if name == "oversample":
        return RandomOverSampler(random_state=random_state)
    if name == "smote":
        return SMOTE(random_state=random_state)
    raise ValueError(f"Unknown sampling technique: {name}")


def run_cv_with_sampling(
    X_full: pd.DataFrame,
    target_cols: List[str],
    target_name: str,
    optimizer_map: Dict[ModelName, object],
    model_name: ModelName = "nb",
    sampling: SamplingName = "all",
    n_splits: int = 5,
    random_state: int = 42,
    xgb_04 = False,
) -> Dict[str, dict]:
    if target_name not in target_cols:
        raise ValueError("target_name must be inside target_cols")
    missing_targets = [c for c in target_cols if c not in X_full.columns]
    if missing_targets:
        raise ValueError(f"Target columns not in X_full: {missing_targets}")

    y = X_full[target_name].copy()
    X = X_full.drop(columns=target_cols).copy()

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    if sampling == "all":
        techniques = ["baseline", "undersample", "oversample", "smote"]
    else:
        techniques = [sampling]

    results: Dict[str, dict] = {}

    for tech in techniques:
        print(f"\n=== Technique: {tech.upper()} ===")

        fold_results: List[dict] = []
        best_params_per_fold: List[dict] = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if model_name in ["nb", "lr", "mlp"]:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_val = scaler.transform(X_val)

            resampler = _make_resampler(tech, random_state, y_train)
            if resampler is not None:
                X_train_rs, y_train_rs = resampler.fit_resample(X_train, y_train)
            else:
                X_train_rs, y_train_rs = X_train, y_train

            opt = optimizer_map[model_name]
            opt.fit(X_train_rs, y_train_rs)

            if model_name == "mlp":
                best_model = opt
                best_params_per_fold = {}
            else:
                best_model = opt.best_estimator_
                best_params_per_fold.append(getattr(opt, "best_params_", {}))

            y_pred = best_model.predict(X_val)

            if hasattr(best_model, "predict_proba"):
                y_pred_proba = best_model.predict_proba(X_val)[:, 1]
            else:
                scores = best_model.decision_function(X_val)
                smin, smax = scores.min(), scores.max()
                y_pred_proba = (scores - smin) / (smax - smin + 1e-9)

            if model_name == "xgb" and xgb_04:
                y_pred = (y_pred_proba >= 0.4).astype(int)

            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)
            f1 = f1_score(y_val, y_pred, zero_division=0)
            fbeta2 = fbeta_score(y_val, y_pred, beta=2, zero_division=0)
            roc = roc_auc_score(y_val, y_pred_proba)

            fold_metrics = {
                "fold": fold,
                "accuracy": accuracy_score(y_val, y_pred),
                "precision": prec,
                "sensitivity": rec,
                "f1_score": f1,
                "fbeta_2": fbeta2,
                "roc_auc": roc,
                "NNS": (1 / prec) if prec > 0 else np.inf,
                "best_params": getattr(opt, "best_params_", {}),
            }
            fold_results.append(fold_metrics)

            print({k: (round(v, 3) if isinstance(v, float) else v) for k, v in fold_metrics.items()})

        metrics = [
            "accuracy",
            "precision",
            "sensitivity",
            "f1_score",
            "fbeta_2",
            "roc_auc",
            "NNS",
        ]
        mean_std = {
            m: (np.mean([fr[m] for fr in fold_results]), np.std([fr[m] for fr in fold_results]))
            for m in metrics
        }

        print("\nMean scores across folds (", tech, "):")
        for m in metrics:
            mu, sd = mean_std[m]
            print(f"{m}: {mu:.3f} \u00B1 {sd:.3f}")

        summary_df = pd.DataFrame(fold_results)
        results[tech] = {
            "fold_results": fold_results,
            "summary_metrics": summary_df,
            "best_params_per_fold": best_params_per_fold,
        }

    return results

## y_acs

In [6]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.738, 'precision': 0.028, 'sensitivity': 0.6, 'f1_score': 0.053, 'fbeta_2': 0.118, 'roc_auc': np.float64(0.751), 'NNS': 35.778, 'best_params': OrderedDict([('var_smoothing', 0.07167633722247009)])}
{'fold': 2, 'accuracy': 0.74, 'precision': 0.034, 'sensitivity': 0.786, 'f1_score': 0.065, 'fbeta_2': 0.144, 'roc_auc': np.float64(0.771), 'NNS': 29.545, 'best_params': OrderedDict([('var_smoothing', 0.09997099570381021)])}
{'fold': 3, 'accuracy': 0.76, 'precision': 0.037, 'sensitivity': 0.786, 'f1_score': 0.07, 'fbeta_2': 0.154, 'roc_auc': np.float64(0.779), 'NNS': 27.273, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 4, 'accuracy': 0.783, 'precision': 0.023, 'sensitivity': 0.429, 'f1_score': 0.043, 'fbeta_2': 0.094, 'roc_auc': np.float64(0.733), 'NNS': 43.667, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 5, 'accuracy': 0.796, 'precision': 0.021, 'sensitivity': 0.333, 'f1_score': 0.039, 'fbeta_2': 0.083

In [7]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.721, 'precision': 0.023, 'sensitivity': 0.533, 'f1_score': 0.045, 'fbeta_2': 0.1, 'roc_auc': np.float64(0.711), 'NNS': 42.625, 'best_params': OrderedDict([('C', 0.0001551110829796666), ('penalty', 'l2'), ('solver', 'liblinear')])}




{'fold': 2, 'accuracy': 0.797, 'precision': 0.043, 'sensitivity': 0.786, 'f1_score': 0.082, 'fbeta_2': 0.177, 'roc_auc': np.float64(0.771), 'NNS': 23.182, 'best_params': OrderedDict([('C', 0.0007771076240146523), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.772, 'precision': 0.035, 'sensitivity': 0.714, 'f1_score': 0.067, 'fbeta_2': 0.147, 'roc_auc': np.float64(0.833), 'NNS': 28.4, 'best_params': OrderedDict([('C', 0.00047695215746020633), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.747, 'precision': 0.02, 'sensitivity': 0.429, 'f1_score': 0.037, 'fbeta_2': 0.083, 'roc_auc': np.float64(0.701), 'NNS': 51.0, 'best_params': OrderedDict([('C', 0.0006579555864149992), ('penalty', 'l2'), ('solver', 'liblinear')])}




{'fold': 5, 'accuracy': 0.755, 'precision': 0.024, 'sensitivity': 0.467, 'f1_score': 0.045, 'fbeta_2': 0.098, 'roc_auc': np.float64(0.667), 'NNS': 42.429, 'best_params': OrderedDict([('C', 0.0001), ('penalty', 'l2'), ('solver', 'liblinear')])}

Mean scores across folds ( undersample ):
accuracy: 0.759 ± 0.025
precision: 0.029 ± 0.009
sensitivity: 0.586 ± 0.140
f1_score: 0.055 ± 0.017
fbeta_2: 0.121 ± 0.035
roc_auc: 0.737 ± 0.059
NNS: 37.527 ± 10.204


In [8]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.888, 'precision': 0.023, 'sensitivity': 0.2, 'f1_score': 0.042, 'fbeta_2': 0.08, 'roc_auc': np.float64(0.516), 'NNS': 42.667, 'best_params': OrderedDict([('C', 997.7505599555375), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.868, 'precision': 0.032, 'sensitivity': 0.357, 'f1_score': 0.058, 'fbeta_2': 0.117, 'roc_auc': np.float64(0.572), 'NNS': 31.4, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.853, 'precision': 0.029, 'sensitivity': 0.357, 'f1_score': 0.053, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.631), 'NNS': 35.0, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.885, 'precision': 0.03, 'sensitivity': 0.286, 'f1_score': 0.054, 'fbeta_2': 0.105, 'roc_auc': np.float64(0.594), 'NNS': 33.5, 'best_params': OrderedDict([('C', 997.7505599555375), ('penalty', 'l2'

In [9]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.89, 'precision': 0.024, 'sensitivity': 0.2, 'f1_score': 0.043, 'fbeta_2': 0.081, 'roc_auc': np.float64(0.512), 'NNS': 41.667, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.873, 'precision': 0.033, 'sensitivity': 0.357, 'f1_score': 0.061, 'fbeta_2': 0.121, 'roc_auc': np.float64(0.577), 'NNS': 30.2, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.86, 'precision': 0.03, 'sensitivity': 0.357, 'f1_score': 0.056, 'fbeta_2': 0.113, 'roc_auc': np.float64(0.631), 'NNS': 33.2, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.885, 'precision': 0.023, 'sensitivity': 0.214, 'f1_score': 0.041, 'fbeta_2': 0.08, 'roc_auc': np.float64(0.591), 'NNS': 44.0, 'best_params': OrderedDict([('C', 997.057119818347), ('penalty', 'l2'), ('solver', 'liblinear')])}


In [10]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.894, 'precision': 0.017, 'sensitivity': 0.133, 'f1_score': 0.03, 'fbeta_2': 0.056, 'roc_auc': np.float64(0.518), 'NNS': 59.0, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 15), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 2, 'accuracy': 0.909, 'precision': 0.029, 'sensitivity': 0.214, 'f1_score': 0.051, 'fbeta_2': 0.094, 'roc_auc': np.float64(0.617), 'NNS': 34.333, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 9), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 3, 'accuracy': 0.893, 'precision': 0.047, 'sensitivity': 0.429, 'f1_score': 0.085, 'fbeta_2': 0.163, 'roc_auc': np.float64(0.747), 'NNS': 21.333, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 17), ('min_samples_leaf', 4), ('min_samples_split', 2)])}
{'fold': 4, 'accuracy': 0.933, 'precision': 0.041, 'sensitivity': 0.214, 'f1_score': 0.069, 'fbeta_2': 0.116, 'roc_auc': np.float64(

In [11]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.906, 'precision': 0.019, 'sensitivity': 0.133, 'f1_score': 0.034, 'fbeta_2': 0.061, 'roc_auc': np.float64(0.521), 'NNS': 51.5, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 50)])}
{'fold': 2, 'accuracy': 0.884, 'precision': 0.03, 'sensitivity': 0.286, 'f1_score': 0.054, 'fbeta_2': 0.105, 'roc_auc': np.float64(0.62), 'NNS': 33.75, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 3, 'accuracy': 0.904, 'precision': 0.044, 'sensitivity': 0.357, 'f1_score': 0.079, 'fbeta_2': 0.148, 'roc_auc': np.float64(0.662), 'NNS': 22.6, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 15), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 300)])}
{'f

In [12]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.957, 'precision': 0.068, 'sensitivity': 0.2, 'f1_score': 0.102, 'fbeta_2': 0.144, 'roc_auc': np.float64(0.628), 'NNS': 14.667, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 356), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.961, 'precision': 0.075, 'sensitivity': 0.214, 'f1_score': 0.111, 'fbeta_2': 0.156, 'roc_auc': np.float64(0.764), 'NNS': 13.333, 'best_params': OrderedDict([('colsample_bytree', 0.5814312884639665), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 50), ('min_child_weight', 4), ('n_estimators', 293), ('subsample', 1.0)])}
{'fold': 3, 'accuracy': 0.962, 'precision': 0.1, 'sensitivity': 0.286, 'f1_score': 0.148, 'fbeta_2': 0.208, 'roc_auc': np.float64(0.814), 'NNS': 10.0, 'best_params': OrderedDict([('colsample_bytree', 0.9591391638846221), ('gamma', 2), ('learning_rate', 0.1), ('max_depth', 11), ('mi

In [6]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.943, 'precision': 0.05, 'sensitivity': 0.2, 'f1_score': 0.08, 'fbeta_2': 0.125, 'roc_auc': np.float64(0.628), 'NNS': 20.0, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 356), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.944, 'precision': 0.05, 'sensitivity': 0.214, 'f1_score': 0.081, 'fbeta_2': 0.129, 'roc_auc': np.float64(0.764), 'NNS': 20.0, 'best_params': OrderedDict([('colsample_bytree', 0.5814312884639665), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 50), ('min_child_weight', 4), ('n_estimators', 293), ('subsample', 1.0)])}
{'fold': 3, 'accuracy': 0.952, 'precision': 0.091, 'sensitivity': 0.357, 'f1_score': 0.145, 'fbeta_2': 0.225, 'roc_auc': np.float64(0.814), 'NNS': 11.0, 'best_params': OrderedDict([('colsample_bytree', 0.9591391638846221), ('gamma', 2), ('learning_rate', 0.1), ('max_depth', 11), ('min_chi

In [14]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.952, 'precision': 0.061, 'sensitivity': 0.2, 'f1_score': 0.094, 'fbeta_2': 0.138, 'roc_auc': np.float64(0.694), 'NNS': 16.333, 'best_params': {}}
{'fold': 2, 'accuracy': 0.944, 'precision': 0.091, 'sensitivity': 0.429, 'f1_score': 0.15, 'fbeta_2': 0.246, 'roc_auc': np.float64(0.785), 'NNS': 11.0, 'best_params': {}}
{'fold': 3, 'accuracy': 0.939, 'precision': 0.045, 'sensitivity': 0.214, 'f1_score': 0.075, 'fbeta_2': 0.123, 'roc_auc': np.float64(0.745), 'NNS': 22.0, 'best_params': {}}
{'fold': 4, 'accuracy': 0.962, 'precision': 0.029, 'sensitivity': 0.071, 'f1_score': 0.042, 'fbeta_2': 0.056, 'roc_auc': np.float64(0.635), 'NNS': 34.0, 'best_params': {}}
{'fold': 5, 'accuracy': 0.955, 'precision': 0.065, 'sensitivity': 0.2, 'f1_score': 0.098, 'fbeta_2': 0.142, 'roc_auc': np.float64(0.632), 'NNS': 15.333, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.951 ± 0.008
precision: 0.058 ± 0.021
sensitivity: 0.22

## y_cvdeath_6_months

In [15]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.509, 'precision': 0.058, 'sensitivity': 0.8, 'f1_score': 0.107, 'fbeta_2': 0.224, 'roc_auc': np.float64(0.693), 'NNS': 17.361, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 2, 'accuracy': 0.563, 'precision': 0.06, 'sensitivity': 0.75, 'f1_score': 0.11, 'fbeta_2': 0.226, 'roc_auc': np.float64(0.711), 'NNS': 16.788, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.498, 'precision': 0.059, 'sensitivity': 0.864, 'f1_score': 0.11, 'fbeta_2': 0.232, 'roc_auc': np.float64(0.734), 'NNS': 16.947, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 4, 'accuracy': 0.544, 'precision': 0.059, 'sensitivity': 0.756, 'f1_score': 0.109, 'fbeta_2': 0.224, 'roc_auc': np.float64(0.694), 'NNS': 17.029, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 5, 'accuracy': 0.576, 'precision': 0.074, 'sensitivity': 0.911, 'f1_score': 0.137, 'fbeta_2': 0.28, 'roc_auc': np.float64(0.777), 'N

In [16]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.79, 'precision': 0.066, 'sensitivity': 0.356, 'f1_score': 0.111, 'fbeta_2': 0.189, 'roc_auc': np.float64(0.652), 'NNS': 15.187, 'best_params': OrderedDict([('C', 5.766226607700993), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.698, 'precision': 0.08, 'sensitivity': 0.705, 'f1_score': 0.144, 'fbeta_2': 0.276, 'roc_auc': np.float64(0.77), 'NNS': 12.452, 'best_params': OrderedDict([('C', 0.0008114557698252823), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.755, 'precision': 0.085, 'sensitivity': 0.591, 'f1_score': 0.149, 'fbeta_2': 0.27, 'roc_auc': np.float64(0.727), 'NNS': 11.769, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.776, 'precision': 0.081, 'sensitivity': 0.489, 'f1_score': 0.139, 'fbeta_2': 0.243, 'roc_auc': np.float64(0.723), 'NNS': 12.364, 'best_params': OrderedDict([('C', 1.5748255317294

In [17]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.806, 'precision': 0.079, 'sensitivity': 0.4, 'f1_score': 0.132, 'fbeta_2': 0.221, 'roc_auc': np.float64(0.659), 'NNS': 12.611, 'best_params': OrderedDict([('C', 2.0871588778809445), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.736, 'precision': 0.096, 'sensitivity': 0.75, 'f1_score': 0.17, 'fbeta_2': 0.317, 'roc_auc': np.float64(0.762), 'NNS': 10.424, 'best_params': OrderedDict([('C', 0.0020836446648093763), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.77, 'precision': 0.076, 'sensitivity': 0.477, 'f1_score': 0.13, 'fbeta_2': 0.231, 'roc_auc': np.float64(0.708), 'NNS': 13.238, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.712, 'precision': 0.075, 'sensitivity': 0.6, 'f1_score': 0.133, 'fbeta_2': 0.25, 'roc_auc': np.float64(0.733), 'NNS': 13.333, 'best_params': OrderedDict([('C', 0.0014820540871886187), ('penalty',

In [18]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.727, 'precision': 0.044, 'sensitivity': 0.311, 'f1_score': 0.078, 'fbeta_2': 0.141, 'roc_auc': np.float64(0.572), 'NNS': 22.571, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 15), ('min_samples_leaf', 9), ('min_samples_split', 8)])}
{'fold': 2, 'accuracy': 0.764, 'precision': 0.058, 'sensitivity': 0.364, 'f1_score': 0.1, 'fbeta_2': 0.177, 'roc_auc': np.float64(0.619), 'NNS': 17.25, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 15), ('min_samples_leaf', 9), ('min_samples_split', 9)])}
{'fold': 3, 'accuracy': 0.615, 'precision': 0.06, 'sensitivity': 0.659, 'f1_score': 0.11, 'fbeta_2': 0.22, 'roc_auc': np.float64(0.677), 'NNS': 16.655, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 3), ('min_samples_leaf', 7), ('min_samples_split', 20)])}
{'fold': 4, 'accuracy': 0.794, 'precision': 0.088, 'sensitivity': 0.489, 'f1_score': 0.149, 'fbeta_2': 0.256, 'roc_auc': np.float64(0.623

In [19]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.824, 'precision': 0.099, 'sensitivity': 0.467, 'f1_score': 0.163, 'fbeta_2': 0.268, 'roc_auc': np.float64(0.75), 'NNS': 10.095, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 41), ('max_features', None), ('min_samples_leaf', 6), ('min_samples_split', 12), ('n_estimators', 119)])}
{'fold': 2, 'accuracy': 0.844, 'precision': 0.099, 'sensitivity': 0.409, 'f1_score': 0.159, 'fbeta_2': 0.251, 'roc_auc': np.float64(0.739), 'NNS': 10.111, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 27), ('max_features', None), ('min_samples_leaf', 6), ('min_samples_split', 20), ('n_estimators', 50)])}
{'fold': 3, 'accuracy': 0.819, 'precision': 0.107, 'sensitivity': 0.545, 'f1_score': 0.178, 'fbeta_2': 0.299, 'roc_auc': np.float64(0.765), 'NNS': 9.375, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 26), ('max_features', None), ('min_samples_leaf', 2), ('min_samples_split', 5), ('n_estimators', 300)])}


In [20]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.775, 'precision': 0.074, 'sensitivity': 0.444, 'f1_score': 0.127, 'fbeta_2': 0.223, 'roc_auc': np.float64(0.713), 'NNS': 13.45, 'best_params': OrderedDict([('colsample_bytree', 0.6433472919567121), ('gamma', 0), ('learning_rate', 0.06897949878729645), ('max_depth', 38), ('min_child_weight', 14), ('n_estimators', 315), ('subsample', 0.9943332581973823)])}
{'fold': 2, 'accuracy': 0.814, 'precision': 0.097, 'sensitivity': 0.5, 'f1_score': 0.162, 'fbeta_2': 0.273, 'roc_auc': np.float64(0.746), 'NNS': 10.318, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 1), ('learning_rate', 0.1), ('max_depth', 15), ('min_child_weight', 17), ('n_estimators', 453), ('subsample', 0.7304498720559391)])}
{'fold': 3, 'accuracy': 0.778, 'precision': 0.094, 'sensitivity': 0.591, 'f1_score': 0.161, 'fbeta_2': 0.286, 'roc_auc': np.float64(0.773), 'NNS': 10.692, 'best_params': OrderedDict([('colsample_bytree', 0.9327892914535585), ('gamma',

In [21]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.724, 'precision': 0.076, 'sensitivity': 0.578, 'f1_score': 0.134, 'fbeta_2': 0.249, 'roc_auc': np.float64(0.713), 'NNS': 13.192, 'best_params': OrderedDict([('colsample_bytree', 0.6433472919567121), ('gamma', 0), ('learning_rate', 0.06897949878729645), ('max_depth', 38), ('min_child_weight', 14), ('n_estimators', 315), ('subsample', 0.9943332581973823)])}
{'fold': 2, 'accuracy': 0.76, 'precision': 0.087, 'sensitivity': 0.591, 'f1_score': 0.151, 'fbeta_2': 0.273, 'roc_auc': np.float64(0.746), 'NNS': 11.538, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 1), ('learning_rate', 0.1), ('max_depth', 15), ('min_child_weight', 17), ('n_estimators', 453), ('subsample', 0.7304498720559391)])}
{'fold': 3, 'accuracy': 0.709, 'precision': 0.081, 'sensitivity': 0.682, 'f1_score': 0.145, 'fbeta_2': 0.274, 'roc_auc': np.float64(0.773), 'NNS': 12.367, 'best_params': OrderedDict([('colsample_bytree', 0.9327892914535585), ('gamma

In [22]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.788, 'precision': 0.086, 'sensitivity': 0.489, 'f1_score': 0.146, 'fbeta_2': 0.252, 'roc_auc': np.float64(0.74), 'NNS': 11.682, 'best_params': {}}
{'fold': 2, 'accuracy': 0.779, 'precision': 0.085, 'sensitivity': 0.523, 'f1_score': 0.146, 'fbeta_2': 0.257, 'roc_auc': np.float64(0.716), 'NNS': 11.783, 'best_params': {}}
{'fold': 3, 'accuracy': 0.748, 'precision': 0.077, 'sensitivity': 0.545, 'f1_score': 0.135, 'fbeta_2': 0.246, 'roc_auc': np.float64(0.702), 'NNS': 12.958, 'best_params': {}}
{'fold': 4, 'accuracy': 0.768, 'precision': 0.081, 'sensitivity': 0.511, 'f1_score': 0.14, 'fbeta_2': 0.248, 'roc_auc': np.float64(0.718), 'NNS': 12.304, 'best_params': {}}
{'fold': 5, 'accuracy': 0.776, 'precision': 0.104, 'sensitivity': 0.667, 'f1_score': 0.18, 'fbeta_2': 0.321, 'roc_auc': np.float64(0.795), 'NNS': 9.6, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.772 ± 0.014
precision: 0.087 ± 0.009
sensitivity:

## y_death_6_months

In [23]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.482, 'precision': 0.078, 'sensitivity': 0.883, 'f1_score': 0.144, 'fbeta_2': 0.289, 'roc_auc': np.float64(0.699), 'NNS': 12.792, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 2, 'accuracy': 0.421, 'precision': 0.067, 'sensitivity': 0.847, 'f1_score': 0.124, 'fbeta_2': 0.255, 'roc_auc': np.float64(0.686), 'NNS': 14.92, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.465, 'precision': 0.076, 'sensitivity': 0.898, 'f1_score': 0.14, 'fbeta_2': 0.283, 'roc_auc': np.float64(0.729), 'NNS': 13.189, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 4, 'accuracy': 0.411, 'precision': 0.065, 'sensitivity': 0.831, 'f1_score': 0.12, 'fbeta_2': 0.247, 'roc_auc': np.float64(0.685), 'NNS': 15.429, 'best_params': OrderedDict([('var_smoothing', 0.09706226232971742)])}
{'fold': 5, 'accuracy': 0.482, 'precision': 0.077, 'sensitivity': 0.867, 'f1_score': 0.141, 'fbeta_2': 0.284, 'roc_auc': n

In [24]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.623, 'precision': 0.1, 'sensitivity': 0.833, 'f1_score': 0.179, 'fbeta_2': 0.338, 'roc_auc': np.float64(0.768), 'NNS': 10.0, 'best_params': OrderedDict([('C', 0.00026840225775419917), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.595, 'precision': 0.098, 'sensitivity': 0.898, 'f1_score': 0.177, 'fbeta_2': 0.341, 'roc_auc': np.float64(0.792), 'NNS': 10.189, 'best_params': OrderedDict([('C', 0.00016455402543612014), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.652, 'precision': 0.104, 'sensitivity': 0.814, 'f1_score': 0.185, 'fbeta_2': 0.344, 'roc_auc': np.float64(0.79), 'NNS': 9.604, 'best_params': OrderedDict([('C', 0.0004350700418966027), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.628, 'precision': 0.095, 'sensitivity': 0.78, 'f1_score': 0.169, 'fbeta_2': 0.319, 'roc_auc': np.float64(0.76), 'NNS': 10.565, 'best_params': OrderedDict([('C', 0.0002044

In [25]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.658, 'precision': 0.099, 'sensitivity': 0.733, 'f1_score': 0.174, 'fbeta_2': 0.321, 'roc_auc': np.float64(0.761), 'NNS': 10.114, 'best_params': OrderedDict([('C', 0.0007318965102778926), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.629, 'precision': 0.1, 'sensitivity': 0.831, 'f1_score': 0.178, 'fbeta_2': 0.337, 'roc_auc': np.float64(0.783), 'NNS': 10.02, 'best_params': OrderedDict([('C', 0.00040577523241920604), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.635, 'precision': 0.098, 'sensitivity': 0.797, 'f1_score': 0.175, 'fbeta_2': 0.329, 'roc_auc': np.float64(0.787), 'NNS': 10.191, 'best_params': OrderedDict([('C', 0.00045482359208723073), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.671, 'precision': 0.102, 'sensitivity': 0.746, 'f1_score': 0.18, 'fbeta_2': 0.33, 'roc_auc': np.float64(0.761), 'NNS': 9.773, 'best_params': OrderedDict([('C', 0.0005803825

In [26]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.585, 'precision': 0.086, 'sensitivity': 0.767, 'f1_score': 0.154, 'fbeta_2': 0.296, 'roc_auc': np.float64(0.648), 'NNS': 11.696, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 8), ('min_samples_split', 10)])}
{'fold': 2, 'accuracy': 0.571, 'precision': 0.083, 'sensitivity': 0.78, 'f1_score': 0.15, 'fbeta_2': 0.29, 'roc_auc': np.float64(0.694), 'NNS': 12.087, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 6), ('min_samples_split', 19)])}
{'fold': 3, 'accuracy': 0.634, 'precision': 0.099, 'sensitivity': 0.814, 'f1_score': 0.177, 'fbeta_2': 0.334, 'roc_auc': np.float64(0.733), 'NNS': 10.062, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 4), ('min_samples_split', 20)])}
{'fold': 4, 'accuracy': 0.679, 'precision': 0.089, 'sensitivity': 0.61, 'f1_score': 0.156, 'fbeta_2': 0.281, 'roc_auc': np.float

In [27]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.587, 'precision': 0.078, 'sensitivity': 0.683, 'f1_score': 0.14, 'fbeta_2': 0.268, 'roc_auc': np.float64(0.632), 'NNS': 12.829, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 5), ('min_samples_split', 5), ('n_estimators', 294)])}
{'fold': 2, 'accuracy': 0.673, 'precision': 0.11, 'sensitivity': 0.814, 'f1_score': 0.194, 'fbeta_2': 0.358, 'roc_auc': np.float64(0.752), 'NNS': 9.062, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 3, 'accuracy': 0.71, 'precision': 0.131, 'sensitivity': 0.881, 'f1_score': 0.228, 'fbeta_2': 0.41, 'roc_auc': np.float64(0.83), 'NNS': 7.654, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 46), ('max_features', 'sqrt'), ('min_samples_leaf', 9), ('min_samples_split', 5), ('n_estimators', 163)])}
{'f

In [28]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.674, 'precision': 0.096, 'sensitivity': 0.667, 'f1_score': 0.168, 'fbeta_2': 0.304, 'roc_auc': np.float64(0.745), 'NNS': 10.425, 'best_params': OrderedDict([('colsample_bytree', 0.8309322448615906), ('gamma', 5), ('learning_rate', 0.06842439703369384), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 80), ('subsample', 0.7501929799439746)])}
{'fold': 2, 'accuracy': 0.7, 'precision': 0.113, 'sensitivity': 0.763, 'f1_score': 0.197, 'fbeta_2': 0.355, 'roc_auc': np.float64(0.764), 'NNS': 8.822, 'best_params': OrderedDict([('colsample_bytree', 0.5072918751629665), ('gamma', 5), ('learning_rate', 0.06206823564283474), ('max_depth', 34), ('min_child_weight', 1), ('n_estimators', 242), ('subsample', 0.8783455991824893)])}
{'fold': 3, 'accuracy': 0.731, 'precision': 0.127, 'sensitivity': 0.78, 'f1_score': 0.219, 'fbeta_2': 0.385, 'roc_auc': np.float64(0.825), 'NNS': 7.848, 'best_params': OrderedDict([('colsample_bytree', 0.808

In [7]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.548, 'precision': 0.087, 'sensitivity': 0.867, 'f1_score': 0.159, 'fbeta_2': 0.311, 'roc_auc': np.float64(0.745), 'NNS': 11.442, 'best_params': OrderedDict([('colsample_bytree', 0.8309322448615906), ('gamma', 5), ('learning_rate', 0.06842439703369384), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 80), ('subsample', 0.7501929799439746)])}
{'fold': 2, 'accuracy': 0.606, 'precision': 0.098, 'sensitivity': 0.864, 'f1_score': 0.175, 'fbeta_2': 0.336, 'roc_auc': np.float64(0.764), 'NNS': 10.255, 'best_params': OrderedDict([('colsample_bytree', 0.5072918751629665), ('gamma', 5), ('learning_rate', 0.06206823564283474), ('max_depth', 34), ('min_child_weight', 1), ('n_estimators', 242), ('subsample', 0.8783455991824893)])}
{'fold': 3, 'accuracy': 0.628, 'precision': 0.106, 'sensitivity': 0.898, 'f1_score': 0.19, 'fbeta_2': 0.36, 'roc_auc': np.float64(0.825), 'NNS': 9.434, 'best_params': OrderedDict([('colsample_bytree', 0.8

In [30]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.686, 'precision': 0.091, 'sensitivity': 0.6, 'f1_score': 0.158, 'fbeta_2': 0.283, 'roc_auc': np.float64(0.712), 'NNS': 10.972, 'best_params': {}}
{'fold': 2, 'accuracy': 0.663, 'precision': 0.087, 'sensitivity': 0.627, 'f1_score': 0.153, 'fbeta_2': 0.279, 'roc_auc': np.float64(0.714), 'NNS': 11.514, 'best_params': {}}
{'fold': 3, 'accuracy': 0.68, 'precision': 0.081, 'sensitivity': 0.542, 'f1_score': 0.141, 'fbeta_2': 0.254, 'roc_auc': np.float64(0.73), 'NNS': 12.344, 'best_params': {}}
{'fold': 4, 'accuracy': 0.674, 'precision': 0.092, 'sensitivity': 0.644, 'f1_score': 0.161, 'fbeta_2': 0.292, 'roc_auc': np.float64(0.702), 'NNS': 10.895, 'best_params': {}}
{'fold': 5, 'accuracy': 0.696, 'precision': 0.107, 'sensitivity': 0.7, 'f1_score': 0.185, 'fbeta_2': 0.331, 'roc_auc': np.float64(0.756), 'NNS': 9.381, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.680 ± 0.011
precision: 0.091 ± 0.008
sensitivity: 

## y_hf_6_months

In [31]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.568, 'precision': 0.19, 'sensitivity': 0.762, 'f1_score': 0.304, 'fbeta_2': 0.475, 'roc_auc': np.float64(0.689), 'NNS': 5.27, 'best_params': OrderedDict([('var_smoothing', 0.01271665763595647)])}
{'fold': 2, 'accuracy': 0.616, 'precision': 0.209, 'sensitivity': 0.76, 'f1_score': 0.328, 'fbeta_2': 0.497, 'roc_auc': np.float64(0.725), 'NNS': 4.789, 'best_params': OrderedDict([('var_smoothing', 0.003979348101349949)])}
{'fold': 3, 'accuracy': 0.635, 'precision': 0.218, 'sensitivity': 0.755, 'f1_score': 0.339, 'fbeta_2': 0.506, 'roc_auc': np.float64(0.725), 'NNS': 4.579, 'best_params': OrderedDict([('var_smoothing', 0.0006409532820054014)])}
{'fold': 4, 'accuracy': 0.625, 'precision': 0.212, 'sensitivity': 0.748, 'f1_score': 0.331, 'fbeta_2': 0.497, 'roc_auc': np.float64(0.706), 'NNS': 4.708, 'best_params': OrderedDict([('var_smoothing', 0.004825782792861399)])}
{'fold': 5, 'accuracy': 0.585, 'precision': 0.206, 'sensitivity': 0.821, '

In [32]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.602, 'precision': 0.217, 'sensitivity': 0.848, 'f1_score': 0.345, 'fbeta_2': 0.536, 'roc_auc': np.float64(0.766), 'NNS': 4.609, 'best_params': OrderedDict([('C', 0.0022193844519719334), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.657, 'precision': 0.231, 'sensitivity': 0.767, 'f1_score': 0.355, 'fbeta_2': 0.524, 'roc_auc': np.float64(0.78), 'NNS': 4.33, 'best_params': OrderedDict([('C', 0.00016494870038291586), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.668, 'precision': 0.249, 'sensitivity': 0.828, 'f1_score': 0.382, 'fbeta_2': 0.565, 'roc_auc': np.float64(0.791), 'NNS': 4.024, 'best_params': OrderedDict([('C', 0.0001), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.663, 'precision': 0.235, 'sensitivity': 0.762, 'f1_score': 0.359, 'fbeta_2': 0.526, 'roc_auc': np.float64(0.77), 'NNS': 4.261, 'best_params': OrderedDict([('C', 0.00010602594470834996),

In [33]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.642, 'precision': 0.224, 'sensitivity': 0.768, 'f1_score': 0.347, 'fbeta_2': 0.517, 'roc_auc': np.float64(0.763), 'NNS': 4.457, 'best_params': OrderedDict([('C', 0.00024897756574214145), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.659, 'precision': 0.232, 'sensitivity': 0.767, 'f1_score': 0.357, 'fbeta_2': 0.525, 'roc_auc': np.float64(0.78), 'NNS': 4.304, 'best_params': OrderedDict([('C', 0.00028171725782416947), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.706, 'precision': 0.266, 'sensitivity': 0.781, 'f1_score': 0.397, 'fbeta_2': 0.564, 'roc_auc': np.float64(0.799), 'NNS': 3.754, 'best_params': OrderedDict([('C', 0.0006892627964523442), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.694, 'precision': 0.25, 'sensitivity': 0.735, 'f1_score': 0.373, 'fbeta_2': 0.53, 'roc_auc': np.float64(0.775), 'NNS': 4.0, 'best_params': OrderedDict([('C', 0.0006712453996

In [34]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.628, 'precision': 0.223, 'sensitivity': 0.808, 'f1_score': 0.35, 'fbeta_2': 0.53, 'roc_auc': np.float64(0.74), 'NNS': 4.475, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 3), ('min_samples_leaf', 10), ('min_samples_split', 8)])}
{'fold': 2, 'accuracy': 0.739, 'precision': 0.279, 'sensitivity': 0.707, 'f1_score': 0.4, 'fbeta_2': 0.541, 'roc_auc': np.float64(0.743), 'NNS': 3.585, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 10), ('min_samples_split', 19)])}
{'fold': 3, 'accuracy': 0.773, 'precision': 0.306, 'sensitivity': 0.656, 'f1_score': 0.417, 'fbeta_2': 0.533, 'roc_auc': np.float64(0.785), 'NNS': 3.273, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 10), ('min_samples_split', 4)])}
{'fold': 4, 'accuracy': 0.75, 'precision': 0.282, 'sensitivity': 0.662, 'f1_score': 0.396, 'fbeta_2': 0.522, 'roc_auc': np.float64(0.76

In [35]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.633, 'precision': 0.226, 'sensitivity': 0.808, 'f1_score': 0.353, 'fbeta_2': 0.533, 'roc_auc': np.float64(0.755), 'NNS': 4.426, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 20), ('min_samples_split', 4), ('n_estimators', 56)])}
{'fold': 2, 'accuracy': 0.619, 'precision': 0.215, 'sensitivity': 0.787, 'f1_score': 0.337, 'fbeta_2': 0.513, 'roc_auc': np.float64(0.741), 'NNS': 4.661, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 20), ('min_samples_split', 2), ('n_estimators', 295)])}
{'fold': 3, 'accuracy': 0.739, 'precision': 0.297, 'sensitivity': 0.808, 'f1_score': 0.434, 'fbeta_2': 0.601, 'roc_auc': np.float64(0.817), 'NNS': 3.369, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 26), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}


In [36]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.633, 'precision': 0.226, 'sensitivity': 0.808, 'f1_score': 0.353, 'fbeta_2': 0.533, 'roc_auc': np.float64(0.768), 'NNS': 4.426, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 3), ('learning_rate', 0.001), ('max_depth', 1), ('min_child_weight', 20), ('n_estimators', 203), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.72, 'precision': 0.273, 'sensitivity': 0.767, 'f1_score': 0.403, 'fbeta_2': 0.563, 'roc_auc': np.float64(0.79), 'NNS': 3.661, 'best_params': OrderedDict([('colsample_bytree', 0.8085396792511581), ('gamma', 4), ('learning_rate', 0.036543270260218995), ('max_depth', 44), ('min_child_weight', 12), ('n_estimators', 301), ('subsample', 0.8243887548518629)])}
{'fold': 3, 'accuracy': 0.743, 'precision': 0.294, 'sensitivity': 0.768, 'f1_score': 0.426, 'fbeta_2': 0.581, 'roc_auc': np.float64(0.812), 'NNS': 3.397, 'best_params': OrderedDict([('colsample_bytree', 0.9097517663877747), ('gamma', 2), ('learning

In [37]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.724, 'precision': 0.076, 'sensitivity': 0.578, 'f1_score': 0.134, 'fbeta_2': 0.249, 'roc_auc': np.float64(0.713), 'NNS': 13.192, 'best_params': OrderedDict([('colsample_bytree', 0.6433472919567121), ('gamma', 0), ('learning_rate', 0.06897949878729645), ('max_depth', 38), ('min_child_weight', 14), ('n_estimators', 315), ('subsample', 0.9943332581973823)])}
{'fold': 2, 'accuracy': 0.76, 'precision': 0.087, 'sensitivity': 0.591, 'f1_score': 0.151, 'fbeta_2': 0.273, 'roc_auc': np.float64(0.746), 'NNS': 11.538, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 1), ('learning_rate', 0.1), ('max_depth', 15), ('min_child_weight', 17), ('n_estimators', 453), ('subsample', 0.7304498720559391)])}
{'fold': 3, 'accuracy': 0.709, 'precision': 0.081, 'sensitivity': 0.682, 'f1_score': 0.145, 'fbeta_2': 0.274, 'roc_auc': np.float64(0.773), 'NNS': 12.367, 'best_params': OrderedDict([('colsample_bytree', 0.9327892914535585), ('gamma

In [38]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.655, 'precision': 0.218, 'sensitivity': 0.689, 'f1_score': 0.331, 'fbeta_2': 0.481, 'roc_auc': np.float64(0.714), 'NNS': 4.587, 'best_params': {}}
{'fold': 2, 'accuracy': 0.684, 'precision': 0.251, 'sensitivity': 0.787, 'f1_score': 0.38, 'fbeta_2': 0.551, 'roc_auc': np.float64(0.775), 'NNS': 3.992, 'best_params': {}}
{'fold': 3, 'accuracy': 0.677, 'precision': 0.229, 'sensitivity': 0.682, 'f1_score': 0.343, 'fbeta_2': 0.489, 'roc_auc': np.float64(0.725), 'NNS': 4.359, 'best_params': {}}
{'fold': 4, 'accuracy': 0.672, 'precision': 0.229, 'sensitivity': 0.695, 'f1_score': 0.344, 'fbeta_2': 0.494, 'roc_auc': np.float64(0.748), 'NNS': 4.371, 'best_params': {}}
{'fold': 5, 'accuracy': 0.665, 'precision': 0.225, 'sensitivity': 0.695, 'f1_score': 0.34, 'fbeta_2': 0.49, 'roc_auc': np.float64(0.734), 'NNS': 4.448, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.670 ± 0.010
precision: 0.230 ± 0.011
sensitivity: 0

## y_inp_6_months

In [39]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.563, 'precision': 0.287, 'sensitivity': 0.705, 'f1_score': 0.408, 'fbeta_2': 0.546, 'roc_auc': np.float64(0.642), 'NNS': 3.478, 'best_params': OrderedDict([('var_smoothing', 9.404008530276485e-06)])}
{'fold': 2, 'accuracy': 0.554, 'precision': 0.285, 'sensitivity': 0.719, 'f1_score': 0.408, 'fbeta_2': 0.551, 'roc_auc': np.float64(0.665), 'NNS': 3.513, 'best_params': OrderedDict([('var_smoothing', 6.135824702323322e-06)])}
{'fold': 3, 'accuracy': 0.59, 'precision': 0.293, 'sensitivity': 0.65, 'f1_score': 0.404, 'fbeta_2': 0.523, 'roc_auc': np.float64(0.632), 'NNS': 3.414, 'best_params': OrderedDict([('var_smoothing', 1.0074747252857916e-09)])}
{'fold': 4, 'accuracy': 0.324, 'precision': 0.231, 'sensitivity': 0.927, 'f1_score': 0.369, 'fbeta_2': 0.578, 'roc_auc': np.float64(0.64), 'NNS': 4.336, 'best_params': OrderedDict([('var_smoothing', 9.880791041332321e-08)])}
{'fold': 5, 'accuracy': 0.569, 'precision': 0.299, 'sensitivity': 0.7

In [40]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.614, 'precision': 0.306, 'sensitivity': 0.632, 'f1_score': 0.412, 'fbeta_2': 0.521, 'roc_auc': np.float64(0.679), 'NNS': 3.273, 'best_params': OrderedDict([('C', 0.00022962349947547856), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.61, 'precision': 0.321, 'sensitivity': 0.738, 'f1_score': 0.447, 'fbeta_2': 0.586, 'roc_auc': np.float64(0.699), 'NNS': 3.12, 'best_params': OrderedDict([('C', 0.00019739492283452339), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.617, 'precision': 0.303, 'sensitivity': 0.612, 'f1_score': 0.406, 'fbeta_2': 0.508, 'roc_auc': np.float64(0.664), 'NNS': 3.296, 'best_params': OrderedDict([('C', 0.006325991548214911), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.605, 'precision': 0.306, 'sensitivity': 0.673, 'f1_score': 0.421, 'fbeta_2': 0.543, 'roc_auc': np.float64(0.671), 'NNS': 3.263, 'best_params': OrderedDict([('C', 0.000140

In [41]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.619, 'precision': 0.316, 'sensitivity': 0.67, 'f1_score': 0.43, 'fbeta_2': 0.548, 'roc_auc': np.float64(0.689), 'NNS': 3.16, 'best_params': OrderedDict([('C', 0.000720178702871792), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.598, 'precision': 0.313, 'sensitivity': 0.738, 'f1_score': 0.439, 'fbeta_2': 0.58, 'roc_auc': np.float64(0.686), 'NNS': 3.198, 'best_params': OrderedDict([('C', 0.007564210311264189), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.592, 'precision': 0.299, 'sensitivity': 0.681, 'f1_score': 0.416, 'fbeta_2': 0.543, 'roc_auc': np.float64(0.658), 'NNS': 3.339, 'best_params': OrderedDict([('C', 0.005815732359243949), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.59, 'precision': 0.29, 'sensitivity': 0.635, 'f1_score': 0.398, 'fbeta_2': 0.513, 'roc_auc': np.float64(0.654), 'NNS': 3.448, 'best_params': OrderedDict([('C', 0.0012790570317768033

In [42]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.683, 'precision': 0.345, 'sensitivity': 0.533, 'f1_score': 0.419, 'fbeta_2': 0.48, 'roc_auc': np.float64(0.667), 'NNS': 2.899, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 2, 'accuracy': 0.571, 'precision': 0.291, 'sensitivity': 0.7, 'f1_score': 0.411, 'fbeta_2': 0.546, 'roc_auc': np.float64(0.649), 'NNS': 3.44, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 6), ('min_samples_leaf', 10), ('min_samples_split', 3)])}
{'fold': 3, 'accuracy': 0.612, 'precision': 0.29, 'sensitivity': 0.562, 'f1_score': 0.382, 'fbeta_2': 0.473, 'roc_auc': np.float64(0.603), 'NNS': 3.452, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 8), ('min_samples_leaf', 1), ('min_samples_split', 3)])}
{'fold': 4, 'accuracy': 0.635, 'precision': 0.304, 'sensitivity': 0.554, 'f1_score': 0.393, 'fbeta_2': 0.476, 'roc_auc': np.float64(0.6

In [43]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.533, 'precision': 0.287, 'sensitivity': 0.793, 'f1_score': 0.421, 'fbeta_2': 0.586, 'roc_auc': np.float64(0.628), 'NNS': 3.488, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 20), ('min_samples_split', 5), ('n_estimators', 246)])}
{'fold': 2, 'accuracy': 0.516, 'precision': 0.28, 'sensitivity': 0.808, 'f1_score': 0.416, 'fbeta_2': 0.587, 'roc_auc': np.float64(0.662), 'NNS': 3.571, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 16), ('min_samples_split', 10), ('n_estimators', 59)])}
{'fold': 3, 'accuracy': 0.493, 'precision': 0.268, 'sensitivity': 0.796, 'f1_score': 0.401, 'fbeta_2': 0.571, 'roc_auc': np.float64(0.603), 'NNS': 3.729, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 5), ('min_samples_split', 5), ('n_estimators', 294)])}
{'

In [44]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.533, 'precision': 0.287, 'sensitivity': 0.793, 'f1_score': 0.421, 'fbeta_2': 0.586, 'roc_auc': np.float64(0.637), 'NNS': 3.488, 'best_params': OrderedDict([('colsample_bytree', 0.9552728116822058), ('gamma', 4), ('learning_rate', 0.009446821441851432), ('max_depth', 1), ('min_child_weight', 18), ('n_estimators', 10), ('subsample', 1.0)])}
{'fold': 2, 'accuracy': 0.61, 'precision': 0.317, 'sensitivity': 0.719, 'f1_score': 0.441, 'fbeta_2': 0.574, 'roc_auc': np.float64(0.706), 'NNS': 3.15, 'best_params': OrderedDict([('colsample_bytree', 0.8134452027865562), ('gamma', 4), ('learning_rate', 0.012940481902536691), ('max_depth', 50), ('min_child_weight', 17), ('n_estimators', 90), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.513, 'precision': 0.277, 'sensitivity': 0.796, 'f1_score': 0.411, 'fbeta_2': 0.579, 'roc_auc': np.float64(0.665), 'NNS': 3.609, 'best_params': OrderedDict([('colsample_bytree', 0.7334683305756435), ('gamma', 0

In [45]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.214, 'precision': 0.214, 'sensitivity': 1.0, 'f1_score': 0.353, 'fbeta_2': 0.577, 'roc_auc': np.float64(0.637), 'NNS': 4.67, 'best_params': OrderedDict([('colsample_bytree', 0.9552728116822058), ('gamma', 4), ('learning_rate', 0.009446821441851432), ('max_depth', 1), ('min_child_weight', 18), ('n_estimators', 10), ('subsample', 1.0)])}
{'fold': 2, 'accuracy': 0.445, 'precision': 0.269, 'sensitivity': 0.931, 'f1_score': 0.417, 'fbeta_2': 0.624, 'roc_auc': np.float64(0.706), 'NNS': 3.719, 'best_params': OrderedDict([('colsample_bytree', 0.8134452027865562), ('gamma', 4), ('learning_rate', 0.012940481902536691), ('max_depth', 50), ('min_child_weight', 17), ('n_estimators', 90), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.213, 'precision': 0.213, 'sensitivity': 1.0, 'f1_score': 0.352, 'fbeta_2': 0.576, 'roc_auc': np.float64(0.665), 'NNS': 4.685, 'best_params': OrderedDict([('colsample_bytree', 0.7334683305756435), ('gamma', 0), 

In [46]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.605, 'precision': 0.289, 'sensitivity': 0.579, 'f1_score': 0.385, 'fbeta_2': 0.482, 'roc_auc': np.float64(0.634), 'NNS': 3.464, 'best_params': {}}
{'fold': 2, 'accuracy': 0.587, 'precision': 0.278, 'sensitivity': 0.585, 'f1_score': 0.377, 'fbeta_2': 0.479, 'roc_auc': np.float64(0.638), 'NNS': 3.599, 'best_params': {}}
{'fold': 3, 'accuracy': 0.608, 'precision': 0.294, 'sensitivity': 0.6, 'f1_score': 0.395, 'fbeta_2': 0.497, 'roc_auc': np.float64(0.631), 'NNS': 3.397, 'best_params': {}}
{'fold': 4, 'accuracy': 0.58, 'precision': 0.27, 'sensitivity': 0.565, 'f1_score': 0.365, 'fbeta_2': 0.464, 'roc_auc': np.float64(0.61), 'NNS': 3.707, 'best_params': {}}
{'fold': 5, 'accuracy': 0.599, 'precision': 0.29, 'sensitivity': 0.605, 'f1_score': 0.393, 'fbeta_2': 0.497, 'roc_auc': np.float64(0.636), 'NNS': 3.443, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.596 ± 0.010
precision: 0.284 ± 0.009
sensitivity: 0.58

## y_stk_or_aemb

In [47]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.27, 'precision': 0.024, 'sensitivity': 0.786, 'f1_score': 0.047, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.594), 'NNS': 41.182, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 2, 'accuracy': 0.367, 'precision': 0.029, 'sensitivity': 0.852, 'f1_score': 0.056, 'fbeta_2': 0.128, 'roc_auc': np.float64(0.642), 'NNS': 34.348, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.089, 'precision': 0.023, 'sensitivity': 0.963, 'f1_score': 0.045, 'fbeta_2': 0.105, 'roc_auc': np.float64(0.494), 'NNS': 43.654, 'best_params': OrderedDict([('var_smoothing', 1.3787826946465175e-11)])}
{'fold': 4, 'accuracy': 0.331, 'precision': 0.024, 'sensitivity': 0.741, 'f1_score': 0.047, 'fbeta_2': 0.107, 'roc_auc': np.float64(0.543), 'NNS': 41.4, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 5, 'accuracy': 0.272, 'precision': 0.023, 'sensitivity': 0.778, 'f1_score': 0.045, 'fbeta_2': 0.104, 'roc_auc'

In [48]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.761, 'precision': 0.045, 'sensitivity': 0.464, 'f1_score': 0.082, 'fbeta_2': 0.162, 'roc_auc': np.float64(0.651), 'NNS': 22.231, 'best_params': OrderedDict([('C', 0.39745073812040177), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.775, 'precision': 0.037, 'sensitivity': 0.37, 'f1_score': 0.068, 'fbeta_2': 0.133, 'roc_auc': np.float64(0.628), 'NNS': 26.7, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.764, 'precision': 0.032, 'sensitivity': 0.333, 'f1_score': 0.059, 'fbeta_2': 0.116, 'roc_auc': np.float64(0.637), 'NNS': 31.0, 'best_params': OrderedDict([('C', 0.589666833681593), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.727, 'precision': 0.028, 'sensitivity': 0.333, 'f1_score': 0.051, 'fbeta_2': 0.104, 'roc_auc': np.float64(0.584), 'NNS': 35.889, 'best_params': OrderedDict([('C', 487.9169484427551)

In [49]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.779, 'precision': 0.045, 'sensitivity': 0.429, 'f1_score': 0.082, 'fbeta_2': 0.159, 'roc_auc': np.float64(0.645), 'NNS': 22.167, 'best_params': OrderedDict([('C', 72.73082081096716), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.776, 'precision': 0.038, 'sensitivity': 0.37, 'f1_score': 0.068, 'fbeta_2': 0.134, 'roc_auc': np.float64(0.622), 'NNS': 26.6, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.779, 'precision': 0.038, 'sensitivity': 0.37, 'f1_score': 0.069, 'fbeta_2': 0.135, 'roc_auc': np.float64(0.642), 'NNS': 26.2, 'best_params': OrderedDict([('C', 305.30513480136455), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.757, 'precision': 0.028, 'sensitivity': 0.296, 'f1_score': 0.051, 'fbeta_2': 0.102, 'roc_auc': np.float64(0.583), 'NNS': 35.625, 'best_params': OrderedDict([('C', 5.418805102104851), ('pen

In [50]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.836, 'precision': 0.017, 'sensitivity': 0.107, 'f1_score': 0.029, 'fbeta_2': 0.052, 'roc_auc': np.float64(0.53), 'NNS': 59.333, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 19), ('min_samples_leaf', 2), ('min_samples_split', 10)])}
{'fold': 2, 'accuracy': 0.799, 'precision': 0.005, 'sensitivity': 0.037, 'f1_score': 0.008, 'fbeta_2': 0.015, 'roc_auc': np.float64(0.498), 'NNS': 220.0, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 14), ('min_samples_leaf', 10), ('min_samples_split', 20)])}
{'fold': 3, 'accuracy': 0.801, 'precision': 0.031, 'sensitivity': 0.259, 'f1_score': 0.055, 'fbeta_2': 0.104, 'roc_auc': np.float64(0.582), 'NNS': 32.714, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 13), ('min_samples_leaf', 10), ('min_samples_split', 20)])}
{'fold': 4, 'accuracy': 0.797, 'precision': 0.045, 'sensitivity': 0.407, 'f1_score': 0.082, 'fbeta_2': 0.157, 'roc_auc': np.f

In [51]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.815, 'precision': 0.024, 'sensitivity': 0.179, 'f1_score': 0.042, 'fbeta_2': 0.078, 'roc_auc': np.float64(0.539), 'NNS': 41.6, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 32), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 17), ('n_estimators', 190)])}
{'fold': 2, 'accuracy': 0.755, 'precision': 0.021, 'sensitivity': 0.222, 'f1_score': 0.039, 'fbeta_2': 0.077, 'roc_auc': np.float64(0.493), 'NNS': 47.167, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 44), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 90)])}
{'fold': 3, 'accuracy': 0.758, 'precision': 0.021, 'sensitivity': 0.222, 'f1_score': 0.039, 'fbeta_2': 0.077, 'roc_auc': np.float64(0.512), 'NNS': 46.667, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 45), ('max_features', None), ('min_samples_leaf', 2), ('min_samples_split', 2), ('n_estimators', 179)])

In [52]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.947, 'precision': 0.026, 'sensitivity': 0.036, 'f1_score': 0.03, 'fbeta_2': 0.033, 'roc_auc': np.float64(0.634), 'NNS': 38.0, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 5), ('learning_rate', 0.1), ('max_depth', 20), ('min_child_weight', 2), ('n_estimators', 131), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.892, 'precision': 0.035, 'sensitivity': 0.148, 'f1_score': 0.057, 'fbeta_2': 0.09, 'roc_auc': np.float64(0.601), 'NNS': 28.25, 'best_params': OrderedDict([('colsample_bytree', 0.6756992604048166), ('gamma', 1), ('learning_rate', 0.09132970816909401), ('max_depth', 23), ('min_child_weight', 7), ('n_estimators', 500), ('subsample', 0.5240192784386454)])}
{'fold': 3, 'accuracy': 0.927, 'precision': 0.03, 'sensitivity': 0.074, 'f1_score': 0.043, 'fbeta_2': 0.057, 'roc_auc': np.float64(0.444), 'NNS': 33.0, 'best_params': OrderedDict([('colsample_bytree', 0.8326055855336129), ('gamma', 2), ('learning_rate',

In [54]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    xgb_04=True
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.906, 'precision': 0.042, 'sensitivity': 0.143, 'f1_score': 0.065, 'fbeta_2': 0.097, 'roc_auc': np.float64(0.634), 'NNS': 23.75, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 5), ('learning_rate', 0.1), ('max_depth', 20), ('min_child_weight', 2), ('n_estimators', 131), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.854, 'precision': 0.031, 'sensitivity': 0.185, 'f1_score': 0.053, 'fbeta_2': 0.093, 'roc_auc': np.float64(0.601), 'NNS': 32.2, 'best_params': OrderedDict([('colsample_bytree', 0.6756992604048166), ('gamma', 1), ('learning_rate', 0.09132970816909401), ('max_depth', 23), ('min_child_weight', 7), ('n_estimators', 500), ('subsample', 0.5240192784386454)])}
{'fold': 3, 'accuracy': 0.869, 'precision': 0.028, 'sensitivity': 0.148, 'f1_score': 0.048, 'fbeta_2': 0.08, 'roc_auc': np.float64(0.444), 'NNS': 35.25, 'best_params': OrderedDict([('colsample_bytree', 0.8326055855336129), ('gamma', 2), ('learning_rat

In [6]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.864, 'precision': 0.04, 'sensitivity': 0.214, 'f1_score': 0.067, 'fbeta_2': 0.115, 'roc_auc': np.float64(0.604), 'NNS': 25.0, 'best_params': {}}
{'fold': 2, 'accuracy': 0.847, 'precision': 0.035, 'sensitivity': 0.222, 'f1_score': 0.061, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.601), 'NNS': 28.5, 'best_params': {}}
{'fold': 3, 'accuracy': 0.856, 'precision': 0.019, 'sensitivity': 0.111, 'f1_score': 0.033, 'fbeta_2': 0.057, 'roc_auc': np.float64(0.635), 'NNS': 51.667, 'best_params': {}}
{'fold': 4, 'accuracy': 0.856, 'precision': 0.019, 'sensitivity': 0.111, 'f1_score': 0.033, 'fbeta_2': 0.057, 'roc_auc': np.float64(0.54), 'NNS': 51.667, 'best_params': {}}
{'fold': 5, 'accuracy': 0.861, 'precision': 0.033, 'sensitivity': 0.185, 'f1_score': 0.056, 'fbeta_2': 0.096, 'roc_auc': np.float64(0.577), 'NNS': 30.4, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.857 ± 0.006
precision: 0.029 ± 0.008
sensitivity: 0