# Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, fbeta_score

from skopt.space import Integer, Categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score

from skopt import BayesSearchCV

from typing import Dict, List, Literal
from skopt.space import Real, Integer, Categorical

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")

## Read dataset

In [3]:
data = pd.read_csv("../data/processed/20. FINAL_mean_delta_multi_output.csv")
print(data.shape)
data.head()

(6091, 210)


Unnamed: 0,Hba1c,Hba1c Time,Hba1c FM,Hba1c FM Time,BMI,BMI Time,Cancer,Cancer Time,Carotid Disease,Carotid Disease Time,...,y_stk_or_aemb_3_months,y_stk_or_aemb_6_months,y_stk_or_aemb_12_months,y_stk_or_aemb_24_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,-501,6.26,-501,20.7,-19,0,10000,0,10000,...,0,0,0,0,0,0,1,0,0,0
1,6.26,-501,6.26,-501,26.7,-780,1,-4846,0,10000,...,0,0,0,0,0,0,1,0,0,0
2,5.8,-287,6.3,-2701,31.1,-35,0,10000,0,10000,...,0,0,0,0,0,0,1,1,1,1
3,6.26,-501,6.26,-501,21.3,-207,0,10000,0,10000,...,0,0,0,0,0,1,1,0,0,1
4,5.9,-162,5.4,-5209,37.8,-554,1,-86,0,10000,...,0,0,0,0,0,1,1,1,1,0


## Drop columns

In [4]:
cols_to_drop = [col for col in data.columns if col.endswith('_t')]
data = data.drop(columns=cols_to_drop)

targets = [col for col in data.columns if col.startswith('y_')]
print(targets)

cols_to_drop = [col for col in data.columns if col.endswith('Time')]
data = data.drop(columns=cols_to_drop)


cols_to_drop = [col for col in data.columns if "FM" in col]
data = data.drop(columns=cols_to_drop)

print(data.shape)
data.head()

['y_acs', 'y_aemb', 'y_cvdeath', 'y_death', 'y_hf', 'y_inp', 'y_stk', 'y_acs_1_month', 'y_acs_3_months', 'y_acs_6_months', 'y_acs_12_months', 'y_acs_24_months', 'y_aemb_1_month', 'y_aemb_3_months', 'y_aemb_6_months', 'y_aemb_12_months', 'y_aemb_24_months', 'y_cvdeath_1_month', 'y_cvdeath_3_months', 'y_cvdeath_6_months', 'y_cvdeath_12_months', 'y_cvdeath_24_months', 'y_death_1_month', 'y_death_3_months', 'y_death_6_months', 'y_death_12_months', 'y_death_24_months', 'y_hf_1_month', 'y_hf_3_months', 'y_hf_6_months', 'y_hf_12_months', 'y_hf_24_months', 'y_inp_1_month', 'y_inp_3_months', 'y_inp_6_months', 'y_inp_12_months', 'y_inp_24_months', 'y_stk_1_month', 'y_stk_3_months', 'y_stk_6_months', 'y_stk_12_months', 'y_stk_24_months', 'y_stk_or_aemb_1_month', 'y_stk_or_aemb_3_months', 'y_stk_or_aemb_6_months', 'y_stk_or_aemb_12_months', 'y_stk_or_aemb_24_months', 'y_stk_or_aemb']
(6091, 124)


Unnamed: 0,Hba1c,BMI,Cancer,Carotid Disease,Coronary Disease,COPD,Creatinine,DBP,Dyslipidemia,eGFR,...,y_stk_or_aemb_3_months,y_stk_or_aemb_6_months,y_stk_or_aemb_12_months,y_stk_or_aemb_24_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,20.7,0,0,0,0,0.99,87.0,1,64.15,...,0,0,0,0,0,0,1,0,0,0
1,6.26,26.7,1,0,0,0,0.53,81.0,0,88.5,...,0,0,0,0,0,0,1,0,0,0
2,5.8,31.1,0,0,0,0,0.88,109.0,1,46.8,...,0,0,0,0,0,0,1,1,1,1
3,6.26,21.3,0,0,0,0,1.56,63.0,1,9.0,...,0,0,0,0,0,1,1,0,0,1
4,5.9,37.8,1,0,1,0,0.64,98.0,1,76.6,...,0,0,0,0,0,1,1,1,1,0


## Models settings

In [5]:
# Define 5-fold stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create F2 scorer
f2_scorer = make_scorer(fbeta_score, beta=2)

# ===================== Naive Bayes =====================
search_space_nb = {
    "var_smoothing": (1e-12, 1e-1, "log-uniform")
}

nb_opt = BayesSearchCV(
    estimator=GaussianNB(),
    search_spaces=search_space_nb,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Logistic Regression =====================
search_space_lr = {
    'C': (1e-4, 1e+3, 'log-uniform'),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr_opt = BayesSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    search_spaces=search_space_lr,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Decision Tree =====================
search_space_dt = {
    "max_depth": Integer(3, 20),
    "min_samples_split": Integer(2, 20),
    "min_samples_leaf": Integer(1, 10),
    "criterion": Categorical(["gini", "entropy"])
}

dt_opt = BayesSearchCV(
    DecisionTreeClassifier(random_state=42),
    search_spaces=search_space_dt,
    n_iter=30,
    scoring=f2_scorer,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# ===================== Random Forest =====================
search_space_rf = {
    'n_estimators': (50, 300),
    'max_depth': (1, 50),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf_opt = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    search_spaces=search_space_rf,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== XGBoost =====================
search_space_xgb = {
    'n_estimators': (10, 500),
    'max_depth': (1, 50),
    'learning_rate': (0.001, 0.1, 'uniform'),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0, 5),
    'min_child_weight': (1, 20)
}

xgb_opt = BayesSearchCV(
    XGBClassifier(random_state=42),
    search_spaces=search_space_xgb,
    scoring=f2_scorer,
    n_iter=30,
    n_jobs=-1,
    random_state=42
)

# ===================== MLP ===============================

mlp_opt = MLPClassifier(max_iter=2000, solver="adam", activation="relu", hidden_layer_sizes=(200,100), random_state=42)

# ===================== Optimizer Map =====================
optimizer_map = {
    "nb": nb_opt,
    "lr": lr_opt,
    "dt": dt_opt,
    "rf": rf_opt,
    "xgb": xgb_opt,
    "mlp": mlp_opt
}


## Define function

In [6]:
SamplingName = Literal["baseline", "undersample", "oversample", "smote", "all"]
ModelName = Literal["nb", "lr", "xgb", "dt", "rf", "mlp"]


def _make_resampler(name: SamplingName, random_state: int, y_train: pd.Series = None):
    if name == "baseline":
        return None
    if name == "undersample":
        if y_train is None:
            raise ValueError("y_train must be provided for undersampling strategy")
        n_minority = y_train.sum()
        n_required = max(int(0.1 * len(y_train)), n_minority * 2)
        n_majority = n_required - n_minority
        n_majority = min(n_majority, (y_train == 0).sum())
        sampling_strategy = {0: n_majority, 1: n_minority}
        return RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=random_state)
    if name == "oversample":
        return RandomOverSampler(random_state=random_state)
    if name == "smote":
        return SMOTE(random_state=random_state)
    raise ValueError(f"Unknown sampling technique: {name}")


def run_cv_with_sampling(
    X_full: pd.DataFrame,
    target_cols: List[str],
    target_name: str,
    optimizer_map: Dict[ModelName, object],
    model_name: ModelName = "nb",
    sampling: SamplingName = "all",
    n_splits: int = 5,
    random_state: int = 42,
    xgb_04 = False,
) -> Dict[str, dict]:
    if target_name not in target_cols:
        raise ValueError("target_name must be inside target_cols")
    missing_targets = [c for c in target_cols if c not in X_full.columns]
    if missing_targets:
        raise ValueError(f"Target columns not in X_full: {missing_targets}")

    y = X_full[target_name].copy()
    X = X_full.drop(columns=target_cols).copy()

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    if sampling == "all":
        techniques = ["baseline", "undersample", "oversample", "smote"]
    else:
        techniques = [sampling]

    results: Dict[str, dict] = {}

    for tech in techniques:
        print(f"\n=== Technique: {tech.upper()} ===")

        fold_results: List[dict] = []
        best_params_per_fold: List[dict] = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if model_name in ["nb", "lr", "mlp"]:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_val = scaler.transform(X_val)

            resampler = _make_resampler(tech, random_state, y_train)
            if resampler is not None:
                X_train_rs, y_train_rs = resampler.fit_resample(X_train, y_train)
            else:
                X_train_rs, y_train_rs = X_train, y_train

            opt = optimizer_map[model_name]
            opt.fit(X_train_rs, y_train_rs)

            if model_name == "mlp":
                best_model = opt
                best_params_per_fold = {}
            else:
                best_model = opt.best_estimator_
                best_params_per_fold.append(getattr(opt, "best_params_", {}))

            y_pred = best_model.predict(X_val)

            if hasattr(best_model, "predict_proba"):
                y_pred_proba = best_model.predict_proba(X_val)[:, 1]
            else:
                scores = best_model.decision_function(X_val)
                smin, smax = scores.min(), scores.max()
                y_pred_proba = (scores - smin) / (smax - smin + 1e-9)

            if model_name == "xgb" and xgb_04:
                y_pred = (y_pred_proba >= 0.4).astype(int)

            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)
            f1 = f1_score(y_val, y_pred, zero_division=0)
            fbeta2 = fbeta_score(y_val, y_pred, beta=2, zero_division=0)
            roc = roc_auc_score(y_val, y_pred_proba)

            fold_metrics = {
                "fold": fold,
                "accuracy": accuracy_score(y_val, y_pred),
                "precision": prec,
                "sensitivity": rec,
                "f1_score": f1,
                "fbeta_2": fbeta2,
                "roc_auc": roc,
                "NNS": (1 / prec) if prec > 0 else np.inf,
                "best_params": getattr(opt, "best_params_", {}),
            }
            fold_results.append(fold_metrics)

            print({k: (round(v, 3) if isinstance(v, float) else v) for k, v in fold_metrics.items()})

        metrics = [
            "accuracy",
            "precision",
            "sensitivity",
            "f1_score",
            "fbeta_2",
            "roc_auc",
            "NNS",
        ]
        mean_std = {
            m: (np.mean([fr[m] for fr in fold_results]), np.std([fr[m] for fr in fold_results]))
            for m in metrics
        }

        print("\nMean scores across folds (", tech, "):")
        for m in metrics:
            mu, sd = mean_std[m]
            print(f"{m}: {mu:.3f} \u00B1 {sd:.3f}")

        summary_df = pd.DataFrame(fold_results)
        results[tech] = {
            "fold_results": fold_results,
            "summary_metrics": summary_df,
            "best_params_per_fold": best_params_per_fold,
        }

    return results

## y_acs

In [24]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.706, 'precision': 0.025, 'sensitivity': 0.6, 'f1_score': 0.048, 'fbeta_2': 0.107, 'roc_auc': np.float64(0.773), 'NNS': 40.111, 'best_params': OrderedDict([('var_smoothing', 0.09979862670673767)])}
{'fold': 2, 'accuracy': 0.635, 'precision': 0.024, 'sensitivity': 0.786, 'f1_score': 0.047, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.758), 'NNS': 41.182, 'best_params': OrderedDict([('var_smoothing', 0.07375101834081149)])}
{'fold': 3, 'accuracy': 0.727, 'precision': 0.029, 'sensitivity': 0.714, 'f1_score': 0.057, 'fbeta_2': 0.127, 'roc_auc': np.float64(0.726), 'NNS': 33.9, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 4, 'accuracy': 0.768, 'precision': 0.021, 'sensitivity': 0.429, 'f1_score': 0.041, 'fbeta_2': 0.089, 'roc_auc': np.float64(0.72), 'NNS': 46.667, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 5, 'accuracy': 0.747, 'precision': 0.017, 'sensitivity': 0.333, 'f1_score': 0.031, 'fbeta_2': 0.069

In [25]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.837, 'precision': 0.035, 'sensitivity': 0.467, 'f1_score': 0.066, 'fbeta_2': 0.136, 'roc_auc': np.float64(0.708), 'NNS': 28.286, 'best_params': OrderedDict([('C', 0.00023356908350548447), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.818, 'precision': 0.036, 'sensitivity': 0.571, 'f1_score': 0.067, 'fbeta_2': 0.143, 'roc_auc': np.float64(0.749), 'NNS': 28.0, 'best_params': OrderedDict([('C', 0.0005663058078694401), ('penalty', 'l2'), ('solver', 'liblinear')])}


{'fold': 3, 'accuracy': 0.812, 'precision': 0.043, 'sensitivity': 0.714, 'f1_score': 0.08, 'fbeta_2': 0.172, 'roc_auc': np.float64(0.835), 'NNS': 23.5, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.764, 'precision': 0.024, 'sensitivity': 0.5, 'f1_score': 0.046, 'fbeta_2': 0.102, 'roc_auc': np.float64(0.69), 'NNS': 41.143, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 5, 'accuracy': 0.813, 'precision': 0.022, 'sensitivity': 0.333, 'f1_score': 0.042, 'fbeta_2': 0.088, 'roc_auc': np.float64(0.664), 'NNS': 44.6, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 'l2'), ('solver', 'liblinear')])}

Mean scores across folds ( undersample ):
accuracy: 0.809 ± 0.024
precision: 0.032 ± 0.008
sensitivity: 0.517 ± 0.125
f1_score: 0.060 ± 0.014
fbeta_2: 0.128 ± 0.030
roc_auc: 0.729 ± 0.060
NNS: 33.106 ± 8.225


In [26]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===


{'fold': 1, 'accuracy': 0.782, 'precision': 0.026, 'sensitivity': 0.467, 'f1_score': 0.05, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.663), 'NNS': 37.857, 'best_params': OrderedDict([('C', 31.821559591646302), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.765, 'precision': 0.024, 'sensitivity': 0.5, 'f1_score': 0.047, 'fbeta_2': 0.102, 'roc_auc': np.float64(0.674), 'NNS': 40.857, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.776, 'precision': 0.026, 'sensitivity': 0.5, 'f1_score': 0.049, 'fbeta_2': 0.106, 'roc_auc': np.float64(0.646), 'NNS': 39.0, 'best_params': OrderedDict([('C', 72.73082081096716), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.791, 'precision': 0.016, 'sensitivity': 0.286, 'f1_score': 0.03, 'fbeta_2': 0.066, 'roc_auc': np.float64(0.639), 'NNS': 62.25, 'best_params': OrderedDict([('C', 0.6365338060906855), ('penalty', 'l2'), ('solver', 'liblinear')])

In [27]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===


{'fold': 1, 'accuracy': 0.792, 'precision': 0.02, 'sensitivity': 0.333, 'f1_score': 0.038, 'fbeta_2': 0.081, 'roc_auc': np.float64(0.639), 'NNS': 49.8, 'best_params': OrderedDict([('C', 5.5000787109147495), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.764, 'precision': 0.021, 'sensitivity': 0.429, 'f1_score': 0.04, 'fbeta_2': 0.088, 'roc_auc': np.float64(0.635), 'NNS': 47.667, 'best_params': OrderedDict([('C', 0.006408831457062401), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.778, 'precision': 0.026, 'sensitivity': 0.5, 'f1_score': 0.049, 'fbeta_2': 0.107, 'roc_auc': np.float64(0.637), 'NNS': 38.714, 'best_params': OrderedDict([('C', 2.0871588778809445), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.729, 'precision': 0.024, 'sensitivity': 0.571, 'f1_score': 0.046, 'fbeta_2': 0.103, 'roc_auc': np.float64(0.742), 'NNS': 41.5, 'best_params': OrderedDict([('C', 0.0021178426349068857), ('penalty', 'l2'), ('solv

In [28]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.886, 'precision': 0.037, 'sensitivity': 0.333, 'f1_score': 0.067, 'fbeta_2': 0.129, 'roc_auc': np.float64(0.601), 'NNS': 26.8, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 3), ('min_samples_split', 8)])}
{'fold': 2, 'accuracy': 0.866, 'precision': 0.013, 'sensitivity': 0.143, 'f1_score': 0.024, 'fbeta_2': 0.048, 'roc_auc': np.float64(0.503), 'NNS': 76.5, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 1), ('min_samples_split', 6)])}
{'fold': 3, 'accuracy': 0.881, 'precision': 0.029, 'sensitivity': 0.286, 'f1_score': 0.052, 'fbeta_2': 0.103, 'roc_auc': np.float64(0.573), 'NNS': 34.75, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 18), ('min_samples_leaf', 2), ('min_samples_split', 2)])}
{'fold': 4, 'accuracy': 0.903, 'precision': 0.019, 'sensitivity': 0.143, 'f1_score': 0.033, 'fbeta_2': 0.061, 'roc_auc': np.float64(0

In [29]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.915, 'precision': 0.041, 'sensitivity': 0.267, 'f1_score': 0.071, 'fbeta_2': 0.127, 'roc_auc': np.float64(0.654), 'NNS': 24.25, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 34), ('max_features', None), ('min_samples_leaf', 3), ('min_samples_split', 18), ('n_estimators', 50)])}
{'fold': 2, 'accuracy': 0.865, 'precision': 0.019, 'sensitivity': 0.214, 'f1_score': 0.035, 'fbeta_2': 0.07, 'roc_auc': np.float64(0.574), 'NNS': 52.333, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 11), ('n_estimators', 94)])}
{'fold': 3, 'accuracy': 0.934, 'precision': 0.015, 'sensitivity': 0.071, 'f1_score': 0.024, 'fbeta_2': 0.04, 'roc_auc': np.float64(0.55), 'NNS': 68.0, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 36), ('max_features', None), ('min_samples_leaf', 10), ('min_samples_split', 10), ('n_estimators', 50)])}
{

In [30]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.962, 'precision': 0.081, 'sensitivity': 0.2, 'f1_score': 0.115, 'fbeta_2': 0.155, 'roc_auc': np.float64(0.666), 'NNS': 12.333, 'best_params': OrderedDict([('colsample_bytree', 0.5151478437913364), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 9), ('min_child_weight', 3), ('n_estimators', 248), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.956, 'precision': 0.085, 'sensitivity': 0.286, 'f1_score': 0.131, 'fbeta_2': 0.194, 'roc_auc': np.float64(0.736), 'NNS': 11.75, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.969, 'precision': 0.071, 'sensitivity': 0.143, 'f1_score': 0.095, 'fbeta_2': 0.119, 'roc_auc': np.float64(0.779), 'NNS': 14.0, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 1), ('learning_rate', 0.0785448140527144), ('max_depth', 6), ('min

In [32]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_acs_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.946, 'precision': 0.053, 'sensitivity': 0.2, 'f1_score': 0.083, 'fbeta_2': 0.128, 'roc_auc': np.float64(0.671), 'NNS': 19.0, 'best_params': {}}
{'fold': 2, 'accuracy': 0.94, 'precision': 0.072, 'sensitivity': 0.357, 'f1_score': 0.12, 'fbeta_2': 0.2, 'roc_auc': np.float64(0.735), 'NNS': 13.8, 'best_params': {}}
{'fold': 3, 'accuracy': 0.941, 'precision': 0.017, 'sensitivity': 0.071, 'f1_score': 0.027, 'fbeta_2': 0.043, 'roc_auc': np.float64(0.675), 'NNS': 60.0, 'best_params': {}}
{'fold': 4, 'accuracy': 0.94, 'precision': 0.016, 'sensitivity': 0.071, 'f1_score': 0.027, 'fbeta_2': 0.043, 'roc_auc': np.float64(0.632), 'NNS': 61.0, 'best_params': {}}
{'fold': 5, 'accuracy': 0.947, 'precision': 0.069, 'sensitivity': 0.267, 'f1_score': 0.11, 'fbeta_2': 0.169, 'roc_auc': np.float64(0.594), 'NNS': 14.5, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.943 ± 0.003
precision: 0.045 ± 0.025
sensitivity: 0.193 ± 0.1

## y_cvdeath_6_months

In [33]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.359, 'precision': 0.048, 'sensitivity': 0.867, 'f1_score': 0.091, 'fbeta_2': 0.196, 'roc_auc': np.float64(0.679), 'NNS': 20.872, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 2, 'accuracy': 0.386, 'precision': 0.048, 'sensitivity': 0.841, 'f1_score': 0.09, 'fbeta_2': 0.194, 'roc_auc': np.float64(0.723), 'NNS': 21.027, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.326, 'precision': 0.047, 'sensitivity': 0.909, 'f1_score': 0.089, 'fbeta_2': 0.194, 'roc_auc': np.float64(0.73), 'NNS': 21.425, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 4, 'accuracy': 0.41, 'precision': 0.051, 'sensitivity': 0.844, 'f1_score': 0.096, 'fbeta_2': 0.204, 'roc_auc': np.float64(0.658), 'NNS': 19.737, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 5, 'accuracy': 0.456, 'precision': 0.059, 'sensitivity': 0.911, 'f1_score': 0.11, 'fbeta_2': 0.233, 'roc_auc': np.float64(0.739),

In [34]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.655, 'precision': 0.077, 'sensitivity': 0.756, 'f1_score': 0.139, 'fbeta_2': 0.272, 'roc_auc': np.float64(0.758), 'NNS': 13.059, 'best_params': OrderedDict([('C', 0.0004690268643975102), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.669, 'precision': 0.08, 'sensitivity': 0.773, 'f1_score': 0.144, 'fbeta_2': 0.282, 'roc_auc': np.float64(0.778), 'NNS': 12.559, 'best_params': OrderedDict([('C', 0.0006392659781938485), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.615, 'precision': 0.079, 'sensitivity': 0.909, 'f1_score': 0.146, 'fbeta_2': 0.294, 'roc_auc': np.float64(0.794), 'NNS': 12.625, 'best_params': OrderedDict([('C', 0.00032269909019619456), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.626, 'precision': 0.076, 'sensitivity': 0.822, 'f1_score': 0.14, 'fbeta_2': 0.279, 'roc_auc': np.float64(0.758), 'NNS': 13.081, 'best_params': OrderedDict([('C', 0.00

In [35]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.65, 'precision': 0.076, 'sensitivity': 0.756, 'f1_score': 0.137, 'fbeta_2': 0.27, 'roc_auc': np.float64(0.765), 'NNS': 13.235, 'best_params': OrderedDict([('C', 0.0005945480662650276), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.69, 'precision': 0.08, 'sensitivity': 0.727, 'f1_score': 0.145, 'fbeta_2': 0.279, 'roc_auc': np.float64(0.769), 'NNS': 12.438, 'best_params': OrderedDict([('C', 0.0010194654000856055), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.639, 'precision': 0.081, 'sensitivity': 0.864, 'f1_score': 0.147, 'fbeta_2': 0.293, 'roc_auc': np.float64(0.78), 'NNS': 12.421, 'best_params': OrderedDict([('C', 0.0005340086603168545), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.687, 'precision': 0.078, 'sensitivity': 0.689, 'f1_score': 0.14, 'fbeta_2': 0.268, 'roc_auc': np.float64(0.746), 'NNS': 12.839, 'best_params': OrderedDict([('C', 0.000978706385

In [36]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.713, 'precision': 0.058, 'sensitivity': 0.444, 'f1_score': 0.103, 'fbeta_2': 0.19, 'roc_auc': np.float64(0.613), 'NNS': 17.25, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 3), ('min_samples_split', 18)])}
{'fold': 2, 'accuracy': 0.716, 'precision': 0.064, 'sensitivity': 0.5, 'f1_score': 0.113, 'fbeta_2': 0.211, 'roc_auc': np.float64(0.661), 'NNS': 15.727, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 9), ('min_samples_leaf', 7), ('min_samples_split', 2)])}
{'fold': 3, 'accuracy': 0.748, 'precision': 0.085, 'sensitivity': 0.614, 'f1_score': 0.15, 'fbeta_2': 0.274, 'roc_auc': np.float64(0.736), 'NNS': 11.741, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 11), ('min_samples_leaf', 6), ('min_samples_split', 16)])}
{'fold': 4, 'accuracy': 0.7, 'precision': 0.056, 'sensitivity': 0.444, 'f1_score': 0.099, 'fbeta_2': 0.185, 'roc_auc': np.float64(0.

In [37]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.82, 'precision': 0.081, 'sensitivity': 0.378, 'f1_score': 0.134, 'fbeta_2': 0.219, 'roc_auc': np.float64(0.607), 'NNS': 12.294, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 1), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 300)])}
{'fold': 2, 'accuracy': 0.713, 'precision': 0.07, 'sensitivity': 0.568, 'f1_score': 0.125, 'fbeta_2': 0.235, 'roc_auc': np.float64(0.664), 'NNS': 14.24, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 39), ('max_features', None), ('min_samples_leaf', 5), ('min_samples_split', 2), ('n_estimators', 300)])}
{'fold': 3, 'accuracy': 0.759, 'precision': 0.081, 'sensitivity': 0.545, 'f1_score': 0.14, 'fbeta_2': 0.253, 'roc_auc': np.float64(0.728), 'NNS': 12.417, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 15), ('min_samples_split', 2), ('n_estimators', 50)])}
{

In [38]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.79, 'precision': 0.08, 'sensitivity': 0.444, 'f1_score': 0.135, 'fbeta_2': 0.232, 'roc_auc': np.float64(0.686), 'NNS': 12.55, 'best_params': OrderedDict([('colsample_bytree', 0.9983989965175464), ('gamma', 0), ('learning_rate', 0.08071118324661555), ('max_depth', 44), ('min_child_weight', 7), ('n_estimators', 379), ('subsample', 0.8166024665290403)])}
{'fold': 2, 'accuracy': 0.817, 'precision': 0.091, 'sensitivity': 0.455, 'f1_score': 0.152, 'fbeta_2': 0.253, 'roc_auc': np.float64(0.768), 'NNS': 10.95, 'best_params': OrderedDict([('colsample_bytree', 0.9802905199997531), ('gamma', 0), ('learning_rate', 0.045889311488422534), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 500), ('subsample', 0.7110650142841602)])}
{'fold': 3, 'accuracy': 0.788, 'precision': 0.101, 'sensitivity': 0.614, 'f1_score': 0.173, 'fbeta_2': 0.304, 'roc_auc': np.float64(0.79), 'NNS': 9.926, 'best_params': OrderedDict([('colsample_bytree', 1.0)

In [39]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_cvdeath_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.767, 'precision': 0.081, 'sensitivity': 0.511, 'f1_score': 0.139, 'fbeta_2': 0.247, 'roc_auc': np.float64(0.718), 'NNS': 12.391, 'best_params': {}}
{'fold': 2, 'accuracy': 0.744, 'precision': 0.065, 'sensitivity': 0.455, 'f1_score': 0.114, 'fbeta_2': 0.207, 'roc_auc': np.float64(0.667), 'NNS': 15.4, 'best_params': {}}
{'fold': 3, 'accuracy': 0.719, 'precision': 0.062, 'sensitivity': 0.477, 'f1_score': 0.109, 'fbeta_2': 0.203, 'roc_auc': np.float64(0.681), 'NNS': 16.19, 'best_params': {}}
{'fold': 4, 'accuracy': 0.768, 'precision': 0.084, 'sensitivity': 0.533, 'f1_score': 0.145, 'fbeta_2': 0.258, 'roc_auc': np.float64(0.687), 'NNS': 11.917, 'best_params': {}}
{'fold': 5, 'accuracy': 0.745, 'precision': 0.087, 'sensitivity': 0.622, 'f1_score': 0.153, 'fbeta_2': 0.279, 'roc_auc': np.float64(0.753), 'NNS': 11.464, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.749 ± 0.018
precision: 0.076 ± 0.010
sensitivi

## y_death_6_months

In [40]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.309, 'precision': 0.06, 'sensitivity': 0.883, 'f1_score': 0.112, 'fbeta_2': 0.235, 'roc_auc': np.float64(0.681), 'NNS': 16.755, 'best_params': OrderedDict([('var_smoothing', 0.09975217360887728)])}
{'fold': 2, 'accuracy': 0.271, 'precision': 0.058, 'sensitivity': 0.915, 'f1_score': 0.108, 'fbeta_2': 0.23, 'roc_auc': np.float64(0.685), 'NNS': 17.352, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.272, 'precision': 0.06, 'sensitivity': 0.949, 'f1_score': 0.112, 'fbeta_2': 0.238, 'roc_auc': np.float64(0.695), 'NNS': 16.786, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 4, 'accuracy': 0.239, 'precision': 0.053, 'sensitivity': 0.881, 'f1_score': 0.101, 'fbeta_2': 0.215, 'roc_auc': np.float64(0.665), 'NNS': 18.692, 'best_params': OrderedDict([('var_smoothing', 0.09909284870061752)])}
{'fold': 5, 'accuracy': 0.289, 'precision': 0.064, 'sensitivity': 0.983, 'f1_score': 0.12, 'fbeta_2': 0.25

In [41]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.628, 'precision': 0.091, 'sensitivity': 0.733, 'f1_score': 0.162, 'fbeta_2': 0.305, 'roc_auc': np.float64(0.742), 'NNS': 10.955, 'best_params': OrderedDict([('C', 0.00046871452930658214), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.581, 'precision': 0.092, 'sensitivity': 0.864, 'f1_score': 0.167, 'fbeta_2': 0.323, 'roc_auc': np.float64(0.795), 'NNS': 10.843, 'best_params': OrderedDict([('C', 0.00020944089717429826), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.597, 'precision': 0.097, 'sensitivity': 0.881, 'f1_score': 0.175, 'fbeta_2': 0.337, 'roc_auc': np.float64(0.778), 'NNS': 10.308, 'best_params': OrderedDict([('C', 0.00021182343339935965), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.626, 'precision': 0.094, 'sensitivity': 0.78, 'f1_score': 0.168, 'fbeta_2': 0.318, 'roc_auc': np.float64(0.754), 'NNS': 10.609, 'best_params': OrderedDict([('C', 0.0002

In [42]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.605, 'precision': 0.086, 'sensitivity': 0.733, 'f1_score': 0.154, 'fbeta_2': 0.293, 'roc_auc': np.float64(0.671), 'NNS': 11.591, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 4), ('min_samples_leaf', 5), ('min_samples_split', 15)])}
{'fold': 2, 'accuracy': 0.683, 'precision': 0.084, 'sensitivity': 0.559, 'f1_score': 0.146, 'fbeta_2': 0.262, 'roc_auc': np.float64(0.668), 'NNS': 11.909, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 3), ('min_samples_leaf', 6), ('min_samples_split', 19)])}
{'fold': 3, 'accuracy': 0.556, 'precision': 0.086, 'sensitivity': 0.847, 'f1_score': 0.156, 'fbeta_2': 0.306, 'roc_auc': np.float64(0.715), 'NNS': 11.64, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 10), ('min_samples_split', 2)])}
{'fold': 4, 'accuracy': 0.55, 'precision': 0.078, 'sensitivity': 0.763, 'f1_score': 0.141, 'fbeta_2': 0.276, 'roc_auc': np.float6

In [43]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.649, 'precision': 0.086, 'sensitivity': 0.633, 'f1_score': 0.151, 'fbeta_2': 0.278, 'roc_auc': np.float64(0.725), 'NNS': 11.684, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 15), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 16), ('n_estimators', 195)])}
{'fold': 2, 'accuracy': 0.673, 'precision': 0.112, 'sensitivity': 0.831, 'f1_score': 0.198, 'fbeta_2': 0.364, 'roc_auc': np.float64(0.793), 'NNS': 8.918, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 19), ('n_estimators', 109)])}
{'fold': 3, 'accuracy': 0.676, 'precision': 0.1, 'sensitivity': 0.712, 'f1_score': 0.175, 'fbeta_2': 0.32, 'roc_auc': np.float64(0.769), 'NNS': 10.0, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 14), ('min_samples_split', 2), ('n_estimators', 300)]

In [44]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.699, 'precision': 0.105, 'sensitivity': 0.683, 'f1_score': 0.183, 'fbeta_2': 0.326, 'roc_auc': np.float64(0.749), 'NNS': 9.488, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 4), ('learning_rate', 0.09810230417646906), ('max_depth', 50), ('min_child_weight', 7), ('n_estimators', 442), ('subsample', 0.8173662655131322)])}
{'fold': 2, 'accuracy': 0.675, 'precision': 0.111, 'sensitivity': 0.814, 'f1_score': 0.195, 'fbeta_2': 0.359, 'roc_auc': np.float64(0.79), 'NNS': 9.021, 'best_params': OrderedDict([('colsample_bytree', 0.7015972493898902), ('gamma', 5), ('learning_rate', 0.1), ('max_depth', 41), ('min_child_weight', 1), ('n_estimators', 122), ('subsample', 0.9110749910920206)])}
{'fold': 3, 'accuracy': 0.644, 'precision': 0.09, 'sensitivity': 0.695, 'f1_score': 0.159, 'fbeta_2': 0.296, 'roc_auc': np.float64(0.69), 'NNS': 11.122, 'best_params': OrderedDict([('colsample_bytree', 0.8962159842007731), ('gamma', 4),

In [45]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_death_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.672, 'precision': 0.093, 'sensitivity': 0.65, 'f1_score': 0.163, 'fbeta_2': 0.296, 'roc_auc': np.float64(0.724), 'NNS': 10.718, 'best_params': {}}
{'fold': 2, 'accuracy': 0.647, 'precision': 0.089, 'sensitivity': 0.678, 'f1_score': 0.157, 'fbeta_2': 0.291, 'roc_auc': np.float64(0.716), 'NNS': 11.275, 'best_params': {}}
{'fold': 3, 'accuracy': 0.689, 'precision': 0.094, 'sensitivity': 0.627, 'f1_score': 0.163, 'fbeta_2': 0.294, 'roc_auc': np.float64(0.688), 'NNS': 10.649, 'best_params': {}}
{'fold': 4, 'accuracy': 0.681, 'precision': 0.092, 'sensitivity': 0.627, 'f1_score': 0.16, 'fbeta_2': 0.29, 'roc_auc': np.float64(0.708), 'NNS': 10.892, 'best_params': {}}
{'fold': 5, 'accuracy': 0.675, 'precision': 0.1, 'sensitivity': 0.7, 'f1_score': 0.175, 'fbeta_2': 0.318, 'roc_auc': np.float64(0.743), 'NNS': 10.0, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.673 ± 0.014
precision: 0.094 ± 0.004
sensitivity: 0.

## y_hf_6_months

In [47]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.537, 'precision': 0.18, 'sensitivity': 0.768, 'f1_score': 0.291, 'fbeta_2': 0.464, 'roc_auc': np.float64(0.685), 'NNS': 5.56, 'best_params': OrderedDict([('var_smoothing', 0.06892272039488176)])}
{'fold': 2, 'accuracy': 0.585, 'precision': 0.197, 'sensitivity': 0.773, 'f1_score': 0.314, 'fbeta_2': 0.488, 'roc_auc': np.float64(0.721), 'NNS': 5.069, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 3, 'accuracy': 0.594, 'precision': 0.2, 'sensitivity': 0.762, 'f1_score': 0.317, 'fbeta_2': 0.488, 'roc_auc': np.float64(0.725), 'NNS': 4.991, 'best_params': OrderedDict([('var_smoothing', 0.06264158997342416)])}
{'fold': 4, 'accuracy': 0.603, 'precision': 0.199, 'sensitivity': 0.728, 'f1_score': 0.313, 'fbeta_2': 0.476, 'roc_auc': np.float64(0.696), 'NNS': 5.018, 'best_params': OrderedDict([('var_smoothing', 0.0996692821042565)])}
{'fold': 5, 'accuracy': 0.534, 'precision': 0.19, 'sensitivity': 0.848, 'f1_score': 0.311, 'fbet

In [48]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.663, 'precision': 0.221, 'sensitivity': 0.682, 'f1_score': 0.334, 'fbeta_2': 0.481, 'roc_auc': np.float64(0.743), 'NNS': 4.524, 'best_params': OrderedDict([('C', 0.0014705453761746973), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.658, 'precision': 0.234, 'sensitivity': 0.78, 'f1_score': 0.359, 'fbeta_2': 0.531, 'roc_auc': np.float64(0.773), 'NNS': 4.282, 'best_params': OrderedDict([('C', 0.0004393166723520355), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.716, 'precision': 0.27, 'sensitivity': 0.755, 'f1_score': 0.397, 'fbeta_2': 0.555, 'roc_auc': np.float64(0.796), 'NNS': 3.711, 'best_params': OrderedDict([('C', 0.0013257231478996509), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.701, 'precision': 0.249, 'sensitivity': 0.702, 'f1_score': 0.368, 'fbeta_2': 0.515, 'roc_auc': np.float64(0.763), 'NNS': 4.009, 'best_params': OrderedDict([('C', 0.000951238481

In [49]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.613, 'precision': 0.213, 'sensitivity': 0.788, 'f1_score': 0.335, 'fbeta_2': 0.512, 'roc_auc': np.float64(0.733), 'NNS': 4.697, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 3), ('min_samples_leaf', 10), ('min_samples_split', 19)])}
{'fold': 2, 'accuracy': 0.688, 'precision': 0.234, 'sensitivity': 0.673, 'f1_score': 0.347, 'fbeta_2': 0.489, 'roc_auc': np.float64(0.71), 'NNS': 4.277, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 4), ('min_samples_leaf', 4), ('min_samples_split', 19)])}
{'fold': 3, 'accuracy': 0.709, 'precision': 0.25, 'sensitivity': 0.675, 'f1_score': 0.365, 'fbeta_2': 0.504, 'roc_auc': np.float64(0.738), 'NNS': 4.0, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 4), ('min_samples_leaf', 10), ('min_samples_split', 2)])}
{'fold': 4, 'accuracy': 0.677, 'precision': 0.206, 'sensitivity': 0.563, 'f1_score': 0.302, 'fbeta_2': 0.418, 'roc_auc': np.float64

In [50]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.662, 'precision': 0.234, 'sensitivity': 0.762, 'f1_score': 0.358, 'fbeta_2': 0.525, 'roc_auc': np.float64(0.757), 'NNS': 4.27, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 2, 'accuracy': 0.667, 'precision': 0.238, 'sensitivity': 0.773, 'f1_score': 0.364, 'fbeta_2': 0.534, 'roc_auc': np.float64(0.772), 'NNS': 4.198, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 3, 'accuracy': 0.72, 'precision': 0.271, 'sensitivity': 0.742, 'f1_score': 0.396, 'fbeta_2': 0.55, 'roc_auc': np.float64(0.788), 'NNS': 3.696, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 12), ('max_features', 'sqrt'), ('min_samples_leaf', 20), ('min_samples_split', 2), ('n_estimators', 50)]

In [51]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.675, 'precision': 0.237, 'sensitivity': 0.728, 'f1_score': 0.357, 'fbeta_2': 0.514, 'roc_auc': np.float64(0.766), 'NNS': 4.227, 'best_params': OrderedDict([('colsample_bytree', 0.5706450197463859), ('gamma', 5), ('learning_rate', 0.1), ('max_depth', 15), ('min_child_weight', 20), ('n_estimators', 10), ('subsample', 0.8540141162519486)])}
{'fold': 2, 'accuracy': 0.669, 'precision': 0.236, 'sensitivity': 0.753, 'f1_score': 0.359, 'fbeta_2': 0.524, 'roc_auc': np.float64(0.755), 'NNS': 4.239, 'best_params': OrderedDict([('colsample_bytree', 0.5292300451951665), ('gamma', 5), ('learning_rate', 0.028862383159566848), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 10), ('subsample', 0.8537480666869095)])}
{'fold': 3, 'accuracy': 0.712, 'precision': 0.269, 'sensitivity': 0.768, 'f1_score': 0.398, 'fbeta_2': 0.56, 'roc_auc': np.float64(0.78), 'NNS': 3.724, 'best_params': OrderedDict([('colsample_bytree', 0.6682920119032947),

In [52]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.646, 'precision': 0.212, 'sensitivity': 0.682, 'f1_score': 0.323, 'fbeta_2': 0.472, 'roc_auc': np.float64(0.707), 'NNS': 4.718, 'best_params': {}}
{'fold': 2, 'accuracy': 0.661, 'precision': 0.212, 'sensitivity': 0.647, 'f1_score': 0.32, 'fbeta_2': 0.459, 'roc_auc': np.float64(0.721), 'NNS': 4.711, 'best_params': {}}
{'fold': 3, 'accuracy': 0.654, 'precision': 0.225, 'sensitivity': 0.735, 'f1_score': 0.345, 'fbeta_2': 0.506, 'roc_auc': np.float64(0.73), 'NNS': 4.441, 'best_params': {}}
{'fold': 4, 'accuracy': 0.654, 'precision': 0.214, 'sensitivity': 0.669, 'f1_score': 0.324, 'fbeta_2': 0.469, 'roc_auc': np.float64(0.709), 'NNS': 4.673, 'best_params': {}}
{'fold': 5, 'accuracy': 0.623, 'precision': 0.194, 'sensitivity': 0.649, 'f1_score': 0.299, 'fbeta_2': 0.442, 'roc_auc': np.float64(0.703), 'NNS': 5.143, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.648 ± 0.013
precision: 0.212 ± 0.010
sensitivity: 

## y_inp_6_months

In [53]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.449, 'precision': 0.257, 'sensitivity': 0.835, 'f1_score': 0.394, 'fbeta_2': 0.576, 'roc_auc': np.float64(0.63), 'NNS': 3.885, 'best_params': OrderedDict([('var_smoothing', 0.00026235672098000784)])}
{'fold': 2, 'accuracy': 0.464, 'precision': 0.262, 'sensitivity': 0.831, 'f1_score': 0.398, 'fbeta_2': 0.579, 'roc_auc': np.float64(0.653), 'NNS': 3.819, 'best_params': OrderedDict([('var_smoothing', 0.0006238610296676051)])}
{'fold': 3, 'accuracy': 0.518, 'precision': 0.266, 'sensitivity': 0.715, 'f1_score': 0.388, 'fbeta_2': 0.535, 'roc_auc': np.float64(0.625), 'NNS': 3.758, 'best_params': OrderedDict([('var_smoothing', 1.325677268771663e-10)])}
{'fold': 4, 'accuracy': 0.305, 'precision': 0.227, 'sensitivity': 0.935, 'f1_score': 0.365, 'fbeta_2': 0.575, 'roc_auc': np.float64(0.634), 'NNS': 4.412, 'best_params': OrderedDict([('var_smoothing', 1.0963100107414223e-12)])}
{'fold': 5, 'accuracy': 0.446, 'precision': 0.26, 'sensitivity': 0

In [54]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.622, 'precision': 0.315, 'sensitivity': 0.655, 'f1_score': 0.426, 'fbeta_2': 0.539, 'roc_auc': np.float64(0.679), 'NNS': 3.17, 'best_params': OrderedDict([('C', 0.0011600454162844262), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.579, 'precision': 0.3, 'sensitivity': 0.727, 'f1_score': 0.424, 'fbeta_2': 0.566, 'roc_auc': np.float64(0.675), 'NNS': 3.339, 'best_params': OrderedDict([('C', 0.00988172488781246), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.599, 'precision': 0.296, 'sensitivity': 0.638, 'f1_score': 0.405, 'fbeta_2': 0.519, 'roc_auc': np.float64(0.646), 'NNS': 3.373, 'best_params': OrderedDict([('C', 0.008361879008621117), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.584, 'precision': 0.286, 'sensitivity': 0.635, 'f1_score': 0.394, 'fbeta_2': 0.51, 'roc_auc': np.float64(0.65), 'NNS': 3.497, 'best_params': OrderedDict([('C', 0.000574018948847378

In [55]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.535, 'precision': 0.24, 'sensitivity': 0.54, 'f1_score': 0.332, 'fbeta_2': 0.432, 'roc_auc': np.float64(0.565), 'NNS': 4.17, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 7), ('min_samples_split', 20)])}
{'fold': 2, 'accuracy': 0.659, 'precision': 0.302, 'sensitivity': 0.454, 'f1_score': 0.363, 'fbeta_2': 0.412, 'roc_auc': np.float64(0.63), 'NNS': 3.314, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 3), ('min_samples_leaf', 1), ('min_samples_split', 14)])}
{'fold': 3, 'accuracy': 0.629, 'precision': 0.286, 'sensitivity': 0.492, 'f1_score': 0.362, 'fbeta_2': 0.43, 'roc_auc': np.float64(0.587), 'NNS': 3.5, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 4), ('min_samples_leaf', 4), ('min_samples_split', 17)])}
{'fold': 4, 'accuracy': 0.612, 'precision': 0.292, 'sensitivity': 0.573, 'f1_score': 0.387, 'fbeta_2': 0.481, 'roc_auc': np.float64(0.614), 

In [56]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.601, 'precision': 0.306, 'sensitivity': 0.678, 'f1_score': 0.421, 'fbeta_2': 0.545, 'roc_auc': np.float64(0.673), 'NNS': 3.271, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 16), ('max_features', None), ('min_samples_leaf', 3), ('min_samples_split', 18), ('n_estimators', 147)])}
{'fold': 2, 'accuracy': 0.589, 'precision': 0.3, 'sensitivity': 0.696, 'f1_score': 0.419, 'fbeta_2': 0.551, 'roc_auc': np.float64(0.681), 'NNS': 3.331, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 48), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 200)])}
{'fold': 3, 'accuracy': 0.606, 'precision': 0.307, 'sensitivity': 0.673, 'f1_score': 0.422, 'fbeta_2': 0.543, 'roc_auc': np.float64(0.667), 'NNS': 3.257, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 32), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 223)]

In [57]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.627, 'precision': 0.318, 'sensitivity': 0.651, 'f1_score': 0.428, 'fbeta_2': 0.539, 'roc_auc': np.float64(0.673), 'NNS': 3.141, 'best_params': OrderedDict([('colsample_bytree', 0.7162912155050629), ('gamma', 5), ('learning_rate', 0.010336469196656064), ('max_depth', 25), ('min_child_weight', 20), ('n_estimators', 260), ('subsample', 0.5098594836424245)])}
{'fold': 2, 'accuracy': 0.597, 'precision': 0.299, 'sensitivity': 0.662, 'f1_score': 0.412, 'fbeta_2': 0.533, 'roc_auc': np.float64(0.67), 'NNS': 3.343, 'best_params': OrderedDict([('colsample_bytree', 0.8897976055075922), ('gamma', 0), ('learning_rate', 0.001), ('max_depth', 22), ('min_child_weight', 1), ('n_estimators', 396), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.622, 'precision': 0.308, 'sensitivity': 0.615, 'f1_score': 0.41, 'fbeta_2': 0.513, 'roc_auc': np.float64(0.657), 'NNS': 3.25, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 5), ('learning_

In [58]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.596, 'precision': 0.282, 'sensitivity': 0.571, 'f1_score': 0.377, 'fbeta_2': 0.474, 'roc_auc': np.float64(0.617), 'NNS': 3.55, 'best_params': {}}
{'fold': 2, 'accuracy': 0.566, 'precision': 0.259, 'sensitivity': 0.558, 'f1_score': 0.354, 'fbeta_2': 0.453, 'roc_auc': np.float64(0.589), 'NNS': 3.855, 'best_params': {}}
{'fold': 3, 'accuracy': 0.584, 'precision': 0.278, 'sensitivity': 0.596, 'f1_score': 0.379, 'fbeta_2': 0.485, 'roc_auc': np.float64(0.622), 'NNS': 3.594, 'best_params': {}}
{'fold': 4, 'accuracy': 0.576, 'precision': 0.266, 'sensitivity': 0.562, 'f1_score': 0.361, 'fbeta_2': 0.459, 'roc_auc': np.float64(0.612), 'NNS': 3.76, 'best_params': {}}
{'fold': 5, 'accuracy': 0.575, 'precision': 0.276, 'sensitivity': 0.605, 'f1_score': 0.379, 'fbeta_2': 0.489, 'roc_auc': np.float64(0.631), 'NNS': 3.627, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.579 ± 0.010
precision: 0.272 ± 0.008
sensitivity: 

## y_stk_or_aemb

In [59]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="nb",
    sampling="all",
    n_splits=5,
    random_state=42,
)


=== Technique: BASELINE ===
{'fold': 1, 'accuracy': 0.118, 'precision': 0.023, 'sensitivity': 0.893, 'f1_score': 0.044, 'fbeta_2': 0.103, 'roc_auc': np.float64(0.572), 'NNS': 43.88, 'best_params': OrderedDict([('var_smoothing', 0.00016833560494324088)])}
{'fold': 2, 'accuracy': 0.156, 'precision': 0.023, 'sensitivity': 0.889, 'f1_score': 0.045, 'fbeta_2': 0.104, 'roc_auc': np.float64(0.573), 'NNS': 43.708, 'best_params': OrderedDict([('var_smoothing', 0.045909390680620464)])}
{'fold': 3, 'accuracy': 0.076, 'precision': 0.023, 'sensitivity': 1.0, 'f1_score': 0.046, 'fbeta_2': 0.107, 'roc_auc': np.float64(0.485), 'NNS': 42.667, 'best_params': OrderedDict([('var_smoothing', 3.5588214738093886e-12)])}
{'fold': 4, 'accuracy': 0.12, 'precision': 0.024, 'sensitivity': 0.963, 'f1_score': 0.046, 'fbeta_2': 0.108, 'roc_auc': np.float64(0.527), 'NNS': 42.192, 'best_params': OrderedDict([('var_smoothing', 0.1)])}
{'fold': 5, 'accuracy': 0.183, 'precision': 0.021, 'sensitivity': 0.778, 'f1_score':

In [60]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.622, 'precision': 0.036, 'sensitivity': 0.607, 'f1_score': 0.069, 'fbeta_2': 0.147, 'roc_auc': np.float64(0.624), 'NNS': 27.471, 'best_params': OrderedDict([('C', 0.0007019978261793328), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.629, 'precision': 0.033, 'sensitivity': 0.556, 'f1_score': 0.062, 'fbeta_2': 0.133, 'roc_auc': np.float64(0.627), 'NNS': 30.333, 'best_params': OrderedDict([('C', 0.0011874560998179724), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.61, 'precision': 0.037, 'sensitivity': 0.667, 'f1_score': 0.07, 'fbeta_2': 0.152, 'roc_auc': np.float64(0.658), 'NNS': 26.889, 'best_params': OrderedDict([('C', 0.0003134839566716342), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.646, 'precision': 0.032, 'sensitivity': 0.519, 'f1_score': 0.061, 'fbeta_2': 0.13, 'roc_auc': np.float64(0.601), 'NNS': 30.857, 'best_params': OrderedDict([('C', 0.0011

In [61]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="dt",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.761, 'precision': 0.025, 'sensitivity': 0.25, 'f1_score': 0.046, 'fbeta_2': 0.09, 'roc_auc': np.float64(0.518), 'NNS': 39.571, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 2, 'accuracy': 0.763, 'precision': 0.025, 'sensitivity': 0.259, 'f1_score': 0.046, 'fbeta_2': 0.091, 'roc_auc': np.float64(0.517), 'NNS': 39.429, 'best_params': OrderedDict([('criterion', 'gini'), ('max_depth', 19), ('min_samples_leaf', 1), ('min_samples_split', 2)])}
{'fold': 3, 'accuracy': 0.766, 'precision': 0.033, 'sensitivity': 0.333, 'f1_score': 0.059, 'fbeta_2': 0.117, 'roc_auc': np.float64(0.505), 'NNS': 30.667, 'best_params': OrderedDict([('criterion', 'entropy'), ('max_depth', 20), ('min_samples_leaf', 10), ('min_samples_split', 20)])}
{'fold': 4, 'accuracy': 0.795, 'precision': 0.017, 'sensitivity': 0.148, 'f1_score': 0.031, 'fbeta_2': 0.059, 'roc_auc': np.floa

In [62]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.797, 'precision': 0.03, 'sensitivity': 0.25, 'f1_score': 0.053, 'fbeta_2': 0.101, 'roc_auc': np.float64(0.519), 'NNS': 33.429, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 45), ('max_features', None), ('min_samples_leaf', 5), ('min_samples_split', 3), ('n_estimators', 300)])}
{'fold': 2, 'accuracy': 0.763, 'precision': 0.035, 'sensitivity': 0.37, 'f1_score': 0.065, 'fbeta_2': 0.128, 'roc_auc': np.float64(0.543), 'NNS': 28.2, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 36), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 6), ('n_estimators', 74)])}
{'fold': 3, 'accuracy': 0.833, 'precision': 0.016, 'sensitivity': 0.111, 'f1_score': 0.029, 'fbeta_2': 0.052, 'roc_auc': np.float64(0.568), 'NNS': 60.667, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 44), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}
{

In [63]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.916, 'precision': 0.037, 'sensitivity': 0.107, 'f1_score': 0.056, 'fbeta_2': 0.078, 'roc_auc': np.float64(0.543), 'NNS': 26.667, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 50), ('min_child_weight', 1), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.886, 'precision': 0.017, 'sensitivity': 0.074, 'f1_score': 0.028, 'fbeta_2': 0.045, 'roc_auc': np.float64(0.591), 'NNS': 58.0, 'best_params': OrderedDict([('colsample_bytree', 0.7752207775160657), ('gamma', 0), ('learning_rate', 0.07420112768680359), ('max_depth', 18), ('min_child_weight', 6), ('n_estimators', 393), ('subsample', 0.8180764659633659)])}
{'fold': 3, 'accuracy': 0.888, 'precision': 0.026, 'sensitivity': 0.111, 'f1_score': 0.042, 'fbeta_2': 0.067, 'roc_auc': np.float64(0.545), 'NNS': 38.667, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 0), ('learning_rate', 0.1), ('

In [64]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb",
    optimizer_map=optimizer_map,
    model_name="mlp",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.852, 'precision': 0.048, 'sensitivity': 0.286, 'f1_score': 0.082, 'fbeta_2': 0.143, 'roc_auc': np.float64(0.599), 'NNS': 21.0, 'best_params': {}}
{'fold': 2, 'accuracy': 0.833, 'precision': 0.027, 'sensitivity': 0.185, 'f1_score': 0.047, 'fbeta_2': 0.085, 'roc_auc': np.float64(0.578), 'NNS': 37.2, 'best_params': {}}
{'fold': 3, 'accuracy': 0.856, 'precision': 0.037, 'sensitivity': 0.222, 'f1_score': 0.064, 'fbeta_2': 0.112, 'roc_auc': np.float64(0.62), 'NNS': 26.833, 'best_params': {}}
{'fold': 4, 'accuracy': 0.861, 'precision': 0.027, 'sensitivity': 0.148, 'f1_score': 0.045, 'fbeta_2': 0.078, 'roc_auc': np.float64(0.528), 'NNS': 37.5, 'best_params': {}}
{'fold': 5, 'accuracy': 0.875, 'precision': 0.037, 'sensitivity': 0.185, 'f1_score': 0.062, 'fbeta_2': 0.103, 'roc_auc': np.float64(0.547), 'NNS': 27.0, 'best_params': {}}

Mean scores across folds ( undersample ):
accuracy: 0.856 ± 0.014
precision: 0.035 ± 0.008
sensitivity: 0.

## Other time intervals

In [68]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb_3_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.879, 'precision': 0.007, 'sensitivity': 0.2, 'f1_score': 0.013, 'fbeta_2': 0.03, 'roc_auc': np.float64(0.713), 'NNS': 144.0, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.886, 'precision': 0.0, 'sensitivity': 0.0, 'f1_score': 0.0, 'fbeta_2': 0.0, 'roc_auc': np.float64(0.448), 'NNS': inf, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.896, 'precision': 0.0, 'sensitivity': 0.0, 'f1_score': 0.0, 'fbeta_2': 0.0, 'roc_auc': np.float64(0.508), 'NNS': inf, 'best_params': OrderedDict([('C', 48.61518228524981), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.906, 'precision': 0.0, 'sensitivity': 0.0, 'f1_score': 0.0, 'fbeta_2': 0.0, 'roc_auc': np.float64(0.625), 'NNS': inf, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold':

In [65]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb_6_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.819, 'precision': 0.014, 'sensitivity': 0.333, 'f1_score': 0.026, 'fbeta_2': 0.059, 'roc_auc': np.float64(0.597), 'NNS': 72.667, 'best_params': OrderedDict([('C', 18.490125311170186), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.731, 'precision': 0.015, 'sensitivity': 0.625, 'f1_score': 0.03, 'fbeta_2': 0.069, 'roc_auc': np.float64(0.677), 'NNS': 66.0, 'best_params': OrderedDict([('C', 0.007317739953312007), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.776, 'precision': 0.011, 'sensitivity': 0.375, 'f1_score': 0.022, 'fbeta_2': 0.05, 'roc_auc': np.float64(0.646), 'NNS': 90.333, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.785, 'precision': 0.008, 'sensitivity': 0.25, 'f1_score': 0.015, 'fbeta_2': 0.034, 'roc_auc': np.float64(0.464), 'NNS': 129.0, 'best_params': OrderedDict([('C', 72.73082081096716

In [66]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb_12_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.783, 'precision': 0.019, 'sensitivity': 0.417, 'f1_score': 0.036, 'fbeta_2': 0.081, 'roc_auc': np.float64(0.607), 'NNS': 52.4, 'best_params': OrderedDict([('C', 72.73082081096716), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.756, 'precision': 0.003, 'sensitivity': 0.091, 'f1_score': 0.007, 'fbeta_2': 0.015, 'roc_auc': np.float64(0.494), 'NNS': 288.0, 'best_params': OrderedDict([('C', 0.025372859356249205), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.701, 'precision': 0.022, 'sensitivity': 0.727, 'f1_score': 0.042, 'fbeta_2': 0.097, 'roc_auc': np.float64(0.787), 'NNS': 46.125, 'best_params': OrderedDict([('C', 48.61518228524981), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.764, 'precision': 0.021, 'sensitivity': 0.545, 'f1_score': 0.04, 'fbeta_2': 0.09, 'roc_auc': np.float64(0.696), 'NNS': 48.0, 'best_params': OrderedDict([('C', 72.73082081096716),

In [67]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_stk_or_aemb_24_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="oversample",
    n_splits=5,
    random_state=42,
)


=== Technique: OVERSAMPLE ===
{'fold': 1, 'accuracy': 0.622, 'precision': 0.017, 'sensitivity': 0.5, 'f1_score': 0.034, 'fbeta_2': 0.076, 'roc_auc': np.float64(0.607), 'NNS': 57.625, 'best_params': OrderedDict([('C', 0.0006565146233750605), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.653, 'precision': 0.019, 'sensitivity': 0.5, 'f1_score': 0.036, 'fbeta_2': 0.082, 'roc_auc': np.float64(0.621), 'NNS': 52.875, 'best_params': OrderedDict([('C', 0.002389404689685648), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.74, 'precision': 0.016, 'sensitivity': 0.312, 'f1_score': 0.031, 'fbeta_2': 0.067, 'roc_auc': np.float64(0.527), 'NNS': 62.2, 'best_params': OrderedDict([('C', 5.06862674475771), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.686, 'precision': 0.021, 'sensitivity': 0.5, 'f1_score': 0.04, 'fbeta_2': 0.09, 'roc_auc': np.float64(0.581), 'NNS': 47.75, 'best_params': OrderedDict([('C', 0.30507492737441433),

## other time intervals slope based rf

In [9]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_1_month",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.589, 'precision': 0.127, 'sensitivity': 0.615, 'f1_score': 0.211, 'fbeta_2': 0.348, 'roc_auc': np.float64(0.638), 'NNS': 7.851, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 50)])}
{'fold': 2, 'accuracy': 0.593, 'precision': 0.141, 'sensitivity': 0.697, 'f1_score': 0.235, 'fbeta_2': 0.39, 'roc_auc': np.float64(0.694), 'NNS': 7.092, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 43), ('max_features', 'log2'), ('min_samples_leaf', 3), ('min_samples_split', 15), ('n_estimators', 296)])}
{'fold': 3, 'accuracy': 0.603, 'precision': 0.129, 'sensitivity': 0.596, 'f1_score': 0.212, 'fbeta_2': 0.345, 'roc_auc': np.float64(0.66), 'NNS': 7.769, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 50)])}

In [10]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_3_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.609, 'precision': 0.24, 'sensitivity': 0.667, 'f1_score': 0.353, 'fbeta_2': 0.492, 'roc_auc': np.float64(0.688), 'NNS': 4.169, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 41), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 2, 'accuracy': 0.605, 'precision': 0.231, 'sensitivity': 0.634, 'f1_score': 0.338, 'fbeta_2': 0.47, 'roc_auc': np.float64(0.657), 'NNS': 4.333, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 44), ('max_features', 'sqrt'), ('min_samples_leaf', 19), ('min_samples_split', 18), ('n_estimators', 66)])}
{'fold': 3, 'accuracy': 0.611, 'precision': 0.254, 'sensitivity': 0.738, 'f1_score': 0.378, 'fbeta_2': 0.535, 'roc_auc': np.float64(0.714), 'NNS': 3.938, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 33), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 30

In [11]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_6_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===


{'fold': 1, 'accuracy': 0.601, 'precision': 0.306, 'sensitivity': 0.678, 'f1_score': 0.421, 'fbeta_2': 0.545, 'roc_auc': np.float64(0.673), 'NNS': 3.271, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 16), ('max_features', None), ('min_samples_leaf', 3), ('min_samples_split', 18), ('n_estimators', 147)])}
{'fold': 2, 'accuracy': 0.589, 'precision': 0.3, 'sensitivity': 0.696, 'f1_score': 0.419, 'fbeta_2': 0.551, 'roc_auc': np.float64(0.681), 'NNS': 3.331, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 48), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 200)])}
{'fold': 3, 'accuracy': 0.606, 'precision': 0.307, 'sensitivity': 0.673, 'f1_score': 0.422, 'fbeta_2': 0.543, 'roc_auc': np.float64(0.667), 'NNS': 3.257, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 32), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 223)])}
{'fold': 4, 'accuracy': 0.589

In [12]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_12_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.587, 'precision': 0.382, 'sensitivity': 0.664, 'f1_score': 0.485, 'fbeta_2': 0.578, 'roc_auc': np.float64(0.672), 'NNS': 2.62, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 2, 'accuracy': 0.624, 'precision': 0.42, 'sensitivity': 0.75, 'f1_score': 0.538, 'fbeta_2': 0.648, 'roc_auc': np.float64(0.697), 'NNS': 2.382, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 31), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 290)])}
{'fold': 3, 'accuracy': 0.609, 'precision': 0.401, 'sensitivity': 0.685, 'f1_score': 0.506, 'fbeta_2': 0.6, 'roc_auc': np.float64(0.683), 'NNS': 2.492, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])

In [13]:
results = run_cv_with_sampling(
    X_full=data,
    target_cols=targets,
    target_name="y_inp_24_months",
    optimizer_map=optimizer_map,
    model_name="rf",
    sampling="undersample",
    n_splits=5,
    random_state=42,
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.603, 'precision': 0.492, 'sensitivity': 0.686, 'f1_score': 0.573, 'fbeta_2': 0.636, 'roc_auc': np.float64(0.669), 'NNS': 2.031, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 298)])}
{'fold': 2, 'accuracy': 0.613, 'precision': 0.501, 'sensitivity': 0.71, 'f1_score': 0.588, 'fbeta_2': 0.656, 'roc_auc': np.float64(0.682), 'NNS': 1.994, 'best_params': OrderedDict([('bootstrap', False), ('max_depth', 50), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 300)])}
{'fold': 3, 'accuracy': 0.617, 'precision': 0.505, 'sensitivity': 0.71, 'f1_score': 0.59, 'fbeta_2': 0.657, 'roc_auc': np.float64(0.675), 'NNS': 1.982, 'best_params': OrderedDict([('bootstrap', True), ('max_depth', 50), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 198