# Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, fbeta_score

from skopt.space import Integer, Categorical

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import fbeta_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
import shap

from skopt import BayesSearchCV

from typing import Dict, List, Literal
from skopt.space import Real, Integer, Categorical

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")

## Read dataset

In [2]:
data = pd.read_csv("../data/processed/20. FINAL_mean_delta_multi_output.csv")
print(data.shape)
data.head()

(6091, 210)


Unnamed: 0,Hba1c,Hba1c Time,Hba1c FM,Hba1c FM Time,BMI,BMI Time,Cancer,Cancer Time,Carotid Disease,Carotid Disease Time,...,y_stk_or_aemb_3_months,y_stk_or_aemb_6_months,y_stk_or_aemb_12_months,y_stk_or_aemb_24_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,-501,6.26,-501,20.7,-19,0,10000,0,10000,...,0,0,0,0,0,0,1,0,0,0
1,6.26,-501,6.26,-501,26.7,-780,1,-4846,0,10000,...,0,0,0,0,0,0,1,0,0,0
2,5.8,-287,6.3,-2701,31.1,-35,0,10000,0,10000,...,0,0,0,0,0,0,1,1,1,1
3,6.26,-501,6.26,-501,21.3,-207,0,10000,0,10000,...,0,0,0,0,0,1,1,0,0,1
4,5.9,-162,5.4,-5209,37.8,-554,1,-86,0,10000,...,0,0,0,0,0,1,1,1,1,0


## Drop columns

In [3]:
targets = ["y_acs_6_months", "y_cvdeath_6_months", "y_death_6_months", "y_hf_6_months", "y_inp_6_months", "y_stk_or_aemb"]

# keep only target Ys, drop any other y_* cols first
cols_to_drop = [col for col in data.columns if col.startswith('y_') and col not in targets]
data_all_features = data.drop(columns=cols_to_drop)

# start longitudinal from this cleaned version
data_slope = data_all_features.copy()

# drop *_t
cols_to_drop = [col for col in data_slope.columns if col.endswith('Time')]
data_slope = data_slope.drop(columns=cols_to_drop)

# drop FM
cols_to_drop = [col for col in data_slope.columns if "FM" in col]
data_slope = data_slope.drop(columns=cols_to_drop)

# static = longitudinal without delta
data_static = data_slope.copy()

cols_to_drop = [col for col in data_slope.columns if col.startswith('Δ')]
data_static = data_static.drop(columns=cols_to_drop)


In [4]:
data_static.head()

Unnamed: 0,Hba1c,BMI,Cancer,Carotid Disease,Coronary Disease,COPD,Creatinine,DBP,Dyslipidemia,eGFR,...,y_cvdeath_6_months,y_death_6_months,y_hf_6_months,y_inp_6_months,y_stk_or_aemb,History of Vascular Disease,Antihypertensive Medication,Diabetes Mellitus,Diabetes Medication,Abnormal Kidney Function
0,6.26,20.7,0,0,0,0,0.99,87.0,1,64.15,...,0,0,0,0,0,0,1,0,0,0
1,6.26,26.7,1,0,0,0,0.53,81.0,0,88.5,...,0,0,1,1,0,0,1,0,0,0
2,5.8,31.1,0,0,0,0,0.88,109.0,1,46.8,...,0,0,0,0,0,0,1,1,1,1
3,6.26,21.3,0,0,0,0,1.56,63.0,1,9.0,...,0,0,0,0,0,1,1,0,0,1
4,5.9,37.8,1,0,1,0,0.64,98.0,1,76.6,...,0,0,1,1,0,1,1,1,1,0


## Models settings

In [4]:
# Define 5-fold stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create F2 scorer
f2_scorer = make_scorer(fbeta_score, beta=2)

# ===================== Naive Bayes =====================
search_space_nb = {
    "var_smoothing": (1e-12, 1e-1, "log-uniform")
}

nb_opt = BayesSearchCV(
    estimator=GaussianNB(),
    search_spaces=search_space_nb,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Logistic Regression =====================
search_space_lr = {
    'C': (1e-4, 1e+3, 'log-uniform'),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

lr_opt = BayesSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    search_spaces=search_space_lr,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== Decision Tree =====================
search_space_dt = {
    "max_depth": Integer(3, 20),
    "min_samples_split": Integer(2, 20),
    "min_samples_leaf": Integer(1, 10),
    "criterion": Categorical(["gini", "entropy"])
}

dt_opt = BayesSearchCV(
    DecisionTreeClassifier(random_state=42),
    search_spaces=search_space_dt,
    n_iter=30,
    scoring=f2_scorer,
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# ===================== Random Forest =====================
search_space_rf = {
    'n_estimators': (50, 300),
    'max_depth': (1, 50),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf_opt = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    search_spaces=search_space_rf,
    scoring=f2_scorer,
    n_iter=30,
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# ===================== XGBoost =====================
search_space_xgb = {
    'n_estimators': (10, 500),
    'max_depth': (1, 50),
    'learning_rate': (0.001, 0.1, 'uniform'),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'gamma': (0, 5),
    'min_child_weight': (1, 20)
}

xgb_opt = BayesSearchCV(
    XGBClassifier(random_state=42),
    search_spaces=search_space_xgb,
    scoring=f2_scorer,
    n_iter=30,
    n_jobs=-1,
    random_state=42
)

# ===================== MLP ===============================

mlp_opt = MLPClassifier(max_iter=2000, solver="adam", activation="relu", hidden_layer_sizes=(200,100), random_state=42)

# ===================== Optimizer Map =====================
optimizer_map = {
    "nb": nb_opt,
    "lr": lr_opt,
    "dt": dt_opt,
    "rf": rf_opt,
    "xgb": xgb_opt,
    "mlp": mlp_opt
}


## Define function

In [5]:
SamplingName = Literal["baseline", "undersample", "oversample", "smote", "all"]
ModelName = Literal["nb", "lr", "xgb", "dt", "rf", "mlp"]
ExplainerName = Literal["none", "linear", "tree", "kernel"]

def _make_resampler(name: SamplingName, random_state: int, y_train: pd.Series = None):
    if name == "baseline":
        return None
    if name == "undersample":
        if y_train is None:
            raise ValueError("y_train must be provided for undersampling strategy")
        n_minority = y_train.sum()
        n_required = max(int(0.1 * len(y_train)), n_minority * 2)
        n_majority = n_required - n_minority
        n_majority = min(n_majority, (y_train == 0).sum())
        sampling_strategy = {0: n_majority, 1: n_minority}
        return RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=random_state)
    if name == "oversample":
        return RandomOverSampler(random_state=random_state)
    if name == "smote":
        return SMOTE(random_state=random_state)
    raise ValueError(f"Unknown sampling technique: {name}")

def run_cv_with_sampling(
    X_full: pd.DataFrame,
    target_cols: List[str],
    target_name: str,
    optimizer_map: Dict[ModelName, object],
    model_name: ModelName = "nb",
    sampling: SamplingName = "all",
    n_splits: int = 5,
    random_state: int = 42,
    xgb_04: bool = False,
    explainer: ExplainerName = "none",
    max_disp = 20,
) -> Dict[str, dict]:
    if target_name not in target_cols:
        raise ValueError("target_name must be inside target_cols")
    missing_targets = [c for c in target_cols if c not in X_full.columns]
    if missing_targets:
        raise ValueError(f"Target columns not in X_full: {missing_targets}")

    y = X_full[target_name].copy()
    X = X_full.drop(columns=target_cols).copy()

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    if sampling == "all":
        techniques = ["baseline", "undersample", "oversample", "smote"]
    else:
        techniques = [sampling]

    results: Dict[str, dict] = {}
    fold_results: List[dict] = []   # collect across folds

    for tech in techniques:
        print(f"\n=== Technique: {tech.upper()} ===")

        fold_results = []
        best_params_per_fold: List[dict] = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if model_name in ["nb", "lr", "mlp"]:
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_val = scaler.transform(X_val)

            resampler = _make_resampler(tech, random_state, y_train)
            if resampler is not None:
                X_train_rs, y_train_rs = resampler.fit_resample(X_train, y_train)
            else:
                X_train_rs, y_train_rs = X_train, y_train

            opt = optimizer_map[model_name]
            opt.fit(X_train_rs, y_train_rs)

            if model_name == "mlp":
                best_model = opt
                best_params_per_fold = {}
            else:
                best_model = opt.best_estimator_
                best_params_per_fold.append(getattr(opt, "best_params_", {}))

            y_pred = best_model.predict(X_val)

            if hasattr(best_model, "predict_proba"):
                y_pred_proba = best_model.predict_proba(X_val)[:, 1]
            else:
                scores = best_model.decision_function(X_val)
                smin, smax = scores.min(), scores.max()
                y_pred_proba = (scores - smin) / (smax - smin + 1e-9)

            if model_name == "xgb" and xgb_04:
                y_pred = (y_pred_proba >= 0.4).astype(int)

            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)
            f1 = f1_score(y_val, y_pred, zero_division=0)
            fbeta2 = fbeta_score(y_val, y_pred, beta=2, zero_division=0)
            roc = roc_auc_score(y_val, y_pred_proba)

            fold_metrics = {
                "fold": fold,
                "accuracy": accuracy_score(y_val, y_pred),
                "precision": prec,
                "sensitivity": rec,
                "f1_score": f1,
                "fbeta_2": fbeta2,
                "roc_auc": roc,
                "NNS": (1 / prec) if prec > 0 else np.inf,
                "best_params": getattr(opt, "best_params_", {}),
            }
            fold_results.append(fold_metrics)

            print({k: (round(v, 3) if isinstance(v, float) else v) for k, v in fold_metrics.items()})

        metrics = [
            "accuracy",
            "precision",
            "sensitivity",
            "f1_score",
            "fbeta_2",
            "roc_auc",
            "NNS",
        ]
        mean_std = {
            m: (np.mean([fr[m] for fr in fold_results]), np.std([fr[m] for fr in fold_results]))
            for m in metrics
        }

        print("\nMean scores across folds (", tech, "):")
        for m in metrics:
            mu, sd = mean_std[m]
            print(f"{m}: {mu:.3f} \u00B1 {sd:.3f}")

        summary_df = pd.DataFrame(fold_results)
        results[tech] = {
            "fold_results": fold_results,
            "summary_metrics": summary_df,
            "best_params_per_fold": best_params_per_fold,
        }

    # ---- FINAL MODEL FIT AND SHAP ----
    best_params_overall = max(fold_results, key=lambda x: x["f1_score"])["best_params"]

    # rebuild the chosen model with best params
    if model_name == "rf":
        final_model = RandomForestClassifier(**best_params_overall)
    elif model_name == "xgb":
        final_model = XGBClassifier(**best_params_overall, use_label_encoder=False, eval_metric="logloss")
    elif model_name == "dt":
        final_model = DecisionTreeClassifier(**best_params_overall)
    elif model_name == "lr":
        final_model = LogisticRegression(**best_params_overall, max_iter=1000)
    elif model_name == "nb":
        final_model = GaussianNB(**best_params_overall)
    elif model_name == "mlp":
        final_model = MLPClassifier(**best_params_overall, max_iter=2000)
    else:
        raise ValueError(f"Unknown model_name: {model_name}")

    final_model.fit(X, y)

    if explainer != "none":
        X_background_final = X.sample(min(100, len(X)), random_state=random_state)
        X_sample_final = X.sample(min(200, len(X)), random_state=random_state)

        if explainer == "kernel":
            predict_fn_final = lambda x: final_model.predict_proba(x)[:, 1]
            shap_explainer = shap.KernelExplainer(predict_fn_final, X_background_final)
            shap_values_final = shap_explainer.shap_values(X_sample_final)
        elif explainer == "tree":
            shap_explainer = shap.TreeExplainer(final_model)
            shap_values_final = shap_explainer.shap_values(X_sample_final)
        elif explainer == "linear":
            shap_explainer = shap.LinearExplainer(final_model, X_background_final)
            shap_values_final = shap_explainer.shap_values(X_sample_final)

        shap.summary_plot(
            shap_values_final,
            X_sample_final,
            feature_names=X.columns,
            max_display=max_disp
        )

    return results

## Stroke Static LR

In [30]:
targets = [
    "y_stk_or_aemb_1_month", "y_stk_or_aemb_3_months", "y_stk_or_aemb_6_months", 
    "y_stk_or_aemb_12_months", "y_stk_or_aemb_24_months", "y_stk_or_aemb",

    "y_death_1_month", "y_death_3_months", "y_death_6_months", 
    "y_death_12_months", "y_death_24_months", "y_death",

    "y_cvdeath_1_month", "y_cvdeath_3_months", "y_cvdeath_6_months", 
    "y_cvdeath_12_months", "y_cvdeath_24_months", "y_cvdeath",

    "y_hf_1_month", "y_hf_3_months", "y_hf_6_months", 
    "y_hf_12_months", "y_hf_24_months", "y_hf",

    "y_inp_1_month", "y_inp_3_months", "y_inp_6_months", 
    "y_inp_12_months", "y_inp_24_months", "y_inp",

    "y_acs_1_month", "y_acs_3_months", "y_acs_6_months", 
    "y_acs_12_months", "y_acs_24_months", "y_acs"
]

# keep only target Ys, drop any other y_* cols first
cols_to_drop = [col for col in data.columns if col.startswith('y_') and col not in targets]
data_all_features = data.drop(columns=cols_to_drop)

# start longitudinal from this cleaned version
data_slope = data_all_features.copy()

# drop *_t
cols_to_drop = [col for col in data_slope.columns if col.endswith('Time')]
data_slope = data_slope.drop(columns=cols_to_drop)

# drop FM
cols_to_drop = [col for col in data_slope.columns if "FM" in col]
data_slope = data_slope.drop(columns=cols_to_drop)

# static = longitudinal without delta
data_static = data_slope.copy()

cols_to_drop = [col for col in data_slope.columns if col.startswith('Δ')]
data_static = data_static.drop(columns=cols_to_drop)

In [31]:
data_static.shape

(6091, 102)

In [8]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_stk_or_aemb_24_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.572, 'precision': 0.019, 'sensitivity': 0.625, 'f1_score': 0.037, 'fbeta_2': 0.085, 'roc_auc': np.float64(0.606), 'NNS': 52.6, 'best_params': OrderedDict([('C', 0.00025867031730454013), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.588, 'precision': 0.016, 'sensitivity': 0.5, 'f1_score': 0.031, 'fbeta_2': 0.071, 'roc_auc': np.float64(0.537), 'NNS': 62.75, 'best_params': OrderedDict([('C', 0.0004048464889463556), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.568, 'precision': 0.017, 'sensitivity': 0.562, 'f1_score': 0.033, 'fbeta_2': 0.076, 'roc_auc': np.float64(0.525), 'NNS': 58.667, 'best_params': OrderedDict([('C', 0.0001566482327075753), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.61, 'precision': 0.023, 'sensitivity': 0.688, 'f1_score': 0.044, 'fbeta_2': 0.101, 'roc_auc': np.float64(0.667), 'NNS': 43.727, 'best_params': OrderedDict([('C', 0.00445180964

In [9]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_stk_or_aemb_12_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.797, 'precision': 0.016, 'sensitivity': 0.333, 'f1_score': 0.031, 'fbeta_2': 0.068, 'roc_auc': np.float64(0.617), 'NNS': 61.0, 'best_params': OrderedDict([('C', 487.9169484427551), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.698, 'precision': 0.008, 'sensitivity': 0.273, 'f1_score': 0.016, 'fbeta_2': 0.037, 'roc_auc': np.float64(0.552), 'NNS': 121.0, 'best_params': OrderedDict([('C', 0.00277290682409599), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.548, 'precision': 0.016, 'sensitivity': 0.818, 'f1_score': 0.032, 'fbeta_2': 0.075, 'roc_auc': np.float64(0.762), 'NNS': 62.0, 'best_params': OrderedDict([('C', 0.00014419146277986448), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.663, 'precision': 0.017, 'sensitivity': 0.636, 'f1_score': 0.033, 'fbeta_2': 0.076, 'roc_auc': np.float64(0.665), 'NNS': 59.143, 'best_params': OrderedDict([('C', 0.0011081095784807

## All cause death slope based XGB

In [7]:
results = run_cv_with_sampling(
    X_full=data_slope,
    target_cols=targets,
    target_name="y_death_1_month",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.966, 'precision': 0.069, 'sensitivity': 0.118, 'f1_score': 0.087, 'fbeta_2': 0.103, 'roc_auc': np.float64(0.679), 'NNS': 14.5, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 5), ('learning_rate', 0.1), ('max_depth', 50), ('min_child_weight', 1), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.955, 'precision': 0.025, 'sensitivity': 0.059, 'f1_score': 0.035, 'fbeta_2': 0.046, 'roc_auc': np.float64(0.684), 'NNS': 40.0, 'best_params': OrderedDict([('colsample_bytree', 0.9817498389741233), ('gamma', 1), ('learning_rate', 0.1), ('max_depth', 42), ('min_child_weight', 8), ('n_estimators', 500), ('subsample', 0.5363513193967164)])}
{'fold': 3, 'accuracy': 0.919, 'precision': 0.023, 'sensitivity': 0.118, 'f1_score': 0.039, 'fbeta_2': 0.065, 'roc_auc': np.float64(0.664), 'NNS': 43.0, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 1), ('mi

In [8]:
results = run_cv_with_sampling(
    X_full=data_slope,
    target_cols=targets,
    target_name="y_death_3_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.841, 'precision': 0.107, 'sensitivity': 0.525, 'f1_score': 0.178, 'fbeta_2': 0.295, 'roc_auc': np.float64(0.76), 'NNS': 9.333, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 19), ('min_child_weight', 8), ('n_estimators', 322), ('subsample', 0.8966309402917302)])}
{'fold': 2, 'accuracy': 0.837, 'precision': 0.092, 'sensitivity': 0.462, 'f1_score': 0.154, 'fbeta_2': 0.256, 'roc_auc': np.float64(0.747), 'NNS': 10.833, 'best_params': OrderedDict([('colsample_bytree', 0.8441174868111101), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 1), ('min_child_weight', 3), ('n_estimators', 500), ('subsample', 0.9486554558912768)])}
{'fold': 3, 'accuracy': 0.841, 'precision': 0.067, 'sensitivity': 0.3, 'f1_score': 0.11, 'fbeta_2': 0.178, 'roc_auc': np.float64(0.712), 'NNS': 14.833, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 0), ('learning_rate', 0.07148226498

In [9]:
results = run_cv_with_sampling(
    X_full=data_slope,
    target_cols=targets,
    target_name="y_death_12_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.671, 'precision': 0.149, 'sensitivity': 0.779, 'f1_score': 0.25, 'fbeta_2': 0.422, 'roc_auc': np.float64(0.775), 'NNS': 6.701, 'best_params': OrderedDict([('colsample_bytree', 0.705051979426657), ('gamma', 4), ('learning_rate', 0.09335393188593556), ('max_depth', 16), ('min_child_weight', 14), ('n_estimators', 213), ('subsample', 0.675465667449572)])}
{'fold': 2, 'accuracy': 0.66, 'precision': 0.136, 'sensitivity': 0.709, 'f1_score': 0.228, 'fbeta_2': 0.384, 'roc_auc': np.float64(0.722), 'NNS': 7.377, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 5), ('learning_rate', 0.00560773047680431), ('max_depth', 42), ('min_child_weight', 5), ('n_estimators', 205), ('subsample', 0.6789555582274166)])}
{'fold': 3, 'accuracy': 0.709, 'precision': 0.153, 'sensitivity': 0.686, 'f1_score': 0.25, 'fbeta_2': 0.404, 'roc_auc': np.float64(0.785), 'NNS': 6.542, 'best_params': OrderedDict([('colsample_bytree', 0.7864573848001208),

In [10]:
results = run_cv_with_sampling(
    X_full=data_slope,
    target_cols=targets,
    target_name="y_death_24_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.651, 'precision': 0.198, 'sensitivity': 0.772, 'f1_score': 0.315, 'fbeta_2': 0.489, 'roc_auc': np.float64(0.748), 'NNS': 5.051, 'best_params': OrderedDict([('colsample_bytree', 0.8670140089927842), ('gamma', 5), ('learning_rate', 0.01719711068521648), ('max_depth', 10), ('min_child_weight', 16), ('n_estimators', 193), ('subsample', 0.7295122570754029)])}
{'fold': 2, 'accuracy': 0.615, 'precision': 0.188, 'sensitivity': 0.817, 'f1_score': 0.305, 'fbeta_2': 0.489, 'roc_auc': np.float64(0.75), 'NNS': 5.33, 'best_params': OrderedDict([('colsample_bytree', 0.7716369348128362), ('gamma', 5), ('learning_rate', 0.021270867030867287), ('max_depth', 24), ('min_child_weight', 1), ('n_estimators', 125), ('subsample', 0.7558013795036381)])}
{'fold': 3, 'accuracy': 0.666, 'precision': 0.201, 'sensitivity': 0.74, 'f1_score': 0.316, 'fbeta_2': 0.482, 'roc_auc': np.float64(0.753), 'NNS': 4.979, 'best_params': OrderedDict([('colsample_bytree', 0.

## cardiovascular death staticc XGB

In [11]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_cvdeath_1_month",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.959, 'precision': 0.0, 'sensitivity': 0.0, 'f1_score': 0.0, 'fbeta_2': 0.0, 'roc_auc': np.float64(0.547), 'NNS': inf, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 1), ('min_child_weight', 1), ('n_estimators', 490), ('subsample', 0.5)])}
{'fold': 2, 'accuracy': 0.966, 'precision': 0.0, 'sensitivity': 0.0, 'f1_score': 0.0, 'fbeta_2': 0.0, 'roc_auc': np.float64(0.558), 'NNS': inf, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 0), ('learning_rate', 0.08540698745082223), ('max_depth', 50), ('min_child_weight', 3), ('n_estimators', 500), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.975, 'precision': 0.091, 'sensitivity': 0.167, 'f1_score': 0.118, 'fbeta_2': 0.143, 'roc_auc': np.float64(0.849), 'NNS': 11.0, 'best_params': OrderedDict([('colsample_bytree', 0.5168589103299077), ('gamma', 2), ('learning_rate', 0.08029284646562845), ('max_depth', 1), ('min_

In [12]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_cvdeath_3_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.87, 'precision': 0.036, 'sensitivity': 0.161, 'f1_score': 0.06, 'fbeta_2': 0.096, 'roc_auc': np.float64(0.737), 'NNS': 27.4, 'best_params': OrderedDict([('colsample_bytree', 0.6277617351201892), ('gamma', 1), ('learning_rate', 0.1), ('max_depth', 24), ('min_child_weight', 7), ('n_estimators', 500), ('subsample', 0.9188776530827603)])}
{'fold': 2, 'accuracy': 0.881, 'precision': 0.074, 'sensitivity': 0.333, 'f1_score': 0.121, 'fbeta_2': 0.196, 'roc_auc': np.float64(0.728), 'NNS': 13.5, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.1), ('max_depth', 50), ('min_child_weight', 6), ('n_estimators', 440), ('subsample', 1.0)])}
{'fold': 3, 'accuracy': 0.883, 'precision': 0.088, 'sensitivity': 0.4, 'f1_score': 0.145, 'fbeta_2': 0.234, 'roc_auc': np.float64(0.781), 'NNS': 11.333, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 3), ('learning_rate', 0.08217728849368063), ('max_de

In [13]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_cvdeath_12_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.662, 'precision': 0.115, 'sensitivity': 0.739, 'f1_score': 0.198, 'fbeta_2': 0.354, 'roc_auc': np.float64(0.725), 'NNS': 8.725, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 0), ('learning_rate', 0.09293587317507378), ('max_depth', 50), ('min_child_weight', 7), ('n_estimators', 77), ('subsample', 0.9271679445747645)])}
{'fold': 2, 'accuracy': 0.651, 'precision': 0.101, 'sensitivity': 0.652, 'f1_score': 0.175, 'fbeta_2': 0.312, 'roc_auc': np.float64(0.722), 'NNS': 9.911, 'best_params': OrderedDict([('colsample_bytree', 0.705051979426657), ('gamma', 4), ('learning_rate', 0.09335393188593556), ('max_depth', 16), ('min_child_weight', 14), ('n_estimators', 213), ('subsample', 0.675465667449572)])}
{'fold': 3, 'accuracy': 0.642, 'precision': 0.105, 'sensitivity': 0.71, 'f1_score': 0.184, 'fbeta_2': 0.331, 'roc_auc': np.float64(0.745), 'NNS': 9.49, 'best_params': OrderedDict([('colsample_bytree', 0.8523366728727011),

In [14]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_cvdeath_24_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.608, 'precision': 0.143, 'sensitivity': 0.728, 'f1_score': 0.239, 'fbeta_2': 0.4, 'roc_auc': np.float64(0.719), 'NNS': 7.0, 'best_params': OrderedDict([('colsample_bytree', 0.8192702452192342), ('gamma', 1), ('learning_rate', 0.00364909035453866), ('max_depth', 2), ('min_child_weight', 12), ('n_estimators', 482), ('subsample', 0.8300759524616861)])}
{'fold': 2, 'accuracy': 0.581, 'precision': 0.14, 'sensitivity': 0.775, 'f1_score': 0.237, 'fbeta_2': 0.406, 'roc_auc': np.float64(0.745), 'NNS': 7.165, 'best_params': OrderedDict([('colsample_bytree', 0.9470085292975561), ('gamma', 2), ('learning_rate', 0.025132243091102565), ('max_depth', 50), ('min_child_weight', 20), ('n_estimators', 10), ('subsample', 0.5120947255985121)])}
{'fold': 3, 'accuracy': 0.655, 'precision': 0.16, 'sensitivity': 0.735, 'f1_score': 0.263, 'fbeta_2': 0.428, 'roc_auc': np.float64(0.741), 'NNS': 6.24, 'best_params': OrderedDict([('colsample_bytree', 0.77170

## HF hospitalization Static XGB

In [15]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_hf_1_month",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.66, 'precision': 0.108, 'sensitivity': 0.75, 'f1_score': 0.188, 'fbeta_2': 0.342, 'roc_auc': np.float64(0.775), 'NNS': 9.292, 'best_params': OrderedDict([('colsample_bytree', 0.7400867784708406), ('gamma', 4), ('learning_rate', 0.020731198798090593), ('max_depth', 39), ('min_child_weight', 12), ('n_estimators', 445), ('subsample', 0.9322996691803485)])}
{'fold': 2, 'accuracy': 0.521, 'precision': 0.081, 'sensitivity': 0.794, 'f1_score': 0.146, 'fbeta_2': 0.286, 'roc_auc': np.float64(0.74), 'NNS': 12.42, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 5), ('learning_rate', 0.001), ('max_depth', 1), ('min_child_weight', 7), ('n_estimators', 500), ('subsample', 1.0)])}
{'fold': 3, 'accuracy': 0.535, 'precision': 0.094, 'sensitivity': 0.921, 'f1_score': 0.17, 'fbeta_2': 0.333, 'roc_auc': np.float64(0.747), 'NNS': 10.672, 'best_params': OrderedDict([('colsample_bytree', 0.7645913890558189), ('gamma', 5), ('learning_r

In [16]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_hf_3_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.701, 'precision': 0.2, 'sensitivity': 0.722, 'f1_score': 0.313, 'fbeta_2': 0.474, 'roc_auc': np.float64(0.791), 'NNS': 5.012, 'best_params': OrderedDict([('colsample_bytree', 1.0), ('gamma', 2), ('learning_rate', 0.05565792931813125), ('max_depth', 48), ('min_child_weight', 18), ('n_estimators', 212), ('subsample', 0.5390190044265418)])}
{'fold': 2, 'accuracy': 0.666, 'precision': 0.177, 'sensitivity': 0.702, 'f1_score': 0.282, 'fbeta_2': 0.44, 'roc_auc': np.float64(0.739), 'NNS': 5.662, 'best_params': OrderedDict([('colsample_bytree', 0.6245697124321521), ('gamma', 1), ('learning_rate', 0.03653492952239346), ('max_depth', 35), ('min_child_weight', 6), ('n_estimators', 214), ('subsample', 0.5307327998340758)])}
{'fold': 3, 'accuracy': 0.672, 'precision': 0.193, 'sensitivity': 0.789, 'f1_score': 0.31, 'fbeta_2': 0.488, 'roc_auc': np.float64(0.771), 'NNS': 5.178, 'best_params': OrderedDict([('colsample_bytree', 0.9691149773413857)

In [27]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_hf_6_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.673, 'precision': 0.243, 'sensitivity': 0.775, 'f1_score': 0.37, 'fbeta_2': 0.539, 'roc_auc': np.float64(0.749), 'NNS': 4.12, 'best_params': OrderedDict([('colsample_bytree', 0.5604633606673581), ('gamma', 5), ('learning_rate', 0.1), ('max_depth', 36), ('min_child_weight', 14), ('n_estimators', 10), ('subsample', 1.0)])}
{'fold': 2, 'accuracy': 0.647, 'precision': 0.224, 'sensitivity': 0.76, 'f1_score': 0.347, 'fbeta_2': 0.514, 'roc_auc': np.float64(0.751), 'NNS': 4.456, 'best_params': OrderedDict([('colsample_bytree', 0.8260367457426929), ('gamma', 5), ('learning_rate', 0.003384429637393834), ('max_depth', 33), ('min_child_weight', 1), ('n_estimators', 10), ('subsample', 0.5370531719943081)])}
{'fold': 3, 'accuracy': 0.725, 'precision': 0.278, 'sensitivity': 0.762, 'f1_score': 0.407, 'fbeta_2': 0.565, 'roc_auc': np.float64(0.795), 'NNS': 3.6, 'best_params': OrderedDict([('colsample_bytree', 0.6091886815818075), ('gamma', 0), ('

In [17]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_hf_12_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.705, 'precision': 0.316, 'sensitivity': 0.719, 'f1_score': 0.439, 'fbeta_2': 0.573, 'roc_auc': np.float64(0.77), 'NNS': 3.163, 'best_params': OrderedDict([('colsample_bytree', 0.7717015338451563), ('gamma', 5), ('learning_rate', 0.05015020365607496), ('max_depth', 42), ('min_child_weight', 7), ('n_estimators', 35), ('subsample', 0.785388901339449)])}
{'fold': 2, 'accuracy': 0.736, 'precision': 0.347, 'sensitivity': 0.738, 'f1_score': 0.472, 'fbeta_2': 0.603, 'roc_auc': np.float64(0.784), 'NNS': 2.882, 'best_params': OrderedDict([('colsample_bytree', 0.7717015338451563), ('gamma', 5), ('learning_rate', 0.05015020365607496), ('max_depth', 42), ('min_child_weight', 7), ('n_estimators', 35), ('subsample', 0.785388901339449)])}
{'fold': 3, 'accuracy': 0.689, 'precision': 0.305, 'sensitivity': 0.73, 'f1_score': 0.43, 'fbeta_2': 0.571, 'roc_auc': np.float64(0.763), 'NNS': 3.28, 'best_params': OrderedDict([('colsample_bytree', 0.8986475

In [18]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_hf_24_months",
    optimizer_map=optimizer_map,
    model_name="xgb",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.712, 'precision': 0.395, 'sensitivity': 0.717, 'f1_score': 0.509, 'fbeta_2': 0.616, 'roc_auc': np.float64(0.773), 'NNS': 2.533, 'best_params': OrderedDict([('colsample_bytree', 0.8274051494701928), ('gamma', 5), ('learning_rate', 0.0334847796030407), ('max_depth', 15), ('min_child_weight', 15), ('n_estimators', 436), ('subsample', 0.6549818150910632)])}
{'fold': 2, 'accuracy': 0.712, 'precision': 0.397, 'sensitivity': 0.751, 'f1_score': 0.52, 'fbeta_2': 0.638, 'roc_auc': np.float64(0.788), 'NNS': 2.516, 'best_params': OrderedDict([('colsample_bytree', 0.8204726563783876), ('gamma', 1), ('learning_rate', 0.05975315454381301), ('max_depth', 50), ('min_child_weight', 13), ('n_estimators', 53), ('subsample', 0.5)])}
{'fold': 3, 'accuracy': 0.681, 'precision': 0.355, 'sensitivity': 0.652, 'f1_score': 0.46, 'fbeta_2': 0.559, 'roc_auc': np.float64(0.739), 'NNS': 2.818, 'best_params': OrderedDict([('colsample_bytree', 0.5), ('gamma', 4)

## Inpatient Visit Static LR

In [19]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_inp_1_month",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.622, 'precision': 0.145, 'sensitivity': 0.661, 'f1_score': 0.238, 'fbeta_2': 0.386, 'roc_auc': np.float64(0.679), 'NNS': 6.889, 'best_params': OrderedDict([('C', 0.0007066104944685682), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.58, 'precision': 0.135, 'sensitivity': 0.679, 'f1_score': 0.225, 'fbeta_2': 0.375, 'roc_auc': np.float64(0.695), 'NNS': 7.432, 'best_params': OrderedDict([('C', 0.00041335692795483974), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.592, 'precision': 0.141, 'sensitivity': 0.697, 'f1_score': 0.234, 'fbeta_2': 0.389, 'roc_auc': np.float64(0.67), 'NNS': 7.105, 'best_params': OrderedDict([('C', 0.00025647903211168164), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.594, 'precision': 0.135, 'sensitivity': 0.651, 'f1_score': 0.223, 'fbeta_2': 0.369, 'roc_auc': np.float64(0.65), 'NNS': 7.423, 'best_params': OrderedDict([('C', 0.00052484379

In [20]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_inp_3_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.62, 'precision': 0.237, 'sensitivity': 0.621, 'f1_score': 0.343, 'fbeta_2': 0.469, 'roc_auc': np.float64(0.677), 'NNS': 4.215, 'best_params': OrderedDict([('C', 0.0010813846390044723), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.594, 'precision': 0.23, 'sensitivity': 0.66, 'f1_score': 0.341, 'fbeta_2': 0.48, 'roc_auc': np.float64(0.656), 'NNS': 4.344, 'best_params': OrderedDict([('C', 0.0004102370627232382), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.6, 'precision': 0.233, 'sensitivity': 0.651, 'f1_score': 0.343, 'fbeta_2': 0.479, 'roc_auc': np.float64(0.677), 'NNS': 4.299, 'best_params': OrderedDict([('C', 0.008198655839764403), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.569, 'precision': 0.231, 'sensitivity': 0.728, 'f1_score': 0.351, 'fbeta_2': 0.509, 'roc_auc': np.float64(0.676), 'NNS': 4.324, 'best_params': OrderedDict([('C', 0.00612205244150098

In [26]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_inp_12_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.61, 'precision': 0.398, 'sensitivity': 0.641, 'f1_score': 0.491, 'fbeta_2': 0.571, 'roc_auc': np.float64(0.674), 'NNS': 2.515, 'best_params': OrderedDict([('C', 0.014436477424603305), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.626, 'precision': 0.419, 'sensitivity': 0.716, 'f1_score': 0.528, 'fbeta_2': 0.627, 'roc_auc': np.float64(0.69), 'NNS': 2.388, 'best_params': OrderedDict([('C', 0.013487152027021007), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.603, 'precision': 0.393, 'sensitivity': 0.652, 'f1_score': 0.49, 'fbeta_2': 0.576, 'roc_auc': np.float64(0.664), 'NNS': 2.547, 'best_params': OrderedDict([('C', 0.012617988598696388), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.599, 'precision': 0.396, 'sensitivity': 0.695, 'f1_score': 0.504, 'fbeta_2': 0.603, 'roc_auc': np.float64(0.662), 'NNS': 2.528, 'best_params': OrderedDict([('C', 0.0117004560082420

In [21]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_inp_24_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="smote",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: SMOTE ===
{'fold': 1, 'accuracy': 0.602, 'precision': 0.491, 'sensitivity': 0.639, 'f1_score': 0.555, 'fbeta_2': 0.603, 'roc_auc': np.float64(0.647), 'NNS': 2.036, 'best_params': OrderedDict([('C', 0.012268287924771948), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.614, 'precision': 0.502, 'sensitivity': 0.706, 'f1_score': 0.587, 'fbeta_2': 0.653, 'roc_auc': np.float64(0.674), 'NNS': 1.991, 'best_params': OrderedDict([('C', 0.019667187160174765), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.597, 'precision': 0.486, 'sensitivity': 0.647, 'f1_score': 0.555, 'fbeta_2': 0.607, 'roc_auc': np.float64(0.657), 'NNS': 2.059, 'best_params': OrderedDict([('C', 0.01308287746734809), ('penalty', 'l1'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.599, 'precision': 0.486, 'sensitivity': 0.581, 'f1_score': 0.529, 'fbeta_2': 0.559, 'roc_auc': np.float64(0.638), 'NNS': 2.058, 'best_params': OrderedDict([('C', 0.01974823076295

## ACS ALl-features MLP (static lr)

In [22]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_acs_1_month",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.946, 'precision': 0.016, 'sensitivity': 0.167, 'f1_score': 0.029, 'fbeta_2': 0.058, 'roc_auc': np.float64(0.415), 'NNS': 62.0, 'best_params': OrderedDict([('C', 9.47505505290482), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.933, 'precision': 0.0, 'sensitivity': 0.0, 'f1_score': 0.0, 'fbeta_2': 0.0, 'roc_auc': np.float64(0.654), 'NNS': inf, 'best_params': OrderedDict([('C', 8.025939611577696), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.952, 'precision': 0.0, 'sensitivity': 0.0, 'f1_score': 0.0, 'fbeta_2': 0.0, 'roc_auc': np.float64(0.571), 'NNS': inf, 'best_params': OrderedDict([('C', 0.001890058911513031), ('penalty', 'l2'), ('solver', 'liblinear')])}




{'fold': 4, 'accuracy': 0.979, 'precision': 0.0, 'sensitivity': 0.0, 'f1_score': 0.0, 'fbeta_2': 0.0, 'roc_auc': np.float64(0.564), 'NNS': inf, 'best_params': OrderedDict([('C', 0.00046888400366283434), ('penalty', 'l2'), ('solver', 'liblinear')])}




{'fold': 5, 'accuracy': 0.906, 'precision': 0.009, 'sensitivity': 0.167, 'f1_score': 0.017, 'fbeta_2': 0.037, 'roc_auc': np.float64(0.724), 'NNS': 110.0, 'best_params': OrderedDict([('C', 1000.0), ('penalty', 'l1'), ('solver', 'liblinear')])}

Mean scores across folds ( undersample ):
accuracy: 0.943 ± 0.024
precision: 0.005 ± 0.007
sensitivity: 0.067 ± 0.082
f1_score: 0.009 ± 0.012
fbeta_2: 0.019 ± 0.024
roc_auc: 0.586 ± 0.103
NNS: inf ± nan


In [23]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_acs_3_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.918, 'precision': 0.021, 'sensitivity': 0.2, 'f1_score': 0.038, 'fbeta_2': 0.075, 'roc_auc': np.float64(0.642), 'NNS': 47.0, 'best_params': OrderedDict([('C', 0.00010602594470834996), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.889, 'precision': 0.016, 'sensitivity': 0.2, 'f1_score': 0.029, 'fbeta_2': 0.059, 'roc_auc': np.float64(0.688), 'NNS': 64.5, 'best_params': OrderedDict([('C', 0.0002159214391723222), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.842, 'precision': 0.036, 'sensitivity': 0.7, 'f1_score': 0.068, 'fbeta_2': 0.148, 'roc_auc': np.float64(0.837), 'NNS': 28.0, 'best_params': OrderedDict([('C', 0.0003573100304648202), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.85, 'precision': 0.027, 'sensitivity': 0.5, 'f1_score': 0.052, 'fbeta_2': 0.112, 'roc_auc': np.float64(0.729), 'NNS': 36.6, 'best_params': OrderedDict([('C', 0.0001060259447083

In [24]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_acs_12_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.769, 'precision': 0.042, 'sensitivity': 0.571, 'f1_score': 0.079, 'fbeta_2': 0.163, 'roc_auc': np.float64(0.684), 'NNS': 23.667, 'best_params': OrderedDict([('C', 0.0006281757539413166), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.787, 'precision': 0.035, 'sensitivity': 0.45, 'f1_score': 0.065, 'fbeta_2': 0.134, 'roc_auc': np.float64(0.78), 'NNS': 28.556, 'best_params': OrderedDict([('C', 0.00014481851721830116), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.784, 'precision': 0.034, 'sensitivity': 0.45, 'f1_score': 0.064, 'fbeta_2': 0.132, 'roc_auc': np.float64(0.683), 'NNS': 29.0, 'best_params': OrderedDict([('C', 0.00015362328497863813), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.787, 'precision': 0.035, 'sensitivity': 0.45, 'f1_score': 0.065, 'fbeta_2': 0.133, 'roc_auc': np.float64(0.714), 'NNS': 28.667, 'best_params': OrderedDict([('C', 0.0001

In [25]:
results = run_cv_with_sampling(
    X_full=data_static,
    target_cols=targets,
    target_name="y_acs_24_months",
    optimizer_map=optimizer_map,
    model_name="lr",
    sampling="undersample",
    n_splits=5,
    random_state=42,
    explainer="none"
)


=== Technique: UNDERSAMPLE ===
{'fold': 1, 'accuracy': 0.698, 'precision': 0.055, 'sensitivity': 0.677, 'f1_score': 0.102, 'fbeta_2': 0.209, 'roc_auc': np.float64(0.777), 'NNS': 18.048, 'best_params': OrderedDict([('C', 0.0002879036169324249), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 2, 'accuracy': 0.7, 'precision': 0.051, 'sensitivity': 0.633, 'f1_score': 0.094, 'fbeta_2': 0.193, 'roc_auc': np.float64(0.732), 'NNS': 19.632, 'best_params': OrderedDict([('C', 0.000182942508261182), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 3, 'accuracy': 0.701, 'precision': 0.046, 'sensitivity': 0.548, 'f1_score': 0.085, 'fbeta_2': 0.173, 'roc_auc': np.float64(0.655), 'NNS': 21.588, 'best_params': OrderedDict([('C', 0.00030409508650245244), ('penalty', 'l2'), ('solver', 'liblinear')])}
{'fold': 4, 'accuracy': 0.694, 'precision': 0.048, 'sensitivity': 0.581, 'f1_score': 0.088, 'fbeta_2': 0.179, 'roc_auc': np.float64(0.694), 'NNS': 21.0, 'best_params': OrderedDict([('C', 0.0002