# ACP Project - Predicting Critical Events 

## Setup

In [None]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 300)

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})

import shap
shap.initjs()

%load_ext autoreload
%autoreload 1

In [None]:
from utils.evaluation import f2_score, METRICS, evaluate, evaluate_from_pred, with_sampling_strategies, spotCheckCV, spotCheckDatasets, F2TabNet
from utils.isolation_forest_wrapper import IsolationForestWrapper
%aimport utils.evaluation

In [None]:
class Notebook:
    """Namespace for notebook-wide settings and for objects shared between cells."""

    # Folder that figures and HTML plots are saved into
    IMAGE_DIR = "images"
    # Outcome column name handed to the `xy(outcome=...)` calls
    OUTCOME = 'CriticalEvent'
    # Registries keyed by model name; later cells assign into them
    MODELS = dict()
    EXPLAINERS = dict()


## Summary of results (*pre-tuning*)

See following sections for detailed results on each model, and the improvement achieved after tuning model parameters.

In [None]:
# Stored pre-tuning results: drop the dataset column, index by model name and
# rank by test-set F2 score (best first)
(
    pd.read_csv("old_code/ml_test_4/results.csv")
    .drop("dataset", axis=1)
    .set_index("model")
    .sort_values(by="test_F2 Score", ascending=False)
)



## Test Setup

### Thresholding/Tuning

In [None]:
from sklearn.metrics import precision_recall_curve

def get_threshold(y_train, y_pred_proba, target=0.85):
    """Pick the decision threshold whose recall is closest to ``target``.

    Parameters
    ----------
    y_train : array-like
        True binary labels of the samples that were scored.
    y_pred_proba : array-like
        Positive-class score of each sample, aligned with ``y_train``.
    target : float, default 0.85
        Recall that the returned threshold should approach.

    Returns
    -------
    The entry of the ``thresholds`` array produced by
    ``precision_recall_curve`` whose recall is nearest to ``target``.
    """
    _, recall, thresholds = precision_recall_curve(y_train, y_pred_proba)

    # precision_recall_curve appends a final (precision=1, recall=0) point that
    # has no matching threshold, so `recall` is one element longer than
    # `thresholds`. Leave that point out, otherwise the argmin could index past
    # the end of `thresholds`.
    candidate_recall = recall[:-1]

    return thresholds[np.abs(candidate_recall - target).argmin()]

### Dataset

Different variations on this initial, pre-processed version will be tested throughout.

In [None]:
from dataset import SCIData, SCICols
%aimport dataset

# Not executed - alternative load chain left here as a comment:
# SCIData.load('data/sci.h5').clean_all().filter_vague_diagnoses().derive_readmission().omit_vbg()
sci = (
    SCIData.load('data/sci_processed.h5')
    .fix_readmissionband()
    .derive_critical_event(within=2)
)

In [None]:
from sklearn.model_selection import train_test_split

# Cohort used for the main train/test split: the two band columns are dropped
# and the SCIData steps below are applied in this order
scii = (
    sci.omit_redundant()
    .drop(["ReadmissionBand", "AgeBand"], axis=1)
    .omit_ae()
    .raw_news()
    .mandate_news()
    .mandate_blood()
    .augment_hsmr()
)

# 67/33 split stratified on the outcome column; each part is wrapped in SCIData again
sci_train, sci_test = (
    SCIData(part)
    for part in train_test_split(
        scii, test_size=0.33, random_state=42, stratify=scii[Notebook.OUTCOME]
    )
)

# NOTE(review): the comment that used to sit here spoke of dropping HSMR_15,
# but nothing in this cell drops a column of that name - confirm whether a step is missing.
# The same encoding and xy() arguments are applied to the train part, then the test part.
(X_train, y_train), (X_test, y_test) = (
    part.encode_ccs_onehot().xy(
        outcome=Notebook.OUTCOME, ordinal_encoding=True, dropna=False, fillna=True
    )
    for part in (sci_train, sci_test)
)

def drop_exclusive_cols(X1, X2):
    """Drop, in place, every column that appears in only one of the two frames.

    Both frames are modified directly; nothing is returned.
    """
    only_in_one = set(X1.columns).symmetric_difference(X2.columns)
    for frame in (X1, X2):
        # errors='ignore': each exclusive column exists in just one of the frames
        frame.drop(only_in_one, axis=1, errors='ignore', inplace=True)

# Keep only the columns present in both feature matrices (both frames are modified in place)
drop_exclusive_cols(X_train, X_test)

In [None]:
# Base cohort shared by every variant below (note: this rebinds `scii`, which
# the previous cell had built with the NEWS/blood mandates already applied)
scii = (
    sci.omit_redundant()
    .drop(["ReadmissionBand", "AgeBand"], axis=1)
    .omit_ae()
    .raw_news()
)

# One `xy(...)` result per pre-processing variant, keyed by a descriptive label.
# The variants differ in whether NEWS / blood columns are mandated, imputed or
# left as they are, and in how diagnoses are represented.
datasets = {
    "Mandated vitals, One-hot diagnoses": (
        scii.mandate_news()
        .mandate_blood()
        .augment_hsmr()
        .encode_ccs_onehot()
        .xy(outcome=Notebook.OUTCOME, ordinal_encoding=True, dropna=True)
    ),
    "Mandated vitals, Categorical diagnoses (main only)": (
        scii.mandate_news()
        .mandate_blood()
        .augment_hsmr()
        .drop(SCICols.diagnoses[1:], axis=1)  # keep only the first diagnosis column
        .xy(outcome=Notebook.OUTCOME, ordinal_encoding=True, dropna=True)
    ),
    "Mandated vitals, Categorical diagnoses (with missing)": (
        scii.mandate_news()
        .mandate_blood()
        .augment_hsmr()
        .drop(SCICols.diagnoses[1:], axis=1)
        .xy(outcome=Notebook.OUTCOME, ordinal_encoding=True, fillna=True)
    ),
    "Imputed vitals": (
        scii.impute_news()
        .impute_blood()
        .augment_hsmr()
        .encode_ccs_onehot()
        .xy(outcome=Notebook.OUTCOME, ordinal_encoding=True, dropna=True)
    ),
    "Missing NEWS, imputed blood": (
        scii.augment_hsmr()
        .impute_blood()
        .encode_ccs_onehot()
        .mandate_diagnoses()
        .xy(outcome=Notebook.OUTCOME, ordinal_encoding=True, fillna=True)
    ),
    "Missing vitals": (
        # Neither NEWS nor blood columns are mandated or imputed here. This entry
        # used to call impute_blood() as well, which made it identical to
        # "Missing NEWS, imputed blood" above.
        scii.augment_hsmr()
        .encode_ccs_onehot()
        .mandate_diagnoses()
        .xy(outcome=Notebook.OUTCOME, ordinal_encoding=True, fillna=True)
    ),
}


## Logistic Regression 

### NEWS only
Logistic regression (L2 penalty, by default) slightly beats the baseline NEWS model, which had AUC `0.807` for this outcome, but doesn't beat the original study score of `0.873`.

Balanced variant completely flips the precision vs. recall relationship, so it does a better job of minimising false-negatives. However, the final values are still low.

In [None]:
from sklearn.linear_model import LogisticRegression

# NEWS-only feature sets: rows are mandated on the listed columns and only
# those columns are used as features
datasets_lr = {
    label: sci.mandate(columns).xy(outcome=Notebook.OUTCOME, x=columns, dtype=float)
    for label, columns in (
        ("Raw NEWS", SCICols.news_data_raw),
        ("Scored NEWS", SCICols.news_data_scored),
    )
}

# Plain vs. class-weighted logistic regression, both with a fixed seed
models = {
    "LR": LogisticRegression(random_state=42, max_iter=1000),
    "LR Balanced": LogisticRegression(
        random_state=42, max_iter=1000, class_weight="balanced"
    ),
}

spotCheckDatasets(models, datasets_lr)


In [None]:
# Spot check on the "Raw NEWS" dataset: the plain model next to the entries that
# with_sampling_strategies returns for an identically configured estimator.
# (spotCheckCV / with_sampling_strategies come from utils.evaluation; their
# internals are not visible here.)
spotCheckCV(
    {
        "LR": LogisticRegression(max_iter=1000, random_state=42),
        **with_sampling_strategies(
            LogisticRegression(max_iter=1000, random_state=42), "LR"
        ),
    },
    *datasets_lr["Raw NEWS"]
)



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Raw NEWS features and the outcome, restricted to rows where those columns are mandated
X_news, y_news = sci.mandate(SCICols.news_data_raw).xy(
    outcome=Notebook.OUTCOME, x=SCICols.news_data_raw, dtype=float
)

# Stratify on the outcome, as the main train/test split does, so both parts
# keep the same proportion of positive cases
X_train_news, X_test_news, y_train_news, y_test_news = train_test_split(
    X_news,
    y_news,
    test_size=0.33,
    random_state=42,
    stratify=y_news,
)

# Class-weighted logistic regression fitted on the training part, scored on the held-out part
evaluate(
    LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42).fit(
        X_train_news, y_train_news
    ),
    X_test_news,
    y_test_news,
)


#### Thresholding

The baseline model's pre-defined risk ratio (NEWS score $\geq 7$) yields its own precision and recall "ratio". I.e., it balances false-positives and negatives in a certain way. We can emulate this tradeoff by adjusting the class threshold in the regression model. 

In [None]:
from sklearn.metrics import precision_recall_curve, precision_score, recall_score


def get_news_target_ratio(X, y):
    """Return precision / recall of the fixed rule ``c_NEWS_score >= 7``.

    The rule is scored against ``y``; precision and recall are also printed.
    """
    news_alert = X.c_NEWS_score >= 7
    p, r = precision_score(y, news_alert), recall_score(y, news_alert)
    print(f"NEWS Precision={p:.5f}, Recall={r:.5f}")
    return p / r


# Get the "target" ratio of precision/recall that the NEWS threshold produces
target_ratio = get_news_target_ratio(
    *sci.mandate(SCICols.news_data_raw).xy(outcome=Notebook.OUTCOME)
)

# Train the LR model
model = LogisticRegression(class_weight="balanced", random_state=42, max_iter=1000).fit(
    X_train_news, y_train_news
)

# Get candidate thresholds from the model, and find the one that gives the ratio closest to the target
precision, recall, thresholds = precision_recall_curve(
    y_train_news, model.predict_proba(X_train_news)[:, 1]
)
# `precision` and `recall` carry one trailing point (precision=1, recall=0) that
# has no entry in `thresholds`; slice it off so the argmin is always a valid
# index into `thresholds`. Where recall is 0 the ratio is taken as 1.
closest = thresholds[
    np.abs(
        np.divide(
            precision[:-1],
            recall[:-1],
            out=np.ones_like(recall[:-1]),
            where=recall[:-1] != 0,
        )
        - target_ratio
    ).argmin()
]



On the test set, this gives much improved F2, while the AUC remains consistent (slightly better than baseline model).

In [None]:
# Score the held-out NEWS rows, then label as positive every row whose
# positive-class probability exceeds the chosen threshold
y_pred_proba = model.predict_proba(X_test_news)
y_pred = (y_pred_proba[:, 1] > closest).astype(int)

# Metrics from the hard labels plus the positive-class probabilities
evaluate_from_pred(y_test_news, y_pred, y_pred_proba[:, 1])


### Dataset Variants

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Three logistic-regression configurations: default, class-weighted, and a
# scaled elastic-net model fitted with the saga solver
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Logistic Regression (balanced)": LogisticRegression(
        max_iter=1000, class_weight="balanced"
    ),
    "Logistic Regression (saga)": Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            (
                "clf",
                LogisticRegression(
                    max_iter=1000,
                    class_weight="balanced",
                    solver="saga",
                    penalty="elasticnet",
                    l1_ratio=0.5,
                ),
            ),
        ]
    ),
}

# Compare across the full set of dataset variants. This used to pass
# `datasets_lr`, which only repeats the NEWS-only comparison from the
# "NEWS only" section.
spotCheckDatasets(models, datasets)


### Resampling

In [None]:
# Resampling comparison on a single dataset variant
dataset = datasets["Mandated vitals, One-hot diagnoses"]
# First element returned by describe_categories() on the feature frame; handed to
# with_sampling_strategies below (presumably the categorical column positions,
# going by the variable name - verify against SCIData)
categorical_cols_idx = SCIData(dataset[0]).describe_categories()[0]
spotCheckCV(
    {
        # The two un-resampled models
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Logistic Regression (balanced)": LogisticRegression(
            max_iter=1000, class_weight="balanced"
        ),
        # Plus whatever entries with_sampling_strategies builds for each of them
        **with_sampling_strategies(
            LogisticRegression(max_iter=1000),
            "Logistic Regression",
            categorical_cols_idx,
        ),
        **with_sampling_strategies(
            LogisticRegression(max_iter=1000, class_weight="balanced"),
            "Logistic Regression (balanced)",
            categorical_cols_idx,
        ),
    },
    *dataset
)



### Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Class-weighted logistic regression fitted on the main training split
model = LogisticRegression(class_weight="balanced", max_iter=1000)
model.fit(X_train, y_train)

# Score on the held-out split and save the evaluation figure
evaluate(
    model,
    X_test,
    y_test,
    plot_title="Logistic Regression (non-tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_logistic_regression.png",
)

# Register the fitted model for the later thresholding and comparison cells
Notebook.MODELS["Logistic Regression"] = model


#### Thresholding

In [None]:
model = Notebook.MODELS["Logistic Regression"]

# Threshold chosen on the training split (get_threshold's default recall target)
threshold = get_threshold(y_train, model.predict_proba(X_train)[:, 1])

# Apply it to the positive-class probabilities of the test split
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba > threshold).astype(int)

# Scores and figure for the thresholded predictions
evaluate_from_pred(
    y_test,
    y_pred,
    y_pred_proba,
    plot_title="Logistic Regression (tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_thresholded_logistic_regression.png",
)



### Explanations

In [None]:
import shap

# SHAP explainer for the logistic regression held in `model`, with X_train passed
# as the explainer's data argument; values are computed for the test rows
explainer = shap.LinearExplainer(model, X_train)
shap_values = explainer(X_test)
# Stored so the "Comparisons" section can reuse the values
Notebook.EXPLAINERS["Logistic Regression"] = (explainer, shap_values)

# show=False so a title can be added and the figure saved afterwards
fig = shap.plots.beeswarm(shap_values, max_display=30, show=False)
plt.title("Logistic Regression")
plt.savefig(f"{Notebook.IMAGE_DIR}/shap_swarm_logistic_regression.png", bbox_inches="tight")


In [None]:
# Position of the last test row for which model.predict returns a non-zero label
# (raises IndexError if the model predicts no positives)
positive_class_idx = np.where(model.predict(X_test))[0][-1]
fig = shap.plots.force(shap_values[positive_class_idx])

# Save the plot as HTML, then leave `fig` as the cell's displayed value
shap.save_html(f"{Notebook.IMAGE_DIR}/force_plot_logistic_regression.html", fig)
fig


## XGBoost

Achieves best-in-class performance (nearly matched by LightGBM). Performs best on the dataset variant with mandated vitals data and one-hot encoded diagnoses with random undersampling to combat the class imbalance.

### Dataset Variants

In [None]:
# Dataset variants for the gradient-boosting models. Unlike `datasets`, these
# xy() calls do not request ordinal encoding, and most do not drop or fill
# missing values.
datasets_xgb = {
    "Mandated vitals, One-hot diagnoses": (
        scii.mandate_news()
        .mandate_blood()
        .augment_hsmr()
        .encode_ccs_onehot()
        .xy(outcome=Notebook.OUTCOME, dropna=True)
    ),
    "Mandated vitals, Categorical diagnoses": (
        scii.mandate_news()
        .mandate_blood()
        .impute_blood()
        .augment_hsmr()
        .xy(outcome=Notebook.OUTCOME)
    ),
    "Imputed vitals, One-hot diagnoses": (
        scii.impute_news()
        .impute_blood()
        .augment_hsmr()
        .encode_ccs_onehot()
        .xy(outcome=Notebook.OUTCOME)
    ),
    "Mandated NEWS, imputed blood, One-hot diagnoses": (
        scii.mandate_news()
        .impute_blood()
        .augment_hsmr()
        .encode_ccs_onehot()
        .xy(outcome=Notebook.OUTCOME)
    ),
    "Missing NEWS, imputed blood, One-hot diagnoses": (
        scii.augment_hsmr()
        .impute_blood()
        .encode_ccs_onehot()
        .xy(outcome=Notebook.OUTCOME)
    ),
    "Missing blood & NEWS, One-hot diagnoses": (
        # Blood columns are left un-imputed here. This entry used to call
        # impute_blood() too, which made it identical to the variant above.
        scii.augment_hsmr()
        .encode_ccs_onehot()
        .xy(outcome=Notebook.OUTCOME)
    ),
}


In [None]:
from xgboost import XGBClassifier

# Per-dataset class weight: total number of rows divided by the number of positives
xgb_weight_lambda = lambda X, y: dict(scale_pos_weight=y.shape[0] / y.sum())

# Keys must equal the names used in `models` below. The first key used to be
# "XGB Balanced", which matches no model, so "Approx XGB Balanced" never
# received its scale_pos_weight.
xgb_set_params = {
    "Approx XGB Balanced": xgb_weight_lambda,
    "Hist XGB Balanced": xgb_weight_lambda,
}

# Two tree methods, each with a fixed weight of 1 and a "Balanced" twin whose
# weight is supplied per dataset through xgb_set_params
models = {
    "Approx XGB": XGBClassifier(
        tree_method="approx", enable_categorical=True, scale_pos_weight=1
    ),
    "Approx XGB Balanced": XGBClassifier(
        tree_method="approx", enable_categorical=True,
    ),
    "Hist XGB": XGBClassifier(
        tree_method="hist", enable_categorical=True, scale_pos_weight=1
    ),
    "Hist XGB Balanced": XGBClassifier(tree_method="hist", enable_categorical=True,),
}

spotCheckDatasets(datasets=datasets_xgb, models=models, set_params=xgb_set_params)


### Resampling

In [None]:
# Resampling comparison on one dataset variant
X, y = datasets_xgb["Mandated NEWS, imputed blood, One-hot diagnoses"]
# First element returned by describe_categories(); passed to with_sampling_strategies below
categorical_cols_idx = X.describe_categories()[0]
# Categories are ordinal-encoded and remaining missing values replaced by -1 before the check
X = X.ordinal_encode_categories().fillna(-1)
# Class weight: total rows divided by the number of positive rows
scale_pos_weight = y.shape[0] / y.sum()
spotCheckCV(
    {
        # Un-resampled reference model
        "XGB": XGBClassifier(
            tree_method="hist",
            enable_categorical=True,
            scale_pos_weight=scale_pos_weight,
        ),
        # Entries built by with_sampling_strategies from an identically configured estimator
        **with_sampling_strategies(
            XGBClassifier(
                tree_method="hist",
                enable_categorical=True,
                scale_pos_weight=scale_pos_weight,
            ),
            "XGB",
            categorical_cols_idx,
        ),
    },
    X,
    y,
)



### Hyperparameters

In [None]:
# Search space for the pipeline step named "XGB" (hence the "XGB__" prefix on every key)
param_grid = dict(
    XGB__max_depth=np.arange(10, 25),
    XGB__learning_rate=[0.01, 0.025, 0.05, 0.085, 0.1, 0.15, 0.2, 0.25, 0.3],
    XGB__subsample=np.arange(0.3, 1.0, 0.05),
    XGB__colsample_bytree=np.arange(0.1, 1.0, 0.05),
    XGB__colsample_bylevel=np.arange(0.6, 1.0, 0.05),
    XGB__n_estimators=np.arange(50, 250, 10),
    XGB__scale_pos_weight=np.arange(1, 60, 2),
)


### Evaluation

In [None]:
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Random undersampling followed by XGBoost with fixed hyperparameters.
# The sampler is seeded so the retained majority-class rows, and therefore the
# fitted model and its scores, are the same on every run.
model = ImbPipeline(
    steps=[
        (
            "undersampling",
            RandomUnderSampler(sampling_strategy=0.1, random_state=42),
        ),
        (
            "XGB",
            XGBClassifier(
                tree_method="approx",
                enable_categorical=True,
                subsample=0.85,
                scale_pos_weight=31,
                n_estimators=140,
                max_depth=13,
                learning_rate=0.05,
                colsample_bytree=0.7,
                colsample_bylevel=0.9,
            ),
        ),
    ]
).fit(X_train, y_train)

# Score on the held-out split and save the evaluation figure
evaluate(
    model, X_test, y_test, "XGBoost (non-tuned)", save=f"{Notebook.IMAGE_DIR}/eval_xgboost.png"
)

# Register the fitted pipeline for the later thresholding and comparison cells
Notebook.MODELS["XGBoost"] = model



#### Thresholding

We can further minimise false-negatives (at the expense of FPR) by altering the decision threshold manually. Below, the threshold is chosen with `get_threshold`, whose default target is a training recall/sensitivity of 0.85.

In [None]:
model = Notebook.MODELS["XGBoost"]

# Positive-class probability threshold whose training recall is closest to
# get_threshold's default target
threshold = get_threshold(
    y_train,
    model.predict_proba(X_train)[:, 1],
)

# Create predictions on the test set, using this new threshold.
# The threshold is on the positive-class probability, so it is compared with
# that probability directly (as in the logistic-regression section) rather
# than with its complement `1 - threshold`.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = np.where(y_pred_proba > threshold, 1, 0)

# Produce scores
evaluate_from_pred(
    y_test,
    y_pred,
    y_pred_proba,
    plot_title="XGBoost (tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_thresholded_xgboost.png",
)



### Explanations

In [None]:
# Feature positions ordered from highest to lowest importance of the "XGB" pipeline step
sorted_idx = model["XGB"].feature_importances_.argsort()[::-1]
# Horizontal bars: importance on x, column name on y
fig = sns.barplot(
    x=model["XGB"].feature_importances_[sorted_idx],
    y=X_train.columns[sorted_idx],
    color="deepskyblue",
)
fig.set_title("XGBoost - Global feature importance (gain)")
plt.savefig(f"{Notebook.IMAGE_DIR}/global_weights_xgboost.png", bbox_inches="tight")



In [None]:
import shap

# Explain the fitted "XGB" step of the pipeline (not the whole pipeline) on the test rows
explainer = shap.TreeExplainer(model["XGB"])
shap_values = explainer(X_test)
# Stored so the "Comparisons" section can reuse the values
Notebook.EXPLAINERS["XGBoost"] = (explainer, shap_values)

# show=False so a title can be added and the figure saved afterwards
fig = shap.plots.beeswarm(shap_values, max_display=30, show=False)
plt.title("XGBoost")
plt.savefig(f"{Notebook.IMAGE_DIR}/shap_swarm_xgboost.png", bbox_inches="tight")


In [None]:
# Position of the last test row for which model.predict returns a non-zero label
# (raises IndexError if the model predicts no positives)
positive_class_idx = np.where(model.predict(X_test))[0][-1]
fig = shap.plots.force(shap_values[positive_class_idx])

# Save the plot as HTML, then leave `fig` as the cell's displayed value
shap.save_html(f"{Notebook.IMAGE_DIR}/force_plot_xgboost.html", fig)
fig


## Random Forest

### Dataset Variants

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Default random forest next to a variant with class_weight="balanced_subsample".
# Neither sets random_state, so results can differ between runs.
models = {
    "Random Forest": RandomForestClassifier(),
    "Random Forest (balanced)": RandomForestClassifier(
        class_weight="balanced_subsample"
    ),
}


In [None]:
# Run both forests over every variant in `datasets`
spotCheckDatasets(datasets=datasets, models=models)


### Resampling

In [None]:
# Resampling comparison on a single dataset variant
dataset = datasets["Mandated vitals, One-hot diagnoses"]
# First element returned by describe_categories() on the feature frame; passed to
# with_sampling_strategies below
categorical_cols_idx = SCIData(dataset[0]).describe_categories()[0]
spotCheckCV(
    {
        # Un-resampled reference model
        "Random Forest": RandomForestClassifier(),
        # Entries built by with_sampling_strategies from a default forest
        **with_sampling_strategies(
            RandomForestClassifier(), "Random Forest", categorical_cols_idx
        ),
    },
    *dataset
)



### Evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Random undersampling followed by a default random forest.
# Both steps are seeded so the retained rows, the fitted trees and the
# resulting scores are the same on every run.
model = ImbPipeline(
    steps=[
        (
            "undersampling",
            RandomUnderSampler(sampling_strategy=0.1, random_state=42),
        ),
        ("randomforest", RandomForestClassifier(random_state=42)),
    ]
).fit(X_train, y_train)

# Register the fitted pipeline for the later thresholding and comparison cells
Notebook.MODELS["Random Forest"] = model

# Score on the held-out split and save the evaluation figure
evaluate(
    model,
    X_test,
    y_test,
    plot_title="Random Forest (non-tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_random_forest.png",
)



#### Thresholding

In [None]:
model = Notebook.MODELS["Random Forest"]

# Positive-class probability threshold whose training recall is closest to
# get_threshold's default target
threshold = get_threshold(
    y_train,
    model.predict_proba(X_train)[:, 1],
)

# Create predictions on the test set, using this new threshold.
# The threshold is on the positive-class probability, so it is compared with
# that probability directly (as in the logistic-regression section) rather
# than with its complement `1 - threshold`.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = np.where(y_pred_proba > threshold, 1, 0)

# Produce scores
evaluate_from_pred(
    y_test,
    y_pred,
    y_pred_proba,
    plot_title="Random Forest (tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_thresholded_random_forest.png",
)



### Explanations

In [None]:
import shap

# Explain the fitted "randomforest" step of the pipeline on the test rows
explainer = shap.TreeExplainer(model["randomforest"])
shap_values = explainer(X_test)
# Stored un-sliced; the "Comparisons" section applies the same [:, :, 1] slice itself
Notebook.EXPLAINERS["Random Forest"] = (explainer, shap_values)

# [:, :, 1] selects index 1 on the last axis - presumably the positive class; verify
# against the installed shap version
fig = shap.plots.beeswarm(shap_values[:, :, 1], max_display=30, show=False)
plt.title("Random Forest")
plt.savefig(f"{Notebook.IMAGE_DIR}/shap_swarm_random_forest.png", bbox_inches="tight")


In [None]:
# Position of the last test row for which model.predict returns a non-zero label
# (raises IndexError if the model predicts no positives)
positive_class_idx = np.where(model.predict(X_test))[0][-1]
# Same last-axis index 1 as used for the beeswarm plot
fig = shap.plots.force(shap_values[positive_class_idx, :, 1])

# Save the plot as HTML, then leave `fig` as the cell's displayed value
shap.save_html(f"{Notebook.IMAGE_DIR}/force_plot_random_forest.html", fig)
fig


## Isolation Forest

Unlike other models tested, this is a one-class model that detects outliers instead of predicting outcomes. We test the hypothesis that "abnormal" values associated with the tracked outcome will be outliers compared to the normal ranges.

### Dataset Variants

In [None]:
# Two wrappers built identically; the second gets a dataset-specific parameter
# through isolation_forest_set_params (matched by the model's name)
models = {
    "Isolation Forest": IsolationForestWrapper(),
    "Isolation Forest (contamination)": IsolationForestWrapper(),
}
# contamination = number of positive labels / number of rows in the dataset
isolation_forest_set_params = {
    "Isolation Forest (contamination)": lambda X, y: dict(
        contamination=(y.sum()) / y.shape[0]
    )
}


In [None]:
# Run both isolation-forest variants over every variant in `datasets`
spotCheckDatasets(
    datasets=datasets, models=models, set_params=isolation_forest_set_params
)


### Evaluation

In [None]:
from sklearn.ensemble import IsolationForest

# Mandated vitals, Categorical diagnoses (main only)
# Same train/test parts as the main split, keeping only the first diagnosis
# column; identical xy() arguments for the train part, then the test part
(X_train_if, y_train_if), (X_test_if, y_test_if) = (
    part.drop(SCICols.diagnoses[1:], axis=1).xy(
        outcome=Notebook.OUTCOME, ordinal_encoding=True, fillna=True
    )
    for part in (sci_train, sci_test)
)
drop_exclusive_cols(X_train_if, X_test_if)

# Fit on the rows selected by the negated training labels only,
# i.e. without positive-labelled instances
model = IsolationForestWrapper().fit(X_train_if[~y_train_if].to_numpy())

# Register the fitted model for the later thresholding and comparison cells
Notebook.MODELS["Isolation Forest"] = model

# Score on the held-out split (as a plain array) and save the evaluation figure
evaluate(
    model,
    X_test_if.to_numpy(),
    y_test_if,
    plot_title="Isolation Forest (non-tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_isolation_forest.png",
)


#### Thresholding

In [None]:
model = Notebook.MODELS["Isolation Forest"]

# Shift the training scores so the smallest one is 0, and remember the offset
y_pred_proba_train = model.decision_function(X_train_if)
score_offset = y_pred_proba_train.min()
y_pred_proba_train = y_pred_proba_train - score_offset

threshold = get_threshold(
    y_train_if,
    y_pred_proba_train,
)

# Create predictions on the test set, using this new threshold.
# The test scores are shifted by the *training* offset: the threshold lives on
# the shifted training scale, so shifting by the test minimum (as this cell
# used to) would compare it against a different scale.
y_pred_proba = model.decision_function(X_test_if)
y_pred = np.where(y_pred_proba - score_offset > threshold, 1, 0)

# Scores and figure for the thresholded predictions
evaluate_from_pred(
    y_test_if,
    y_pred,
    y_pred_proba,
    plot_title="Isolation Forest (tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_thresholded_isolation_forest.png",
)


### Explanations

In [None]:
import shap

# Explain the isolation-forest wrapper held in `model` on the test rows
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_test_if)
# Flip the sign of every SHAP value so the direction matches the other models
shap_values.values = -shap_values.values  # Fix for isolation forest's unique labelling
# Stored so the "Comparisons" section can reuse the values
Notebook.EXPLAINERS["Isolation Forest"] = (explainer, shap_values)

# show=False so a title can be added and the figure saved afterwards
fig = shap.plots.beeswarm(shap_values, max_display=30, show=False)
plt.title("Isolation Forest")
plt.savefig(f"{Notebook.IMAGE_DIR}/shap_swarm_isolation_forest.png", bbox_inches="tight")


In [None]:
# Position of the last test row for which model.predict returns a non-zero label
# (raises IndexError if the model predicts no positives)
positive_class_idx = np.where(model.predict(X_test_if.to_numpy()))[0][-1]
fig = shap.plots.force(shap_values[positive_class_idx])

# Written to its own file: this cell used to save to
# force_plot_random_forest.html and so overwrote the Random Forest plot
shap.save_html(f"{Notebook.IMAGE_DIR}/force_plot_isolation_forest.html", fig)
fig


## LightGBM

### Dataset Variants

In [None]:
from lightgbm import LGBMClassifier

# Per-dataset class weight for the "LightGBM Weighted" model:
# total number of rows divided by the number of positives
lightgbm_set_params = {
    "LightGBM Weighted": lambda X, y: dict(scale_pos_weight=y.shape[0] / y.sum())
}

# Three variants over the boosting dataset variants: default, is_unbalance=True,
# and one whose scale_pos_weight comes from lightgbm_set_params
spotCheckDatasets(
    datasets=datasets_xgb,
    models={
        "LightGBM": LGBMClassifier(metric=["l2", "auc"]),
        "LightGBM Balanced": LGBMClassifier(metric=["l2", "auc"], is_unbalance=True),
        "LightGBM Weighted": LGBMClassifier(metric=["l2", "auc"]),
    },
    set_params=lightgbm_set_params,
)



### Resampling

In [None]:
# Resampling comparison on one dataset variant
X, y = datasets_xgb["Mandated vitals, One-hot diagnoses"]
# First element returned by describe_categories(); passed to with_sampling_strategies below
categorical_cols_idx = X.describe_categories()[0]
X = X.ordinal_encode_categories()
result = spotCheckCV(
    {
        # Un-resampled reference model, weighted by total rows / positive rows
        "LightGBM": LGBMClassifier(
            metric=["l2", "auc"], scale_pos_weight=y.shape[0] / y.sum()
        ),
        # Entries built by with_sampling_strategies from an identically configured estimator
        **with_sampling_strategies(
            LGBMClassifier(metric=["l2", "auc"], scale_pos_weight=y.shape[0] / y.sum()),
            "LightGBM",
            categorical_cols_idx,
        ),
    },
    X,
    y,
)

# Show the returned results explicitly, since the call above is assigned to a name
display(result)


In [None]:
# Search space for LightGBM hyperparameters
param_grid = {
    "learning_rate": [0.01, 0.025, 0.05, 0.085, 0.1, 0.15, 0.2, 0.25, 0.3],
    "boosting_type": ["gbdt", "dart", "goss"],
    # Starts at 0.05 rather than 0: LightGBM requires the feature fraction to be
    # strictly positive.
    # NOTE(review): LightGBM documents "sub_feature" and "colsample_bytree" as
    # aliases of the same parameter (feature_fraction), so searching both is
    # redundant - confirm which one should stay.
    "sub_feature": np.arange(0.05, 1.0, 0.05),
    "num_leaves": np.arange(20, 300, 20),
    "min_data": np.arange(10, 100, 10),
    "max_depth": np.arange(5, 200, 20),
    "scale_pos_weight": np.arange(1, 60, 2),
    "colsample_bytree": np.arange(0.1, 1.0, 0.05),
    "subsample": np.arange(0.3, 1.0, 0.05),
}


### Evaluation

In [None]:
from lightgbm import LGBMClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# LightGBM-specific features: one-hot diagnoses and filled missing values, with
# no ordinal encoding requested from xy()
(X_train_lgbm, y_train_lgbm), (X_test_lgbm, y_test_lgbm) = (
    sci_train.encode_ccs_onehot().xy(outcome=Notebook.OUTCOME, fillna=True),
    sci_test.encode_ccs_onehot().xy(outcome=Notebook.OUTCOME, fillna=True),
)

# Keep only the columns shared by both feature matrices (modified in place)
drop_exclusive_cols(X_train_lgbm, X_test_lgbm)

# Random undersampling followed by LightGBM. The sampler is seeded so the
# retained majority-class rows, and therefore the scores, are the same on every
# run. Copies are passed so the stored frames are not touched by fitting/scoring.
model = ImbPipeline(
    steps=[
        (
            "undersampling",
            RandomUnderSampler(sampling_strategy=0.1, random_state=42),
        ),
        ("lightgbm", LGBMClassifier(metric=["l2", "auc"], is_unbalance=True)),
    ]
).fit(X_train_lgbm.copy(), y_train_lgbm)

# Register the fitted pipeline for the later thresholding and comparison cells
Notebook.MODELS["LightGBM"] = model

# Score on the held-out split and save the evaluation figure
evaluate(
    model,
    X_test_lgbm.copy(),
    y_test_lgbm,
    plot_title="LightGBM (non-tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_lightgbm.png",
)


#### Thresholding
We adjust the classification threshold to further minimise false-negatives. Here we set it such that training recall/sensitivity is 0.85.

In [None]:
model = Notebook.MODELS["LightGBM"]

# Choose the threshold on the TRAINING split. It used to be chosen on the test
# labels and test predictions, which leaks test information into the reported
# scores and contradicts the "training recall" described for this section.
threshold = get_threshold(
    y_train_lgbm,
    model.predict_proba(X_train_lgbm)[:, 1],
)

# Create predictions on the test set, using this new threshold
y_pred_proba = model.predict_proba(X_test_lgbm)[:, 1]
y_pred = np.where(y_pred_proba > threshold, 1, 0)

# Scores and figure for the thresholded predictions
evaluate_from_pred(
    y_test_lgbm,
    y_pred,
    y_pred_proba,
    plot_title="LightGBM (tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_thresholded_lightgbm.png",
)


### Explanations

In [None]:
import lightgbm as lgb

# Request gain-based importance explicitly so the plot matches its title; left
# at its default, plot_importance follows the estimator's importance_type,
# which this notebook never sets to "gain"
fig = lgb.plot_importance(model["lightgbm"], importance_type="gain")
fig.set_title("LightGBM - Global feature importance (gain)")
plt.savefig(f"{Notebook.IMAGE_DIR}/global_weights_lightgbm.png", bbox_inches="tight")



In [None]:
import shap

# Explain the fitted "lightgbm" step on the LightGBM-specific test features.
# This used to explain `X_test`, the differently encoded matrix built for the
# other models, while the force-plot cell picks its row from X_test_lgbm.
explainer = shap.TreeExplainer(model["lightgbm"])
shap_values = explainer(X_test_lgbm)
# Stored un-sliced; the "Comparisons" section applies the same [:, :, 1] slice itself
Notebook.EXPLAINERS["LightGBM"] = (explainer, shap_values)

# [:, :, 1] selects index 1 on the last axis - presumably the positive class; verify
# against the installed shap version
fig = shap.plots.beeswarm(shap_values[:, :, 1], max_display=30, show=False)
plt.title("LightGBM")
plt.savefig(f"{Notebook.IMAGE_DIR}/shap_swarm_lightgbm.png", bbox_inches="tight")


In [None]:
# Position of the last row of X_test_lgbm for which model.predict returns a non-zero label
# (raises IndexError if the model predicts no positives)
positive_class_idx = np.where(model.predict(X_test_lgbm))[0][-1]
# Last-axis index 1, as in the beeswarm plot
fig = shap.plots.force(shap_values[positive_class_idx, :, 1])

# Save the plot as HTML, then leave `fig` as the cell's displayed value
shap.save_html(f"{Notebook.IMAGE_DIR}/force_plot_lightgbm.html", fig)
fig


## TabNet

Deep-learning model for outcome prediction.

In [None]:
# The two values returned by describe_categories(); passed to TabNet below as
# cat_idxs and cat_dims
categorical_cols_idx, categorical_cols_dims = X_train.describe_categories()

# Carve a stratified validation set (33%) out of the training split, as plain arrays
(X_train_tn, X_valid_tn, y_train_tn, y_valid_tn) = train_test_split(
    X_train.to_numpy(),
    y_train.to_numpy(),
    stratify=y_train,
    test_size=0.33,
    random_state=42,
)


In [None]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier

# Constructor arguments for the TabNet classifier
tabnet_params = dict(
    n_a=24,
    n_d=24,
    # Categorical column description taken from X_train.describe_categories()
    cat_idxs=categorical_cols_idx,
    cat_dims=categorical_cols_dims,
    cat_emb_dim=1,
    # Adam with a step learning-rate schedule (multiply by 0.7 every 50 steps)
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=0.1),
    scheduler_params=dict(step_size=50, gamma=0.7),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type="entmax",
    verbose=0,
)

model = TabNetClassifier(**tabnet_params)

# Fit on the TabNet training part; both the training and validation parts are
# evaluated with the F2TabNet metric from utils.evaluation
model.fit(
    X_train=X_train_tn,
    y_train=y_train_tn,
    eval_set=[(X_train_tn, y_train_tn), (X_valid_tn, y_valid_tn),],
    eval_name=["train", "valid"],
    eval_metric=[F2TabNet],
    max_epochs=300,
    patience=50,
    batch_size=512,
    virtual_batch_size=128,
    num_workers=0,
    # NOTE(review): in pytorch-tabnet, weights=1 is documented as automatic
    # class-balanced sampling - confirm against the installed version
    weights=1,
    drop_last=False,
)

# Register the fitted model for the later thresholding and comparison cells
Notebook.MODELS["TabNet"] = model


### Evaluation

In [None]:
# Score TabNet on the held-out split (passed as a plain array) and save the evaluation figure
evaluate(
    model, X_test.to_numpy(), y_test, "TabNet (non-tuned)", save=f"{Notebook.IMAGE_DIR}/eval_tabnet.png"
)



#### Thresholding

In [None]:
model = Notebook.MODELS["TabNet"]

# Choose the threshold on the TRAINING split, as the other model sections do.
# It used to be chosen on the test labels and test predictions, which leaks
# test information into the reported scores.
threshold = get_threshold(
    y_train,
    model.predict_proba(X_train.to_numpy())[:, 1],
    target=0.785
)

# Create predictions on the test set, using this new threshold
y_pred_proba = model.predict_proba(X_test.to_numpy())[:, 1]
y_pred = np.where(y_pred_proba > threshold, 1, 0)

# Scores and figure for the thresholded predictions
evaluate_from_pred(
    y_test,
    y_pred,
    y_pred_proba,
    plot_title="TabNet (tuned)",
    save=f"{Notebook.IMAGE_DIR}/eval_thresholded_tabnet.png",
)


### Explanations

In [None]:
# Feature positions ordered from highest to lowest TabNet importance
sorted_idx = model.feature_importances_.argsort()[::-1]
# Horizontal bars: importance on x, column name on y
fig = sns.barplot(
    x=model.feature_importances_[sorted_idx],
    y=X_train.columns[sorted_idx],
    color="deepskyblue",
)
fig.set_title("TabNet - Global feature importance")
plt.savefig(f"{Notebook.IMAGE_DIR}/global_weights_tabnet.png", bbox_inches="tight")



In [None]:
# Last test row with a non-zero prediction (IndexError if there is none)
positive_class_idx = np.where(model.predict(X_test.to_numpy()))[0][-1]

# Explain just that row; the slice keeps the input two-dimensional
explain_matrix, masks = model.explain(
    X_test.to_numpy()[positive_class_idx : positive_class_idx + 1]
)

# (column, value) pairs for the explained row, ordered by ascending value
sorted(zip(X_test.columns, explain_matrix[0]), key=lambda pair: pair[1])


## Comparisons

In [None]:
# Test features each registered model expects, keyed by its name in Notebook.MODELS
model_inputs = {
    'LightGBM': X_test_lgbm,
    'Isolation Forest': X_test_if,
    'TabNet': X_test.to_numpy(),
    'XGBoost': X_test,
    'Random Forest': X_test,
    'Logistic Regression': X_test,
}

# One column of test-set predictions per registered model
y_preds = pd.DataFrame(
    {
        name: fitted.predict(model_inputs[name])
        for name, fitted in Notebook.MODELS.items()
    }
)

In [None]:
# Index labels of the rows where every model's prediction is truthy
candidates = y_preds[y_preds.all(axis=1)].index

In [None]:
# Keep only the SHAP values from each stored (explainer, values) pair
shap_values = {
    key: shap_value for key, (explainer, shap_value) in Notebook.EXPLAINERS.items()
}
# These two were stored un-sliced; take index 1 on the last axis, as their own
# plotting cells do
shap_values['LightGBM'] = shap_values['LightGBM'][:,:,1]
shap_values['Random Forest'] = shap_values['Random Forest'][:,:,1]

In [None]:
# One force plot per model for the first candidate row
# (candidates[0] raises IndexError if no row was flagged by every model)
for modelkey, shap_value in shap_values.items():
    fig = shap.plots.force(shap_value[candidates[0]])
    # NOTE(review): the file name prefix is spelled "comaprison"; left unchanged in case
    # other material refers to these files - confirm before renaming.
    # Spaces are stripped from the model name for the file name.
    shap.save_html(f"{Notebook.IMAGE_DIR}/comaprison_force_plot_{modelkey.replace(' ','')}.html", fig)
    
    print(modelkey)
    display(fig)
    