In [60]:
import warnings, math, itertools, json, logging
from hashlib import md5

import numpy as np
import pandas as pd

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

from dataset import SCIData, SCICols
%load_ext autoreload
%autoreload 1
%aimport utils.data_profiling, dataset


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
# Show every column when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Configure seaborn in ONE call. `sns.set` is an alias of `sns.set_theme`, so a
# second call made only to pass `rc` re-applies the default palette and
# silently discards palette="colorblind".
sns.set_theme(
    style="darkgrid",
    palette="colorblind",
    rc={"figure.figsize": (11.5, 8.5), "figure.dpi": 100},
)

# Timestamped INFO-level logging for the cells below.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)


In [63]:
class Notebook:
    """Namespace for notebook-wide mutable state and feature switches."""

    # Switch checked by the model-selection cells before they run.
    RUN_MODEL_SELECTION = True

    # Memo of generated dataset variants; filled by `sci_variant_cached`.
    DATASET_CACHE = {}


In [93]:
from sklearn.model_selection import train_test_split


def sci_variant_cached(sci, **kwargs):
    """Return the training split of a preprocessed dataset variant, memoised.

    Parameters
    ----------
    sci : object exposing `preprocess_from_params(**kwargs)` returning `(X, y)`.
    **kwargs : JSON-serialisable preprocessing options, forwarded unchanged.

    Returns
    -------
    (X_train, y_train) : the 75% training portion of a fixed-seed split.
        The held-out 25% is discarded and never cached.
    """
    # The key covers the parameters AND the dataset's shape, so a subsample
    # (e.g. `sci.head(n)`) is never served an entry cached for another size.
    # Objects without a `shape` attribute fall back to a params-only key.
    shape = [int(dim) for dim in getattr(sci, "shape", ())]
    key_payload = {"params": kwargs, "shape": shape}
    key = md5(json.dumps(key_payload, sort_keys=True).encode("utf-8")).hexdigest()

    cached = Notebook.DATASET_CACHE.get(key)
    if cached is not None:
        return cached

    logging.info(f"Generating SCI variant: {kwargs}")
    X, y = sci.preprocess_from_params(**kwargs)

    # Fixed seed keeps the split identical across kernel restarts.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.25, random_state=42)
    Notebook.DATASET_CACHE[key] = (X_train, y_train)

    return X_train, y_train


In [95]:
# Search space of preprocessing options: each key maps to its allowed values.
# Assumed steps:
# - Mandate diagnoses, omit a&e text, omit VBG
possible_params = dict(
    news_columns=["impute", "omit", "mandate"],
    blood_columns=["impute", "omit", "mandate"],
    news_format=["raw", "scored"],
    diagnoses_grouping=["icd10_3code", "icd10_group", "hsmr", "shmi"],
    diagnoses_onehot=[False, True],
    categorical_encoding=[False, True],
    numerical_scaling=[False, True],
    outcome=[
        "DiedDuringStay",
        "DiedWithin30Days",
        "Readmitted",
        "CriticalCare",
        "CriticalEvent",
    ],
)


In [120]:
# some_variants = {
#     'No_missing_values': sci.mandate(SCICols.news_data_raw + SCICols.blood + SCICols.ae).augment_hsmr(onehot=True),
#     'No_missing_vitals': sci.mandate(SCICols.news_data_raw + SCICols.blood).omit_ae().augment_hsmr(onehot=True),
#     'News_only': SCIData(sci[SCICols.news_data_raw]),
#     'Vitals_only': SCIData(sci[SCICols.news_data_raw + SCICols.blood]),
#     'Vitals_and_age': SCIData(sci[SCICols.news_data_raw + SCICols.blood + SCICols.patient[:-2]]),
#    # 'Vitals_and_ae': SCIData(sci[SCICols.news_data_raw + SCICols.blood + SCICols.patient[:-2] + SCICols.ae])
# }

In [106]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline


def generate_sci(r, **kwargs):
    """Apply the requested preprocessing options to `r` and return `r.xy(...)`.

    Every option is translated into a method call on `r`; the object each call
    returns replaces `r` for the next step.

    Recognised keyword options
    --------------------------
    news_columns, blood_columns : str, optional
        Calls `r.<value>_news()` / `r.<value>_blood()` (e.g. `impute_news`).
    news_format : {"raw", "scored"}, optional
        Calls `r.raw_news()` or `r.scored_news()`; any other value is ignored.
    diagnoses_grouping : str, optional
        Calls `r.augment_<value>(onehot=diagnoses_onehot)`.
    diagnoses_onehot : bool, default False
    outcome : str, default "DiedDuringStay"
        Forwarded to `r.xy(outcome=...)`.

    `numerical_scaling` and `categorical_encoding` are accepted but NOT applied
    here; column scaling/encoding lives in `transform_sci`.

    Returns
    -------
    Whatever `r.xy(outcome=...)` returns (callers unpack it as `X, y`).
    """
    for group in ("news", "blood"):
        strategy = kwargs.get(f"{group}_columns")
        if strategy is not None:
            r = getattr(r, f"{strategy}_{group}")()

    news_format = kwargs.get("news_format")
    if news_format == "raw":
        r = r.raw_news()
    elif news_format == "scored":
        r = r.scored_news()

    grouping = kwargs.get("diagnoses_grouping")
    if grouping is not None:
        onehot = kwargs.get("diagnoses_onehot", False)
        r = getattr(r, f"augment_{grouping}")(onehot=onehot)

    # Diagnoses are always mandated, independent of the options above.
    r = r.mandate_diagnoses()
    return r.xy(outcome=kwargs.get("outcome", "DiedDuringStay"))


def transform_sci(X, **kwargs):
    """Optionally scale numeric columns and one-hot encode object columns of `X`.

    Keyword options
    ---------------
    numerical_scaling : bool, default False
        Min-max scale columns selected by `dtype_include=np.number`.
        The older spelling `numeric_scaling` is still honoured.
    categorical_encoding : bool, default False
        One-hot encode columns selected by `dtype_include=object`
        (unknown categories are ignored at transform time).
        The older spelling `categorical_onehot` is still honoured.

    Returns
    -------
    The output of fitting the column transformer on `X` and transforming it.
    Columns matched by neither selector are dropped (the column transformer's
    default `remainder`).
    """
    # Accept the key names used by the parameter grid as well as the names this
    # function originally read; previously the grid's keys were never seen, so
    # both steps always fell through to "passthrough".
    scale = kwargs.get("numerical_scaling") or kwargs.get("numeric_scaling")
    encode = kwargs.get("categorical_encoding") or kwargs.get("categorical_onehot")

    if encode:
        try:
            encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    else:
        encoder = "passthrough"

    numeric_pipeline = Pipeline(
        steps=[("scale", MinMaxScaler() if scale else "passthrough")]
    )
    categorical_pipeline = Pipeline(steps=[("one-hot", encoder)])

    ct = make_column_transformer(
        (numeric_pipeline, make_column_selector(dtype_include=np.number)),
        (categorical_pipeline, make_column_selector(dtype_include=object)),
    )

    full_processor = Pipeline(steps=[("columns", ct)])

    return full_processor.fit_transform(X)


In [114]:
# Load the processed dataset and apply the omissions/derivations shared by
# every variant generated below.
sci = (
    SCIData.load("data/sci_processed.h5")
    .omit_redundant()
    .omit_vbg()
    .omit_ae()
    .derive_critical_event()
)


In [98]:
# One concrete point of the parameter grid, used as the working example.
some_params = dict(
    news_columns="impute",
    blood_columns="impute",
    news_format="raw",
    diagnoses_grouping="hsmr",
    diagnoses_onehot=True,
    outcome="DiedDuringStay",
)
X, y = generate_sci(sci, **some_params)


In [99]:
# Count the grid's combinations without materialising them in a list.
sum(1 for _combo in makeGrid(possible_params))

2880

In [8]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    f1_score,
    fbeta_score,
    make_scorer,
)

# F-beta scorer with beta=2 (recall is weighted more heavily than precision).
f2_score = make_scorer(fbeta_score, beta=2)

# Display name -> scoring spec passed as `scoring=` to `cross_validate`.
# String values are scikit-learn's built-in scorer names.
METRICS = {
    "Accuracy": "accuracy",
    "Precision": "precision",
    "Recall": "recall",
    "AUC": "roc_auc",
    "F1 Score": "f1",
    "F2 Score": f2_score,
}


In [10]:
def makeGrid(pars_dict):
    """Lazily yield every combination of the option lists in `pars_dict`.

    Parameters
    ----------
    pars_dict : mapping of option name -> iterable of allowed values.

    Returns
    -------
    A generator of dicts, one per element of the Cartesian product of the
    value lists, each mapping every option name to one chosen value. The last
    key varies fastest. An empty mapping yields a single empty dict.
    """
    keys = list(pars_dict)
    # `itertools` is what the notebook imports; the bare name `product` was
    # never brought into scope.
    combinations = itertools.product(*pars_dict.values())
    return (dict(zip(keys, combo)) for combo in combinations)


In [22]:
from sklearn.model_selection import cross_validate


def spotCheckCV(model, X, y, cv=3, pretty=True):
    """Cross-validate `model` on every metric in `METRICS`.

    Parameters
    ----------
    model : estimator passed to `cross_validate`.
    X, y : features and target.
    cv : cross-validation spec forwarded to `cross_validate` (default 3).
    pretty : when True, display a per-metric table of the mean and standard
        deviation of the test scores across folds.

    Returns
    -------
    The raw dict returned by `cross_validate`.
    """
    scores = cross_validate(model, X, y, scoring=METRICS, cv=cv)
    if pretty:
        # Keys look like "test_<metric name>". Split only on the first
        # underscore so a metric name that itself contains "_" is kept whole.
        rows = [
            (name.split("_", 1)[1], fold_score)
            for name, fold_scores in scores.items()
            if name.startswith("test_")
            for fold_score in fold_scores
        ]
        # String aggregation names instead of NumPy callables: pandas has
        # deprecated passing np.mean / np.std to `agg`.
        summary = (
            pd.DataFrame(rows, columns=["Metric", "Score"])
            .groupby("Metric")
            .agg(
                Mean=pd.NamedAgg(column="Score", aggfunc="mean"),
                Std=pd.NamedAgg(column="Score", aggfunc="std"),
            )
        )
        display(summary)
    return scores


In [24]:
if Notebook.RUN_MODEL_SELECTION:
    from sklearn.dummy import DummyClassifier

    # Baseline classifier configured to always predict the constant label 1.
    naive_clf = DummyClassifier(strategy="constant", constant=1)
    # NOTE(review): X_train / y_train are not assigned at module level by any
    # visible cell (only X, y are) — this relies on leftover kernel state.
    # NOTE(review): with pretty=False nothing is displayed, and the resulting
    # (scores, "Naive Classifier") tuple is discarded inside the `if` block.
    spotCheckCV(naive_clf, X_train, y_train, pretty=False), "Naive Classifier"


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek

# Candidate classifiers, keyed by display name.
# Only the logistic regression fixes `random_state`; the others are unseeded.
MODELS = {
    "Logistic Regression": LogisticRegression(random_state=0, C=1e2, max_iter=1000),
    "Gaussian NB": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
}

# Variants of the classifiers above configured with class re-weighting.
IMBALANCED_MODELS = {
    "Balanced Decision Tree": DecisionTreeClassifier(class_weight="balanced"),
    "Balanced SVM": SVC(gamma="scale", class_weight="balanced"),
    "Balanced Random Forest": BalancedRandomForestClassifier(
        n_estimators=10, class_weight="balanced_subsample"
    ),
    # NOTE(review): scale_pos_weight=21 is presumably the approximate
    # negative:positive class ratio of the target — confirm against the data.
    "Balanced XGBoost": XGBClassifier(
        use_label_encoder=False, scale_pos_weight=21, eval_metric="logloss"
    ),
}

# Resampling strategies from imbalanced-learn, keyed by display name.
RESAMPLERS = {
    "SMOTE": SMOTE(),
    "Undersampling": RandomUnderSampler(sampling_strategy="majority"),
    # Two-stage resampling: SMOTE (sampling_strategy=0.1) followed by random
    # under-sampling (sampling_strategy=0.5).
    "SMOTE & Under": ImbPipeline(
        [
            ("oversampling", SMOTE(sampling_strategy=0.1)),
            ("undersampling", RandomUnderSampler(sampling_strategy=0.5)),
        ]
    ),
    "SMOTE & Tomek": SMOTETomek(tomek=TomekLinks(sampling_strategy="majority")),
}


In [34]:
from typing import NamedTuple, Any, Dict
from sklearn.base import BaseEstimator


In [69]:
class Task(NamedTuple):
    """One spot-check job: a named model plus the data and model settings."""

    # Label for the job.
    name: str
    # Called as `task.model(**task.model_params)` by the task loop, so in
    # practice this holds an estimator class rather than an instance.
    model: BaseEstimator
    # Used as the "preprocessor" step of the evaluation pipeline.
    preprocessor: BaseEstimator
    # Keyword arguments forwarded to `sci_variant_cached`.
    data_params: Dict[str, Any]
    # Keyword arguments used to instantiate `model`.
    model_params: Dict[str, Any]



In [91]:
# Jobs evaluated by the task loop; fields follow the `Task` tuple order
# (name, model, preprocessor, data_params, model_params).
# NOTE(review): `ct` is not defined at module level in any visible cell — it
# only exists as a local inside `transform_sci`. This relies on leftover
# kernel state and would raise NameError on a fresh run.
TASKS = [
    Task(
        "LR",
        LogisticRegression,
        ct,
        some_params,
        {"random_state": 0, "C": 1e2, "max_iter": 1000},
    )
]


In [94]:
# Number of leading rows of `sci` used to keep each spot check fast.
SPOT_CHECK_ROWS = 10000

for task in TASKS:
    # Announce the task so each displayed score table can be attributed.
    logging.info(f"Spot-checking task: {task.name}")
    X, y = sci_variant_cached(SCIData(sci.head(SPOT_CHECK_ROWS)), **task.data_params)
    clf = Pipeline(
        steps=[
            ("preprocessor", task.preprocessor),
            ("classifier", task.model(**task.model_params)),
        ]
    )
    spotCheckCV(clf, X, y, pretty=True)


Unnamed: 0_level_0,Mean,Std
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
AUC,0.833291,0.018894
Accuracy,0.965149,0.001996
F1 Score,0.170111,0.063223
F2 Score,0.127861,0.050026
Precision,0.387931,0.126991
Recall,0.109756,0.04397
