In [2]:
import warnings, math, itertools, json, logging, os
from hashlib import md5

import numpy as np
import pandas as pd

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 1


In [3]:
# Show every column and every row when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# One set_theme call only: seaborn.set is an alias of set_theme, so a second
# call that passes just `rc` resets style/palette to their defaults and
# silently drops palette="colorblind".
sns.set_theme(
    style="darkgrid",
    palette="colorblind",
    rc={"figure.figsize": (11.5, 8.5), "figure.dpi": 100},
)

# Timestamped INFO-level logging for the rest of the notebook.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)


In [3]:
class Notebook:
    """Execution switches read by the cells of this notebook."""

    # Gates the cells that load the dataset, build the targets, split the
    # data and run the cross-validation.
    RUN_TRAINING: bool = False
    # Gates the cell that parses --outcome / --filename from the command line.
    SCRIPT: bool = False

In [4]:
from dataset import SCIData, SCICols
%aimport dataset
#SCIData.load('data/sci.h5').clean_all().filter_vague_diagnoses().derive_readmission().omit_vbg().omit_ae().save()
if Notebook.RUN_TRAINING:
    # Load 'data/sci_processed.h5' and apply the SCIData preparation steps,
    # one per line, in the order they are chained.
    sci = (
        SCIData.load('data/sci_processed.h5')
        .augment_hsmr(onehot=True)
        .omit_news_extras()
        .mandate_news()
        .impute_blood()
        .raw_news()
        .mandate_diagnoses()
    )

In [5]:
if Notebook.RUN_TRAINING:

    def _outcome_vector(derived, outcome):
        """Return the second element of ``derived.xy(outcome=outcome)``."""
        return derived.xy(outcome=outcome)[1]

    # Feature matrix: first element of xy() once redundant columns are omitted;
    # the accompanying second element is discarded.
    X, _ = sci.omit_redundant().xy()

    # One target vector per outcome name, all derived from the same `sci` object.
    ys = {
        'CriticalCare48h': _outcome_vector(
            sci.derive_critical_care(within=2), "CriticalCare"
        ),
        'Death48h': _outcome_vector(
            sci.derive_death_within(within=2), "DiedWithinThreshold"
        ),
        'CriticalEvent48h': _outcome_vector(
            sci.derive_critical_event(within=2), "CriticalEvent"
        ),
        # NOTE(review): threshold of 48 is compared against TotalLOS directly —
        # presumably hours; confirm the unit against the dataset module.
        'LOS48h': (sci.TotalLOS >= 48).to_numpy(),
        # Missing readmission flags are treated as "not readmitted".
        'ReadmissionWithin30Days': sci.Readmitted.fillna(False).to_numpy(),
    }

In [6]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline

# Numeric columns: a single min-max scaling step.
numeric_pipeline = Pipeline([("scale", MinMaxScaler())])

# Object-dtype columns: dense one-hot encoding; categories unseen at fit time
# are ignored at transform time.
# NOTE(review): newer scikit-learn releases (1.2+) renamed `sparse` to
# `sparse_output` — confirm against the version pinned for this project.
categorical_pipeline = Pipeline(
    [("one-hot", OneHotEncoder(sparse=False, handle_unknown="ignore"))]
)

# Columns are routed by dtype: np.number -> numeric, object -> categorical.
numeric_columns = make_column_selector(dtype_include=np.number)
categorical_columns = make_column_selector(dtype_include=object)

ct = make_column_transformer(
    (numeric_pipeline, numeric_columns),
    (categorical_pipeline, categorical_columns),
)

# Single-step wrapper so the column transformer is addressed as "columns".
full_processor = Pipeline([("columns", ct)])


In [7]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    f1_score,
    fbeta_score,
    make_scorer,
)

# Scorer object wrapping fbeta_score with beta fixed at 2.
f2_score = make_scorer(fbeta_score, beta=2)

# Display name -> scoring spec handed to the cross-validation helpers.
# The first five entries are scikit-learn scorer names; the F2 entry is the
# scorer object built above and is appended last to keep the original order.
METRICS = {
    "Accuracy": "accuracy",
    "Precision": "precision",
    "Recall": "recall",
    "AUC": "roc_auc",
    "F1 Score": "f1",
}
METRICS["F2 Score"] = f2_score


In [8]:
from sklearn.model_selection import cross_validate


def spotCheckCV(model, X, y, cv=3, pretty=True):
    """Cross-validate ``model`` on ``(X, y)`` using the notebook's ``METRICS``.

    Parameters
    ----------
    model : estimator passed straight to ``cross_validate``.
    X, y : data and target passed straight to ``cross_validate``.
    cv : cross-validation spec forwarded to ``cross_validate`` (default 3).
    pretty : when True, display a per-metric table with the mean and standard
        deviation of the per-fold test scores and return ``None``; when
        False, return the raw ``cross_validate`` result instead.
    """
    scores = cross_validate(model, X, y, scoring=METRICS, cv=cv)
    if not pretty:
        return scores

    prefix = "test_"
    per_fold = pd.DataFrame(
        [
            # Strip only the leading "test_" so metric names that themselves
            # contain underscores (e.g. "neg_log_loss") are kept whole.
            (key[len(prefix):], fold_score)
            for key, fold_scores in scores.items()
            if key.startswith(prefix)
            for fold_score in fold_scores
        ],
        columns=["Metric", "Score"],
    )
    display(
        per_fold.groupby("Metric").agg(
            # String aliases select the pandas group-by mean/std directly;
            # passing np.mean/np.std is deprecated in recent pandas.
            Mean=pd.NamedAgg(column="Score", aggfunc="mean"),
            Std=pd.NamedAgg(column="Score", aggfunc="std"),
        )
    )


In [9]:
from cross_validation import cross_validate_parallel_file
%aimport cross_validation

In [10]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.combine import SMOTETomek

from sklearn.base import clone

# One seed for every stochastic estimator and resampler so repeated runs give
# the same fits; the value matches what LogisticRegression already used.
RANDOM_STATE = 0

# Baseline classifiers with no class-imbalance handling.
MODELS = {
    "Logistic Regression": LogisticRegression(
        random_state=RANDOM_STATE, C=1e2, max_iter=1000
    ),
    "Gaussian NB": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    # SVC is left unseeded: its random_state is ignored unless probability=True.
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(
        use_label_encoder=False, eval_metric="logloss", random_state=RANDOM_STATE
    ),
}

# Classifiers that handle imbalance through class/sample weighting.
IMBALANCED_MODELS = {
    "Balanced Decision Tree": DecisionTreeClassifier(
        class_weight="balanced", random_state=RANDOM_STATE
    ),
    "Balanced SVM": SVC(gamma="scale", class_weight="balanced"),
    "Balanced Random Forest": BalancedRandomForestClassifier(
        n_estimators=10,
        class_weight="balanced_subsample",
        random_state=RANDOM_STATE,
    ),
    # NOTE(review): scale_pos_weight=21 is presumably the negative/positive
    # ratio of one outcome — confirm, since it is applied to every outcome.
    "Balanced XGBoost": XGBClassifier(
        use_label_encoder=False,
        scale_pos_weight=21,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
    ),
}

# Resampling strategies combined with each baseline classifier below.
RESAMPLERS = {
    "SMOTE": SMOTE(random_state=RANDOM_STATE),
    "Undersampling": RandomUnderSampler(
        sampling_strategy="majority", random_state=RANDOM_STATE
    ),
    "SMOTE-Tomek": SMOTETomek(
        tomek=TomekLinks(sampling_strategy="majority"), random_state=RANDOM_STATE
    ),
}

# Every (resampler, baseline classifier) pair as "<model> with <resampler>".
# Each pipeline gets its own clones so that fitting one pipeline can never
# alter the estimator held by MODELS or by another pipeline.
resampled_models = {
    f"{model_name} with {sampler_name}": ImbPipeline(
        steps=[(sampler_name, clone(sampler)), (model_name, clone(model))]
    )
    for sampler_name, sampler in RESAMPLERS.items()
    for model_name, model in MODELS.items()
}


In [11]:
if Notebook.SCRIPT:
    import argparse

    # Command-line options read when the notebook is executed as a script.
    # NOTE: the --outcome help text reads `ys`, which is only defined when
    # Notebook.RUN_TRAINING is True.
    parser = argparse.ArgumentParser(description='Run test')
    for option, help_text in (
        ('--outcome', f'Can be: {ys.keys()}'),
        ('--filename', 'Output filename'),
    ):
        parser.add_argument(option, type=str, help=help_text)

    args = parser.parse_args()

    # Banner announcing which outcome this run targets.
    print(f'########## RUNNING {args.outcome} ##############')

In [12]:
from sklearn.model_selection import train_test_split
if Notebook.RUN_TRAINING:
    # NOTE: `args` is only defined by the Notebook.SCRIPT cell, so this cell
    # needs SCRIPT mode (or a manually supplied `args`) to run.
    y = ys[args.outcome]

    # Split the raw features first, then fit the preprocessor on the training
    # part only, so scaling ranges and one-hot categories are not learned from
    # the held-out rows. `X` is deliberately left untouched (raw) so the cell
    # can be re-run; the fixed seed and stratification keep the split
    # reproducible and the class ratio equal in both parts.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=0
    )
    X_train = full_processor.fit_transform(X_train_raw)
    X_test = full_processor.transform(X_test_raw)


In [13]:
if Notebook.RUN_TRAINING:
    # Candidate set: baseline models, then the resampling pipelines, then the
    # class-weighted models (on a duplicate name the later entry wins).
    m = dict(MODELS)
    m.update(resampled_models)
    m.update(IMBALANCED_MODELS)

    # 2-fold cross-validation of every candidate on the training split; the
    # helper is given the output file name taken from the command line.
    cross_validate_parallel_file(
        filename=args.filename, models=m, X=X_train, y=y_train, scoring=METRICS, cv=2
    )


## Results

In [4]:
# Directory holding one CSV of cross-validation results per test run.
# Built with os.path.join so the same path is used for listing and reading and
# works on every OS (the previous 'results\ml_test_2' literal relied on an
# invalid "\m" escape and only resolved on Windows).
RESULTS_DIR = os.path.join("results", "ml_test_2")

# Stack every CSV into one frame, tagging rows with the file name (without
# extension) in a `test` column. Files are read in sorted order so the row
# order is deterministic, and non-CSV entries in the directory are skipped.
df = pd.concat(
    [
        pd.read_csv(os.path.join(RESULTS_DIR, fname)).assign(
            test=os.path.splitext(fname)[0]
        )
        for fname in sorted(os.listdir(RESULTS_DIR))
        if fname.endswith(".csv")
    ],
    axis=0,
)

In [7]:
# Mean of every numeric result column per (test, model), shown for the
# 'CriticalEvent' test only. numeric_only=True keeps non-numeric columns out
# of the aggregation instead of raising on them in recent pandas versions.
df.groupby(['test', 'model']).mean(numeric_only=True).loc['CriticalEvent']

Unnamed: 0_level_0,err,fit_time,score_time,test_Accuracy,test_Precision,test_Recall,test_AUC,test_F1 Score,test_F2 Score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Balanced Decision Tree,0.0,38.768819,0.301163,0.971962,0.175507,0.185966,0.585609,0.180474,0.1837
Balanced Random Forest,0.0,3.207345,0.890289,0.845823,0.077179,0.75449,0.870676,0.140013,0.273746
Balanced SVM,0.0,888.57086,321.909117,0.90881,0.11258,0.650558,0.873553,0.191912,0.332539
Balanced XGBoost,0.0,60.993473,0.350846,0.963852,0.215077,0.444791,0.874887,0.289735,0.366228
Decision Tree,0.0,60.052651,0.36596,0.970157,0.176159,0.217761,0.60032,0.194636,0.207841
Decision Tree with SMOTE,0.0,73.041921,0.320252,0.964699,0.143646,0.225681,0.601438,0.175499,0.202495
Decision Tree with SMOTE-Tomek,0.0,538.906366,0.087339,0.964699,0.143646,0.225681,0.601438,0.175499,0.202495
Decision Tree with Undersampling,0.0,0.793403,0.328586,0.740922,0.046075,0.739221,0.740086,0.086735,0.18435
Gaussian NB,0.0,1.027507,0.43844,0.101116,0.017807,0.979481,0.534011,0.034978,0.082999
Gaussian NB with SMOTE,0.0,6.420742,0.881531,0.219882,0.019964,0.953666,0.586284,0.039109,0.0921
