In [1]:
import warnings, math, itertools, json, logging
from hashlib import md5

import numpy as np
import pandas as pd

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 1


In [2]:
pd.set_option("display.max_columns", None)
sns.set_theme(style="darkgrid", palette="colorblind")
sns.set(rc={"figure.figsize": (11.5, 8.5), "figure.dpi": 100})

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)


In [11]:
from dataset import SCIData, SCICols
%aimport dataset
X, y = SCIData.load("data/sci_processed.h5").omit_redundant().omit_vbg().omit_ae().derive_critical_event().augment_hsmr(onehot=True).impute_blood().raw_news().mandate_news().mandate_diagnoses().xy(outcome='DiedDuringStay')

In [4]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline

numeric_pipeline = Pipeline(steps=[("scale", MinMaxScaler()),])

categorical_pipeline = Pipeline(
    steps=[("one-hot", OneHotEncoder(handle_unknown="ignore", sparse=False)),]
)

ct = make_column_transformer(
    (numeric_pipeline, make_column_selector(dtype_include=np.number)),
    (categorical_pipeline, make_column_selector(dtype_include=object)),
)

full_processor = Pipeline(steps=[("columns", ct)])


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    full_processor.fit_transform(X), y, test_size=0.25
)

In [6]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    f1_score,
    fbeta_score,
    make_scorer,
)

f2_score = make_scorer(fbeta_score, beta=2)

METRICS = {
    "Accuracy": "accuracy",
    "Precision": "precision",
    "Recall": "recall",
    "AUC": "roc_auc",
    "F1 Score": "f1",
    "F2 Score": f2_score,
}

In [7]:
from sklearn.model_selection import cross_validate


def spotCheckCV(model, X, y, cv=3, pretty=True):
    scores = cross_validate(model, X, y, scoring=METRICS, cv=cv)
    if pretty:
        display(
            pd.DataFrame(
                [
                    (name.split("_")[1], sc)
                    for name, score in scores.items()
                    if name.startswith("test")
                    for sc in score
                ],
                columns=["Metric", "Score"],
            )
            .groupby("Metric")
            .agg(
                Mean=pd.NamedAgg(column="Score", aggfunc=np.mean),
                Std=pd.NamedAgg(column="Score", aggfunc=np.std),
            )
        )
    else:
        return scores


In [27]:
from multiprocessing import Pool
from functools import partial

def cross_validate_parallel(models, X, y, cv=3, threads=None):
    if not threads:
        threads = len(models)
    
    with Pool(threads) as p:
        keys, values = zip(*models.items())
        result = zip(keys, p.map(partial(cross_validate, X=X, y=y, cv=cv, scoring=METRICS), values))

    return dict(result)


In [28]:
from sklearn.dummy import DummyClassifier

models = {
    'Dummy1': DummyClassifier(strategy="constant", constant=1),
    'Dummy0': DummyClassifier(strategy="constant", constant=0),
    'Dummyhalf': DummyClassifier(strategy="constant", constant=0.5)
}

cross_validate_parallel(models, X_train, y_train, cv=3)

{'Dummy1': {'fit_time': array([0.0781076 , 0.07810926, 0.06248593]),
  'score_time': array([0.06248665, 0.03123999, 0.04686213]),
  'test_Accuracy': array([0.04694439, 0.04694439, 0.04690408]),
  'test_Precision': array([0.04694439, 0.04694439, 0.04690408]),
  'test_Recall': array([1., 1., 1.]),
  'test_AUC': array([0.5, 0.5, 0.5]),
  'test_F1 Score': array([0.08967885, 0.08967885, 0.0896053 ]),
  'test_F2 Score': array([0.19761438, 0.19761438, 0.19747151])},
 'Dummy0': {'fit_time': array([0.07810569, 0.07810497, 0.06248856]),
  'score_time': array([0.04686809, 0.04686475, 0.03124166]),
  'test_Accuracy': array([0.95305561, 0.95305561, 0.95309592]),
  'test_Precision': array([0., 0., 0.]),
  'test_Recall': array([0., 0., 0.]),
  'test_AUC': array([0.5, 0.5, 0.5]),
  'test_F1 Score': array([0., 0., 0.]),
  'test_F2 Score': array([0., 0., 0.])},
 'Dummyhalf': {'fit_time': array([0.06249285, 0.05452728, 0.03124475]),
  'score_time': array([0., 0., 0.]),
  'test_Accuracy': array([nan, nan,