In [1]:
import warnings
# Silence every FutureWarning for the rest of the session. This also hides
# deprecation notices raised by the libraries used in later cells.
warnings.simplefilter(action="ignore", category=FutureWarning)
import math
import numpy as np
import pandas as pd

# Render all columns when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

import matplotlib.pyplot as plt
import seaborn as sns
# Configure seaborn in a single call. `sns.set` is an alias of `sns.set_theme`,
# so a second call that passes only `rc` re-applies the default style and
# palette and silently discards the "colorblind" palette chosen here.
sns.set_theme(
    style="darkgrid",
    palette="colorblind",
    rc={"figure.figsize": (11.5, 8.5), "figure.dpi": 100},
)

from sklearn import metrics

from dataset import SCIData, SCICols
# Enable IPython's autoreload extension in mode 1: only the modules registered
# with %aimport below are re-imported before each cell execution, so edits to
# the project helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport utils.data_profiling, dataset

In [2]:
class Notebook:
    """Notebook-wide switches read by later cells."""

    # When True, the cells guarded by this flag run their model spot checks.
    RUN_MODEL_SELECTION: bool = True

In [3]:
# Candidate values for each preprocessing option. The keys match the keyword
# arguments later collected in `some_params`. This literal is only displayed
# as the cell output; it is not bound to a name.
{
    'news_columns': ['impute', 'omit', 'mandate'],
    'blood_columns': ['impute', 'omit', 'mandate'],
    'news_format': ['raw', 'scored'],
    'diagnoses_grouping': ['icd10_3code', 'icd10_chapter', 'icd10_group', 'ccs', 'hsmr', 'shmi'],
    'diagnoses_onehot': [False, True]
    # Assumed steps:
    # - Mandate diagnoses, omit a&e text, omit VBG
}

{'news_columns': ['impute', 'omit', 'mandate'],
 'blood_columns': ['impute', 'omit', 'mandate'],
 'news_format': ['raw', 'scored'],
 'diagnoses_grouping': ['icd10_3code',
  'icd10_chapter',
  'icd10_group',
  'ccs',
  'hsmr',
  'shmi'],
 'diagnoses_onehot': [False, True]}

In [4]:
# Load the processed dataset from disk, then apply the three omit_* steps in
# the same order as before (redundant -> VBG -> A&E).
sci = SCIData.load('data/sci_processed.h5')
sci = sci.omit_redundant().omit_vbg().omit_ae()

In [5]:
# One concrete choice for each preprocessing option; unpacked as keyword
# arguments into `sci.preprocess_from_params`.
some_params = dict(
    news_columns='impute',
    blood_columns='impute',
    news_format='raw',
    diagnoses_grouping='hsmr',
    diagnoses_onehot=True,
)

In [38]:
# Build the modelling inputs from the options chosen in `some_params`.
# Presumably X is the feature table and y the target vector — confirm against
# SCIData.preprocess_from_params.
X, y = sci.preprocess_from_params(**some_params)

In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline

# Partition the columns by dtype: numeric columns are min-max scaled, every
# other column is one-hot encoded.
numeric = X.select_dtypes(include='number').columns.tolist()
categorical = X.select_dtypes(exclude='number').columns.tolist()

numeric_pipeline = Pipeline(steps=[
    ('scale', MinMaxScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current spelling
# first and fall back for older installations so the cell runs on both.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_pipeline = Pipeline(steps=[
    ('one-hot', one_hot_encoder)
])

full_processor = Pipeline(steps=[
    ('columns', ColumnTransformer(transformers=[
        ('number', numeric_pipeline, numeric),
        ('category', categorical_pipeline, categorical)
    ]))
])

# NOTE(review): `X` is overwritten with the transformed output, so this cell
# cannot be re-run without first re-running the cell that creates `X` (the
# `select_dtypes` calls above need the original table).
# NOTE(review): the scaler and encoder are fitted on all rows before the
# train/test split, so test rows influence the fitted ranges and categories.
# Consider splitting first and fitting on the training part only.
X = full_processor.fit_transform(X)

In [44]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [45]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, fbeta_score, make_scorer

# Scorer for F-beta with beta=2 (recall weighted more heavily than precision).
f2_score = make_scorer(fbeta_score, beta=2)

# Display label -> scikit-learn scoring spec, in the order used for reporting.
METRICS = dict([
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('AUC', 'roc_auc'),
    ('F1 Score', 'f1'),
    ('F2 Score', f2_score),
])

In [48]:
from sklearn.model_selection import cross_validate

def spotCheckCV(model, X, y, cv=3, pretty=True):
    """Cross-validate `model` on every metric in `METRICS` and tabulate the fold scores.

    Parameters
    ----------
    model : estimator
        Passed unchanged to `cross_validate`.
    X, y : array-like
        Features and target passed unchanged to `cross_validate`.
    cv : int or CV splitter, default 3
        Forwarded as the `cv` argument of `cross_validate`.
    pretty : bool, default True
        When True, also display a per-metric summary table (mean and sample
        standard deviation of the fold scores).

    Returns
    -------
    pandas.DataFrame
        Long-format frame with one row per (metric, fold) and the columns
        "Metric" and "Score".
    """
    cv_results = cross_validate(model, X, y, scoring=METRICS, cv=cv)

    # Keep only the per-fold test scores. Strip the prefix by length rather than
    # splitting on "_" so metric labels that contain underscores stay intact.
    prefix = 'test_'
    rows = [
        (key[len(prefix):], fold_score)
        for key, fold_scores in cv_results.items()
        if key.startswith(prefix)
        for fold_score in fold_scores
    ]
    r = pd.DataFrame(rows, columns=["Metric", "Score"])

    if pretty:
        # String aggregations give the same values pandas currently computes for
        # np.mean / np.std here (std with ddof=1) without depending on the
        # deprecated mapping of numpy callables to groupby methods.
        display(r.groupby("Metric").agg(
            Mean=pd.NamedAgg(column="Score", aggfunc="mean"),
            Std=pd.NamedAgg(column="Score", aggfunc="std"),
        ))
    return r

In [49]:
if Notebook.RUN_MODEL_SELECTION:
    from sklearn.dummy import DummyClassifier

    # Baseline that always predicts the positive class (label 1); gives a
    # reference point for the metrics of real models.
    naive_clf = DummyClassifier(strategy='constant', constant=1)

    # Print the label explicitly: a trailing `, "Naive Classifier"` inside the
    # `if` block only builds a tuple that is thrown away, so the label was
    # never shown.
    print("Naive Classifier")
    spotCheckCV(naive_clf, X_train, y_train)


Unnamed: 0_level_0,Mean,Std
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
AUC,0.5,0.0
Accuracy,0.038513,0.0
F1 Score,0.07417,0.0
F2 Score,0.166861,0.0
Precision,0.038513,0.0
Recall,1.0,0.0
