In [2]:
import warnings, math, itertools, json, logging, os
from hashlib import md5

import numpy as np
import pandas as pd

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 1


In [205]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
    f1_score,
    fbeta_score,
    make_scorer,
    precision_recall_curve
)

# F-beta with beta=2, wrapped with make_scorer so it can be used as a
# `scoring` entry alongside the built-in scorer names below.
f2_score = make_scorer(fbeta_score, beta=2)

# Display label -> scorer passed as `scoring=` during cross-validation.
# Insertion order is preserved, so it also fixes the order of the score columns.
METRICS = dict(
    [
        ("Accuracy", "accuracy"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("AUC", "roc_auc"),
        ("F1 Score", "f1"),
        ("F2 Score", f2_score),
    ]
)

def get_metrics(y_true, y_pred, y_score=None):
    """Compute the headline classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard (already thresholded) class predictions.
    y_score : array-like, optional
        Continuous scores or positive-class probabilities. When supplied, an
        "AUC" entry computed with ``roc_auc_score`` is included; AUC cannot be
        derived from hard predictions, so it is omitted otherwise.

    Returns
    -------
    dict
        Metric display name -> value, using the same labels and ordering as
        the ``METRICS`` scoring mapping ("AUC" present only with ``y_score``).
    """
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
    }
    if y_score is not None:
        # Inserted here so the key order matches METRICS.
        metrics["AUC"] = roc_auc_score(y_true, y_score)
    metrics["F1 Score"] = f1_score(y_true, y_pred)
    metrics["F2 Score"] = fbeta_score(y_true, y_pred, beta=2)
    return metrics

In [4]:
from cross_validation import cross_validate_parallel
from sklearn.model_selection import train_test_split
%aimport cross_validation

def spotCheckCV(models, X, y, cv=3, pretty=True):
    """Cross-validate one or more models and return their mean scores.

    Parameters
    ----------
    models : estimator or dict
        Either a mapping of label -> estimator, or a single estimator, which
        is labelled ``'Model'``. Any ``dict`` subclass is treated as a mapping.
    X, y
        Features and target. A fixed 25% (``random_state=42``) is split off
        and left untouched; cross-validation runs on the remaining 75% only.
    cv : int, default 3
        Forwarded unchanged to ``cross_validate_parallel``.
    pretty : bool, default True
        Currently unused; kept so existing calls remain valid.

    Returns
    -------
    pandas.DataFrame
        One row per model label (index ``model``) holding the mean of every
        column reported by ``cross_validate_parallel``.
    """
    if not isinstance(models, dict):
        models = {'Model': models}

    # The hold-out part is deliberately not used here, so later cells can
    # evaluate on it without it having influenced the spot check.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )
    results = cross_validate_parallel(models, X_train, y_train, scoring=METRICS, cv=cv)

    per_model = [
        pd.DataFrame(results[name]).assign(model=name) for name in results.keys()
    ]
    return pd.concat(per_model).groupby('model').mean()

In [119]:
from dataset import SCIData, SCICols
%aimport dataset
# Kept for reference, not executed: presumably the one-off preprocessing that
# produced data/sci_processed.h5 from the raw file — TODO confirm.
#SCIData.load('data/sci.h5').clean_all().filter_vague_diagnoses().derive_readmission().omit_vbg().omit_ae().save()
# Load the processed dataset and call derive_critical_event(within=2).
# NOTE(review): presumably this adds the "CriticalEvent" outcome used by the
# later cells; the unit of `within` is not visible here — confirm in dataset.py.
sci = SCIData.load('data/sci_processed.h5').derive_critical_event(within=2)

## LR with NEWS only

In [203]:
# Feature matrix limited to SCICols.news_data_raw, requested as float, with
# "CriticalEvent" as the outcome (per the xy() keyword arguments).
# NOTE(review): mandate_news() presumably keeps only records with NEWS data — confirm.
X, y = sci.mandate_news().xy(x=SCICols.news_data_raw, dtype=float, outcome="CriticalEvent")

In [13]:
from sklearn.linear_model import LogisticRegression
# Two logistic regressions that differ only in class weighting, compared with
# the shared spot-check helper; the resulting score table is the cell output.
models = dict(
    LR=LogisticRegression(random_state=42),
    LR_Balanced=LogisticRegression(class_weight='balanced', random_state=42),
)
spotCheckCV(models, X, y)

Unnamed: 0_level_0,fit_time,score_time,test_Accuracy,test_Precision,test_Recall,test_AUC,test_F1 Score,test_F2 Score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LR,0.445154,0.041655,0.982311,0.557528,0.045441,0.818121,0.083757,0.055616
LR_Balanced,0.46864,0.044205,0.852968,0.078048,0.668723,0.81885,0.139777,0.266021


In [206]:
# Same 75/25 split (random_state=42) that spotCheckCV uses internally.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
log = LogisticRegression(class_weight='balanced', random_state=42).fit(X_train, y_train)

# Pick the decision threshold whose training-set precision/recall ratio is
# closest to `target_ratio`, then apply it to the held-out test set.
# NOTE(review): `target_ratio` is not assigned in any cell visible here; it must
# already exist in the kernel for this cell to run — confirm where it is set.
precision, recall, thresholds = precision_recall_curve(y_train, log.predict_proba(X_train)[:, 1])

# precision/recall carry one extra trailing point (precision=1, recall=0) that
# has no threshold; drop it so positions line up with `thresholds`.
with np.errstate(divide='ignore', invalid='ignore'):
    ratio_gap = np.abs(precision[:-1] / recall[:-1] - target_ratio)

# Thresholds with zero recall yield 0/0 = NaN; plain argmin would return the
# first NaN position, so use the NaN-ignoring variant.
closest = thresholds[np.nanargmin(ratio_gap)]

# precision[i]/recall[i] describe predictions with score >= thresholds[i], so
# the cut-off has to be inclusive to reproduce the selected operating point.
y_pred = np.where(log.predict_proba(X_test)[:, 1] >= closest, 1, 0)

In [212]:
# One-row summary of the thresholded model's metrics on the held-out test set.
pd.DataFrame([get_metrics(y_test, y_pred)], index=['LR_Calibrated'])

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F2 Score
LR_Calibrated,0.972249,0.246914,0.28169,0.263158,0.273973


## XGB with all features

In [213]:
# Build the full feature set through the SCIData pipeline, with
# "CriticalEvent" as the outcome.
X, y = (
    sci.omit_redundant()
    .omit_ae()
    .impute_blood()
    .raw_news()
    .mandate_news()
    .augment_hsmr()
    .encode_ccs_onehot()
    .xy(outcome='CriticalEvent')
)
# Drop three columns and turn boolean values into floats, column by column.
X = X.drop(['ReadmissionBand', 'AgeBand', 'AandEPresentingComplaint'], axis=1).apply(
    lambda column: column.replace({True: 1.0, False: 0.0})
)
# Object-dtype columns become pandas categoricals (the XGBoost models in this
# notebook are created with enable_categorical=True).
X[X.select_dtypes(include=object).columns] = (
    X.select_dtypes(include=object).astype('category')
)

In [221]:
# Compare XGBoost with and without positive-class weighting.
# NOTE(review): the labels read 'WGB'/'XBG' (probably meant 'XGB'); they are
# kept as-is because they appear verbatim in the result table.
# NOTE(review): XGBClassifier is not imported in any cell visible here.
# The weight is n_samples / n_positives (not n_negatives / n_positives).
spotCheckCV(
    dict(
        WGB_Unweighted=XGBClassifier(
            tree_method='approx', enable_categorical=True, scale_pos_weight=1
        ),
        XBG_Weighted=XGBClassifier(
            tree_method='approx',
            enable_categorical=True,
            scale_pos_weight=round(y.shape[0] / y.sum()),  # Weight=56
        ),
    ),
    X,
    y,
)

Unnamed: 0_level_0,fit_time,score_time,test_Accuracy,test_Precision,test_Recall,test_AUC,test_F1 Score,test_F2 Score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
WGB_Unweighted,9.608938,0.298454,0.982421,0.525755,0.185677,0.927296,0.274239,0.213208
XBG_Weighted,9.555976,0.262892,0.974327,0.3438,0.480742,0.904182,0.400806,0.445189


In [227]:
# Same preparation as the preceding XGB cell, except that augment_shmi()
# replaces augment_hsmr() and mandate_news() runs before raw_news().
X, y = (
    sci.omit_redundant()
    .omit_ae()
    .impute_blood()
    .mandate_news()
    .raw_news()
    .augment_shmi()
    .encode_ccs_onehot()
    .xy(outcome='CriticalEvent')
)
# Drop three columns and turn boolean values into floats, column by column.
X = X.drop(['ReadmissionBand', 'AgeBand', 'AandEPresentingComplaint'], axis=1).apply(
    lambda column: column.replace({True: 1.0, False: 0.0})
)
# Object-dtype columns become pandas categoricals for enable_categorical=True.
X[X.select_dtypes(include=object).columns] = (
    X.select_dtypes(include=object).astype('category')
)

In [228]:
# Repeat the weighted/unweighted XGBoost comparison on the SHMI feature set.
# NOTE(review): the labels read 'WGB'/'XBG' (probably meant 'XGB'); kept as-is.
# NOTE(review): XGBClassifier is not imported in any cell visible here, and the
# "Weight=56" remark was copied from the earlier cell — confirm it still holds
# for this X, y.
spotCheckCV(
    dict(
        WGB_Unweighted=XGBClassifier(
            tree_method='approx', enable_categorical=True, scale_pos_weight=1
        ),
        XBG_Weighted=XGBClassifier(
            tree_method='approx',
            enable_categorical=True,
            scale_pos_weight=round(y.shape[0] / y.sum()),  # Weight=56
        ),
    ),
    X,
    y,
)