In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

# Import datasets

In [2]:
csv_path = os.path.join("..", "..", "data","LINKED_DATA", "TSR_ALL", "TSR_ALL1", "TSR_ALL1_MICE1.csv")
tsr_all1_df = pd.read_csv(csv_path)
tsr_all1_df.shape

(44850, 233)

# Convert the multiple feature and outcome into binary ones

In [3]:
mRS1 = tsr_all1_df.mrs_tx_1
mRS1[(mRS1 == 0) | (mRS1 == 1) | (mRS1 == 2)] = 1 #GOOD
mRS1[(mRS1 == 3) | (mRS1 == 4) | (mRS1 == 5) | (mRS1 == 6) | (mRS1 == 9)] = 0 #BAD
# discharged = tsr_all1_df.discharged_mrs
# discharged[(discharged == 0) | (discharged == 1) | (discharged == 2)] = 1 #GOOD
# discharged[(discharged == 3) | (discharged == 4) | (discharged == 5) | (discharged == 6) | (discharged == 9)] = 0 #BAD

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mRS1[(mRS1 == 0) | (mRS1 == 1) | (mRS1 == 2)] = 1 #GOOD
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mRS1[(mRS1 == 3) | (mRS1 == 4) | (mRS1 == 5) | (mRS1 == 6) | (mRS1 == 9)] = 0 #BAD


# Group all features and the outcome

In [4]:
nominal_features = ["edu_id", "pro_id", "opc_id", "toast_id", "offdt_id", "gender_tx", "hd_id", "pcva_id",
                    "pcvaci_id", "pcvach_id", "po_id", "ur_id", "sm_id", "ptia_id", "hc_id", "hcht_id",
                    "hchc_id", "ht_id", "dm_id", "pad_id", "al_id", "ca_id", "fahiid_parents_1",
                    "fahiid_parents_2", "fahiid_parents_3", "fahiid_parents_4", "fahiid_brsi_1",
                    "fahiid_brsi_2", "fahiid_brsi_3", "fahiid_brsi_4"]
ordinal_features = ["gcse_nm", "gcsv_nm", "gcsm_nm", "discharged_mrs", "feeding", "transfers",
                    "bathing", "toilet_use", "grooming", "mobility", "stairs", "dressing", "bowel_control",
                    "bladder_control", "nihs_1a_in", "nihs_1b_in", "nihs_1c_in", "nihs_2_in", "nihs_3_in", "nihs_4_in",
                    "nihs_5al_in", "nihs_5br_in", "nihs_6al_in", "nihs_6br_in", "nihs_7_in", "nihs_8_in", "nihs_9_in",
                    "nihs_10_in", "nihs_11_in", "nihs_1a_out", "nihs_1b_out", "nihs_1c_out", "nihs_2_out", "nihs_3_out",
                    "nihs_4_out", "nihs_5al_out", "nihs_5br_out", "nihs_6al_out", "nihs_6br_out", "nihs_7_out",
                    "nihs_8_out", "nihs_9_out", "nihs_10_out", "nihs_11_out"]
boolean = ["toastle_fl", "toastli_fl", "toastsce_fl", "toastsmo_fl", "toastsra_fl", "toastsdi_fl",
           "toastsmi_fl", "toastsantip_fl", "toastsau_fl", "toastshy_fl", "toastspr_fl", "toastsantit_fl",
           "toastsho_fl", "toastshys_fl", "toastsca_fl", "thda_fl", "thdh_fl", "thdi_fl", "thdam_fl", "thdv_fl",
           "thde_fl", "thdm_fl", "thdr_fl", "thdp_fl", "trman_fl", "trmas_fl", "trmti_fl", "trmhe_fl",
           "trmwa_fl", "trmia_fl", "trmfo_fl", "trmta_fl", "trmsd_fl", "trmre_fl", "trmen_fl", "trmag_fl",
           "trmcl_fl", "trmpl_fl", "trmlm_fl", "trmiv_fl", "trmve_fl", "trmng_fl", "trmdy_fl", "trmicu_fl",
           "trmsm_fl", "trmed_fl", "trmop_fl", "om_fl", "omas_fl", "omag_fl", "omti_fl", "omcl_fl", "omwa_fl",
           "ompl_fl", "omanh_fl", "omand_fl", "omli_fl", "am_fl", "amas_fl", "amag_fl", "amti_fl", "amcl_fl",
           "amwa_fl", "ampl_fl", "amanh_fl", "amand_fl", "amli_fl", "compn_fl", "comut_fl", "comug_fl",
           "compr_fl", "compu_fl", "comac_fl", "comse_fl", "comde_fl", "detst_fl", "dethe_fl", "detho_fl",
           "detha_fl", "detva_fl", "detre_fl", "detme_fl", "ct_fl", "mri_fl", "ecgl_fl", "ecga_fl", "ecgq_fl",
           "cortical_aca_ctr", "cortical_mca_ctr", "subcortical_aca_ctr", "subcortical_mca_ctr", "pca_cortex_ctr",
           "thalamus_ctr", "brainstem_ctr", "cerebellum_ctr", "watershed_ctr", "hemorrhagic_infarct_ctr",
           "old_stroke_ctci", "cortical_aca_ctl", "cortical_mca_ctl", "subcortical_aca_ctl", "subcortical_mca_ctl",
           "pca_cortex_ctl", "thalamus_ctl", "brainstem_ctl", "cerebellum_ctl", "watershed_ctl",
           "hemorrhagic_infarct_ctl", "old_stroke_ctch", "cortical_aca_mrir", "cortical_mca_mrir",
           "subcortical_aca_mrir", "subcortical_mca_mrir", "pca_cortex_mrir", "thalamus_mrir", "brainstem_mrir",
           "cerebellum_mrir", "watershed_mrir", "hemorrhagic_infarct_mrir", "old_stroke_mrici", "cortical_aca_mril",
           "cortical_mca_mril", "subcortical_aca_mril", "subcortical_mca_mril", "pca_cortex_mril",
           "thalamus_mril", "brainstem_mril", "cerebellum_mril", "watershed_mril", "hemorrhagic_infarct_mril",
           "old_stroke_mrich"]
continuous = ["height_nm", "weight_nm", "sbp_nm", "dbp_nm", "bt_nm", "hr_nm", "rr_nm", "hb_nm",
              "hct_nm", "platelet_nm", "wbc_nm", "ptt1_nm", "ptt2_nm", "ptinr_nm", "er_nm", "bun_nm",
              "cre_nm", "ua_nm", "tcho_nm", "tg_nm", "hdl_nm",
              "ldl_nm", "gpt_nm", "age", "hospitalised_time"]
labels = ["mrs_tx_1"]

# Machine Learning

## Preprocess input data (GOOD when Discharge)

In [156]:
## discharged mRS = GOOD (tsr_all1_df.discharged_mrs == 1)
mrs_dis1 = tsr_all1_df[(tsr_all1_df.discharged_mrs == 1) | (tsr_all1_df.discharged_mrs == 0) | (tsr_all1_df.discharged_mrs == 2)]

In [157]:
## input dataset
tsr_1G_input = mrs_dis1.drop(["icase_id", "idcase_id", "mrs_tx_1"], axis=1)
print(tsr_1G_input.shape)
tsr_1G_input = tsr_1G_input.astype("float64")
tsr_1G_input = np.array(tsr_1G_input.values)

(20706, 230)


In [158]:
## output dataset
tsr_1G_output = mrs_dis1.mrs_tx_1
print(tsr_1G_output.shape)
tsr_1G_output = tsr_1G_output.astype("float64")
tsr_1G_output = np.array(tsr_1G_output.values)

(20706,)


In [159]:
## train_test_split
G_X_train, G_X_test, G_y_train, G_y_test = train_test_split(tsr_1G_input, tsr_1G_output, test_size=0.3, random_state=19)
print("The shape of GOOD's X_train:", G_X_train.shape)
print("The shape of GOOD's y_train:", G_y_train.shape)
print("The shape of GOOD's X_test:", G_X_test.shape)
print("The shape of GOOD's y_test:", G_y_test.shape)

The shape of GOOD's X_train: (14494, 230)
The shape of GOOD's y_train: (14494,)
The shape of GOOD's X_test: (6212, 230)
The shape of GOOD's y_test: (6212,)


In [160]:
## scale G_X_train
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
G_X_train = pd.DataFrame(G_X_train)
G_X_train.columns = tsr_all1_df.drop(["icase_id", "idcase_id", "mrs_tx_1"], axis=1).columns

scaler = MinMaxScaler()
G_X_train[continuous] = scaler.fit_transform(G_X_train[continuous])

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=6)
G_X_train[ordinal_features] = encoder.fit_transform(G_X_train[ordinal_features])

ohe = OneHotEncoder(sparse=False, handle_unknown = "ignore")
nominal_train = ohe.fit_transform(G_X_train[nominal_features])
G_X_train = pd.concat([G_X_train, pd.DataFrame(nominal_train)], axis=1)
G_X_train = G_X_train.drop(nominal_features, axis=1)

G_X_train = np.array(G_X_train.values)

In [161]:
## scale G_X_test
G_X_test = pd.DataFrame(G_X_test)
G_X_test.columns = tsr_all1_df.drop(["icase_id", "idcase_id", "mrs_tx_1"], axis=1).columns

G_X_test[continuous] = scaler.transform(G_X_test[continuous])

G_X_test[ordinal_features] = encoder.transform(G_X_test[ordinal_features])

nominal_test = ohe.transform(G_X_test[nominal_features])
G_X_test = pd.concat([G_X_test, pd.DataFrame(nominal_test)], axis=1)
G_X_test = G_X_test.drop(nominal_features, axis=1)

G_X_test = np.array(G_X_test.values)

## Undersampling

In [11]:
# Step 1 - TomekLinks
from collections import Counter
from imblearn.under_sampling import TomekLinks
print('Original dataset shape %s' % Counter(G_y_train))
tl = TomekLinks()
G_X_train_tl, G_y_train_tl = tl.fit_resample(G_X_train, G_y_train)
print('Resampled dataset shape %s' % Counter(G_y_train_tl))

Original dataset shape Counter({1.0: 14041, 0.0: 453})
Resampled dataset shape Counter({1.0: 13980, 0.0: 453})


## Algorithms

In [12]:
### base et 
et = ExtraTreesClassifier(random_state=19)
et.fit(G_X_train_tl, G_y_train_tl)
G_y_train_pred = et.predict_proba(G_X_train_tl)
fpr, tpr, thresholds = roc_curve(G_y_train_tl, G_y_train_pred[:, 1])
G_train_auroc = auc(fpr, tpr)
print('AUC of training set:', G_train_auroc)
G_y_test_pred = et.predict_proba(G_X_test)
fpr, tpr, thresholds = roc_curve(G_y_test, G_y_test_pred[:, 1])
G_test_auroc = auc(fpr, tpr)
print('AUC of testing set:', G_test_auroc)

# confusion_matrix(G_y_test, G_y_predicted)

AUC of training set: 1.0
AUC of testing set: 0.6995404529212713


In [13]:
hyperparameters_et = {"n_estimators": (50, 100, 150),
                      "criterion": ("gini", "entropy"),
                      "max_depth": (25, 50, 100),
                      "min_samples_split": (25, 50, 100),
                      "max_features": ("auto", "sqrt", "log2"),
                      "bootstrap": (True, False),
                      "class_weight": ('balanced', {0: 1, 1: 25}, {0: 25, 1: 1}),
                      "max_samples": (100, 0.7),
                      "oob_score": (True, False)}

etG_gscv = GridSearchCV(estimator=ExtraTreesClassifier(random_state=19),
                       param_grid=hyperparameters_et,
                       n_jobs=-1,
                       scoring='roc_auc',
                       verbose=5,
                       cv=5)

etG_gsCV = etG_gscv.fit(G_X_train_tl, G_y_train_tl)
print('--> Tuned Parameters Best Score: ', etG_gsCV.best_score_)
print('--> Best Parameters: \n', etG_gsCV.best_params_)

Fitting 5 folds for each of 3888 candidates, totalling 19440 fits




--> Tuned Parameters Best Score:  0.7416387094069498
--> Best Parameters: 
 {'bootstrap': True, 'class_weight': {0: 25, 1: 1}, 'criterion': 'gini', 'max_depth': 25, 'max_features': 'auto', 'max_samples': 0.7, 'min_samples_split': 100, 'n_estimators': 150, 'oob_score': True}


In [14]:
G_y_train_pred = etG_gsCV.predict_proba(G_X_train_tl)
fpr, tpr, thresholds = roc_curve(G_y_train_tl, G_y_train_pred[:, 1])
G_train_auroc = auc(fpr, tpr)
print('AUC of training set:', G_train_auroc)

G_y_test_pred = etG_gsCV.predict_proba(G_X_test)
fpr, tpr, thresholds = roc_curve(G_y_test, G_y_test_pred[:, 1])
G_test_auroc = auc(fpr, tpr)
print('AUC of testing set:', G_test_auroc)

AUC of training set: 0.9651785742482955
AUC of testing set: 0.7378698028822263


## CalibratedClassifierCV

In [88]:
etG_cccv = CalibratedClassifierCV(base_estimator=etG_gsCV.best_estimator_, cv=5)
etG_ccCV = etG_cccv.fit(G_X_train_tl, G_y_train_tl)

G_y_train_pred = etG_ccCV.predict_proba(G_X_train_tl)
fpr, tpr, thresholds = roc_curve(G_y_train_tl, G_y_train_pred[:, 1])
G_train_auroc = auc(fpr, tpr)
print('AUC of training set:', G_train_auroc)

G_y_test_pred = etG_ccCV.predict_proba(G_X_test)
fpr, tpr, thresholds = roc_curve(G_y_test, G_y_test_pred[:, 1])
G_test_auroc = auc(fpr, tpr)
print('AUC of testing set:', G_test_auroc)

AUC of training set: 0.9610114733441341
AUC of testing set: 0.7435878747722378


## Threshold

In [128]:
etG_sigma = etG_gsCV.best_estimator_.feature_importances_

In [129]:
etG_sigma_df = pd.DataFrame(etG_sigma)

In [130]:
etG_sigma_min = etG_sigma.min()
etG_sigma_std = etG_sigma.std()
etG_sigma_threshold = etG_sigma_min + etG_sigma_std

In [131]:
etG_sigma_index = etG_sigma_df[etG_sigma_df > etG_sigma_threshold].dropna().index

In [133]:
etG_sigma_index

Int64Index([  0,   1,   7,  10,  11,  13,  27,  34,  35,  43,  58,  59,  68,
             69,  70,  71,  74,  77,  80,  81,  82,  83,  84,  90,  91,  92,
            108, 110, 114, 116, 117, 118, 119, 120, 123, 128, 131, 134, 138,
            147, 149, 158, 160, 173, 174, 176, 177, 181, 188, 189, 190, 191,
            192, 194, 196, 198, 199, 200, 201, 204, 207, 211, 213, 222, 223,
            231, 233, 234, 236, 237, 239, 240, 251, 252, 257, 260, 261, 263,
            264, 266, 267, 269, 270, 278, 284, 293, 294, 295, 297, 299, 301,
            303, 305, 307],
           dtype='int64')

## Selected Columns

In [162]:
G_X_train = pd.DataFrame(G_X_train)
G_X_train_selected = G_X_train.iloc[:, etG_sigma_index]

In [163]:
G_X_test = pd.DataFrame(G_X_test)
G_X_test_selected = G_X_test.iloc[:, etG_sigma_index]

In [164]:
# Step 1 - TomekLinks
from collections import Counter
from imblearn.under_sampling import TomekLinks
print('Original dataset shape %s' % Counter(G_y_train))
tl = TomekLinks()
G_X_train_tl, G_y_train_tl = tl.fit_resample(G_X_train_selected, G_y_train)
print('Resampled dataset shape %s' % Counter(G_y_train_tl))

Original dataset shape Counter({1.0: 14041, 0.0: 453})
Resampled dataset shape Counter({1.0: 13956, 0.0: 453})


In [166]:
### base et 
et = ExtraTreesClassifier(random_state=19)
et.fit(G_X_train_tl, G_y_train_tl)
G_y_train_pred = et.predict_proba(G_X_train_tl)
fpr, tpr, thresholds = roc_curve(G_y_train_tl, G_y_train_pred[:, 1])
G_train_auroc = auc(fpr, tpr)
print('AUC of training set:', G_train_auroc)
G_y_test_pred = et.predict_proba(G_X_test_selected)
fpr, tpr, thresholds = roc_curve(G_y_test, G_y_test_pred[:, 1])
G_test_auroc = auc(fpr, tpr)
print('AUC of testing set:', G_test_auroc)

# confusion_matrix(G_y_test, G_y_predicted)

AUC of training set: 1.0
AUC of testing set: 0.7046588892306964


In [167]:
hyperparameters_et = {"n_estimators": (50, 100, 150),
                      "criterion": ("gini", "entropy"),
                      "max_depth": (25, 50, 100),
                      "min_samples_split": (25, 50, 100),
                      "max_features": ("auto", "sqrt", "log2"),
                      "bootstrap": (True, False),
                      "class_weight": ('balanced', {0: 1, 1: 25}, {0: 25, 1: 1}),
                      "max_samples": (100, 0.7),
                      "oob_score": (True, False)}

etG_gscv = GridSearchCV(estimator=ExtraTreesClassifier(random_state=19),
                       param_grid=hyperparameters_et,
                       n_jobs=-1,
                       scoring='roc_auc',
                       verbose=5,
                       cv=5)

etG_gsCV = etG_gscv.fit(G_X_train_tl, G_y_train_tl)
print('--> Tuned Parameters Best Score: ', etG_gsCV.best_score_)
print('--> Best Parameters: \n', etG_gsCV.best_params_)

Fitting 5 folds for each of 3888 candidates, totalling 19440 fits




--> Tuned Parameters Best Score:  0.7555806358239706
--> Best Parameters: 
 {'bootstrap': True, 'class_weight': {0: 1, 1: 25}, 'criterion': 'entropy', 'max_depth': 25, 'max_features': 'auto', 'max_samples': 0.7, 'min_samples_split': 100, 'n_estimators': 100, 'oob_score': True}


In [168]:
G_y_train_pred = etG_gsCV.predict_proba(G_X_train_tl)
fpr, tpr, thresholds = roc_curve(G_y_train_tl, G_y_train_pred[:, 1])
G_train_auroc = auc(fpr, tpr)
print('AUC of training set:', G_train_auroc)

G_y_test_pred = etG_gsCV.predict_proba(G_X_test_selected)
fpr, tpr, thresholds = roc_curve(G_y_test, G_y_test_pred[:, 1])
G_test_auroc = auc(fpr, tpr)
print('AUC of testing set:', G_test_auroc)

AUC of training set: 0.9257194006771203
AUC of testing set: 0.7324943798954069


In [172]:
etG_cccv = CalibratedClassifierCV(base_estimator=etG_gsCV.best_estimator_, cv=5)
etG_ccCV = etG_cccv.fit(G_X_train_tl, G_y_train_tl)

G_y_train_pred = etG_ccCV.predict_proba(G_X_train_tl)
fpr, tpr, thresholds = roc_curve(G_y_train_tl, G_y_train_pred[:, 1])
G_train_auroc = auc(fpr, tpr)
print('AUC of training set:', G_train_auroc)

G_y_test_pred = etG_ccCV.predict_proba(G_X_test_selected)
fpr, tpr, thresholds = roc_curve(G_y_test, G_y_test_pred[:, 1])
G_test_auroc = auc(fpr, tpr)
print('AUC of testing set:', G_test_auroc)

AUC of training set: 0.9088301486159276
AUC of testing set: 0.7055188243924371
