# 0 Initialise

### Import Packages

In [47]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from CardiomegalyBiomarkers.Cardiomegaly_Classification.src.xgboost_functions import SplitData, train_test_xgboost

### Paths

In [48]:
# Folder for the exported models and the training/validation error graphs.
# Keep the trailing slash: the model-loading example in this notebook appends
# the file name with plain string concatenation.
model_folder = '../Cardiomegaly_Classification/models/xgboost/'

# Pickle file holding the feature table that the notebook reads as its input
data_path = '../Cardiomegaly_Classification/MIMIC_features/MIMIC_features.pkl'

### Model Parameters

In [33]:
# ---------------------------------------------------------------------------
# Data parameters
# ---------------------------------------------------------------------------
# Splits of data for the training/validation set and the testing set
TrainTestFractions = [0.9, 0.1]
# Splits of the training/validation data into five equally sized folds
FoldFractions = [0.2] * 5

# ---------------------------------------------------------------------------
# XGBoost parameters
# ---------------------------------------------------------------------------
eval_metric = 'logloss'
lr = 0.1
gamma = 0
subsample = 0.75
colsample_bytree = 0.75
# NOTE(review): presumably derived from the class balance of the dataset --
# confirm the ratio (and its direction) against the actual label counts.
scale_pos_weight = 0.3/0.7

# Two tree depths are configured; both values are handed to the training helper
max_depth_shallow = 3
max_depth_deep = 8

# Early-stopping setting handed to the training helper
early_stopping = 15

In [35]:
# Feature selection for model.
# NOTE(review): every name below must match a column of the loaded feature table
# exactly. 'Bilirubin' and 'Fibrinogen' carry no '_mean' suffix and
# 'Urea_Nitrogren_mean' is spelled this way on purpose here -- verify against the
# data before "correcting" any of them.

# base features (identifiers, not model inputs in any combination below)
base = ['subject_id', 'hadm_id', 'stay_id', 'path']

# vital sign features (V), grouped by statistic; the order is significant and preserved
vital = [
    # means
    'HR_mean', 'ABPs_mean', 'ABPd_mean', 'ABPm_mean', 'NBPs_mean', 'NBPd_mean', 'NBPm_mean',
    'RR_mean', 'PO2_mean', 'SaO2_mean', 'PCO2_mean', 'PH_mean', 'SpO2_mean',
    'Cholesterol_mean', 'Sodium_mean', 'Temp(F)_mean', 'FiO2_mean', 'Albumin_mean',
    'PlateletCount_mean',
    # maxima
    'HR_max', 'ABPs_max', 'ABPd_max', 'ABPm_max', 'NBPs_max', 'NBPd_max', 'NBPm_max',
    'RR_max', 'PCO2_max', 'Sodium_max', 'FiO2_max',
    # minima
    'HR_min', 'ABPs_min', 'ABPd_min', 'ABPm_min', 'NBPs_min', 'NBPd_min', 'NBPm_min',
    'RR_min', 'PCO2_min', 'SpO2_min', 'Sodium_min',
]

# laboratory features (L), grouped by statistic; the order is significant and preserved
lab = [
    # means (plus the two unsuffixed columns)
    'Hemoglobin_mean', 'Lactate_mean', 'Tidal_Volume_mean', 'ALT_mean',
    'Alkaline_Phosphatase_mean', 'Bilirubin', 'Calcium_Total_mean', 'Chloride_mean',
    'Creatinine_mean', 'Glucose_mean', 'Magnesium_mean', 'Phosphate_mean', 'Potassium_mean',
    'Troponin-T_mean', 'Urea_Nitrogren_mean', 'Fibrinogen', 'Hematocrit_mean', 'PTT_mean',
    'WBC_mean',
    # maxima
    'Hemoglobin_max', 'Potassium_max', 'Troponin-T_max', 'WBC_max',
    # minima
    'Hemoglobin_min', 'Potassium_min', 'WBC_min',
]

# metadata features (M)
meta_cont = ['los', 'anchor_age']             # continuous
meta_cat = ['gender']                         # categorical (raw column)
meta_cat_encoded = ['gender_F', 'gender_M']   # encoded categorical columns

# biomarker features (BMRK)
bmrks = ['CTR', 'CPAR']

# Modality combinations tested: a list of [feature list, combination name] pairs
modalities_combinations = [
    [vital + lab + meta_cont + meta_cat_encoded, 'M_L_V'],
    [bmrks, 'BMRK'],
    [vital + lab + meta_cont + meta_cat_encoded + bmrks, 'M_L_V_BMRK'],
]

# 1 Data Preprocessing

In [6]:
# Read the pickled feature table and rename the label column so that it
# indicates the class declaration.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
data = pd.read_pickle(data_path).rename(columns={'Cardiomegaly': 'class'})

# Split off the held-out test set from the training/validation data.
# NOTE(review): whether SplitData is seeded is not visible from this notebook --
# confirm that the splits are reproducible across runs.
TrainVal_df, Test_df = SplitData(data, TrainTestFractions)

# Split the training/validation data into the five cross-validation folds
Fold1_df, Fold2_df, Fold3_df, Fold4_df, Fold5_df = SplitData(TrainVal_df, FoldFractions)

In [7]:
def encode_for_xgboost(df, feature_cols, categorical_cols, encoded_cols, label_col='class'):
    """Select the model columns of ``df`` and one-hot encode its categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One fold (or the test set) as produced by the data split.
    feature_cols : list of str
        Columns copied through unchanged (identifiers and continuous features).
    categorical_cols : list of str
        Raw categorical columns to one-hot encode with ``pd.get_dummies``.
    encoded_cols : list of str
        Encoded column names the models expect. Any of them that the encoding of
        this particular frame did not produce (a category absent from the frame)
        is added as an all-zero column, so every frame exposes the same features.
    label_col : str, default 'class'
        Label column, appended last.

    Returns
    -------
    pandas.DataFrame
        ``feature_cols``, then the encoded columns, then ``label_col``.
        A frame that was already encoded is returned unchanged, which makes
        re-running the cell harmless.
    """
    present = set(df.columns)
    if set(categorical_cols).isdisjoint(present) and set(encoded_cols).issubset(present):
        # Raw categorical columns are gone and the encoded ones exist: already processed.
        return df

    encoded = pd.get_dummies(df[categorical_cols])
    for col in encoded_cols:
        if col not in encoded.columns:
            # Category not observed in this frame; keep the feature set consistent.
            encoded[col] = 0

    return pd.concat([df[feature_cols], encoded, df[label_col]], axis=1)


# Preprocess the folds and the test set for xgboost use by selecting features and
# encoding categorical features. All frames are processed before any name is
# rebound, so a failure part-way through leaves every frame untouched.
Fold1_df, Fold2_df, Fold3_df, Fold4_df, Fold5_df, Test_df = [
    encode_for_xgboost(frame, base + vital + lab + meta_cont + bmrks, meta_cat, meta_cat_encoded)
    for frame in (Fold1_df, Fold2_df, Fold3_df, Fold4_df, Fold5_df, Test_df)
]

# 2 Model Training and Testing

In [None]:
# Collect the model parameters into the dict expected by train_test_xgboost
model_params = dict(
    eval_metric=eval_metric,
    scale_pos_weight=scale_pos_weight,
    colsample_bytree=colsample_bytree,
    gamma=gamma,
    lr=lr,
    subsample=subsample,
    max_depth_shallow=max_depth_shallow,
    max_depth_deep=max_depth_deep,
    early_stopping=early_stopping,
)

# k-fold cross validation: each row is
#   (fold number, training folds, validation fold, save figures and models?)
# Per the original author's note, lossFigure / exportModels make the helper save the
# training/validation loss figures and the trained models; they are written to
# model_folder, which is passed to every call. Only fold 1 has them switched on.
fold_plan = [
    (1, [Fold2_df, Fold3_df, Fold4_df, Fold5_df], Fold1_df, True),
    (2, [Fold1_df, Fold3_df, Fold4_df, Fold5_df], Fold2_df, False),
    (3, [Fold2_df, Fold1_df, Fold4_df, Fold5_df], Fold3_df, False),
    (4, [Fold2_df, Fold3_df, Fold1_df, Fold5_df], Fold4_df, False),
    (5, [Fold2_df, Fold3_df, Fold4_df, Fold1_df], Fold5_df, False),
]

# Get models, train and test; one results table is returned per validation fold
results_fold1, results_fold2, results_fold3, results_fold4, results_fold5 = [
    train_test_xgboost(train_folds, val_fold, fold_no, Test_df, modalities_combinations,
                       model_params, model_folder,
                       lossFigure=save_outputs, exportModels=save_outputs)
    for fold_no, train_folds, val_fold, save_outputs in fold_plan
]


In [None]:
# These lines are included to show how to load an exported xgboost model for
# predictions on new data. They are kept as comments rather than a bare string
# literal so that the cell does not echo the snippet as its output, and they
# include the import the snippet needs.
#
#   from xgboost import XGBClassifier
#
#   model_load = XGBClassifier()
#   model_load.load_model(model_folder + 'BMRK_fold1_model.json')

# 3 Results Analysis

In [None]:
# Display the results table of every validation fold, in fold order
per_fold_results = [results_fold1, results_fold2, results_fold3, results_fold4, results_fold5]

for fold_number, fold_results in enumerate(per_fold_results, start=1):
    print(f'#######   VAL FOLD: fold {fold_number}   ########')
    display(fold_results)

In [None]:
# Average performance scores over 5 folds, split per feature combination used
results = pd.concat([results_fold1, results_fold2, results_fold3, results_fold4, results_fold5])

metric_columns = ['Accuracy', 'ROC AUC', 'F1 score']

# Index.unique() keeps the order of first appearance. A set of strings has no stable
# order across kernel sessions, which made the display order change from run to run.
modalities = results.index.unique().tolist()

for modality in modalities:
    # List-style .loc always yields a DataFrame, even if a modality has a single row.
    # Assumes the metric columns are numeric, so describe() reports mean and std.
    results_summary = results.loc[[modality], metric_columns].describe()
    print(f'#######   AVG OVER 5 FOLDS: {modality}    ########')
    # Select the rows by label instead of by position (previously iloc[1:3])
    display(results_summary.loc[['mean', 'std']])