In [1]:
import xgboost
import shap
import matplotlib.pylab as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
from sklearn.metrics import roc_auc_score, average_precision_score
import argparse
from sklearn.model_selection import GridSearchCV
import pickle
from tqdm import tqdm

In [2]:
def label(permth, mortstat, month):
    if permth > month:
        return 0
    else:
        if mortstat == 1:
            return 1
        else:
            return 2

In [3]:
def bootstrap_ci(y_pre, y_label, sample_size, repetitions = 1000, alpha = 0.05): 
    y_pre = np.array(y_pre)
    y_label = np.array(y_label)
    
    auc = []
    ap = []
    for i in range(repetitions):
        np.random.seed(i)
        idx = list(np.random.choice(len(y_pre), replace = True, size = sample_size))
        y_pre_bootstrap = y_pre[idx]
        y_label_bootstrap = y_label[idx]
        auc.append(roc_auc_score(y_label_bootstrap, y_pre_bootstrap))
        ap.append(average_precision_score(y_label_bootstrap, y_pre_bootstrap))
    # confidence interval
    left_auc = np.percentile(auc, alpha/2*100)
    right_auc = np.percentile(auc, 100-alpha/2*100)
    left_ap = np.percentile(ap, alpha/2*100)
    right_ap = np.percentile(ap, 100-alpha/2*100)
    # point estimate
    print('average AUROC', np.mean(auc))
    print((1-alpha)*100,'%','confidence interval for the AUROC:', (round(left_auc,4), round(right_auc,4)))
    print('average AP', np.mean(ap))
    print((1-alpha)*100,'%','confidence interval for the AP:', (round(left_ap,4), round(right_ap,4)))
    return auc, left_auc, right_auc, ap, left_ap, right_ap

In [4]:
year_num = 5
path = './model/mortality_risk_scores_feature_elimination/'
if not os.path.isdir(path):
    os.mkdir(path)

In [11]:
X = pd.read_csv('./data/NHANES/NHANES.csv')

if str(year_num)+'_year_label' not in X.columns:
    X[str(year_num)+'_year_label'] = X.apply(lambda x: label(x['permth_int'], x['mortstat'], 12*int(year_num)), axis=1)
    
X = X[X[str(year_num)+'_year_label']!=2]
y = X[str(year_num)+'_year_label']

if int(year_num) not in [1,2,3,4,5]:
    X = X.drop([str(year_num)+'_year_label'], axis=1)

mortstat = X['mortstat']
permth_int = X['permth_int']
drop_list = ["mortstat", "permth_int", '1_year_label', '2_year_label', '3_year_label', '4_year_label', '5_year_label']
X = X.drop(drop_list, axis=1)
X = X.drop(['Demographics_ReleaseCycle'], axis=1)
fea_list = pd.read_csv('./data/NHANES/NHANES_feature_list.csv')
nominal_fea = fea_list[fea_list['Nominal']==1]['Type_Short_Name'].tolist()
nominal_fea = list(set(nominal_fea) & set(X.columns))
X = pd.get_dummies(X, columns=nominal_fea, drop_first=True)
print(X.columns)
print('After encoding', X.shape)

Index(['Questionnaire_SelfReportedWeight10YrAgo',
       'Laboratory_WhiteBloodCellCount', 'Laboratory_Cotinine',
       'Laboratory_Sodium', 'Examination_BPDiastolic3',
       'Laboratory_MeanCellVolume', 'Questionnaire_SelfReportedGreatestWeight',
       'Laboratory_CholesterolSI', 'Examination_ExBPMaxInflationLevel',
       'Laboratory_Monocyte',
       ...
       'Examination_BPReading3_2.0', 'Demographics_RaceEthnicity_2.0',
       'Demographics_RaceEthnicity_3.0', 'Demographics_RaceEthnicity_4.0',
       'Demographics_RaceEthnicity_5.0', 'Demographics_Gender_2.0',
       'Questionnaire_MentalHealthProfessional_2.0',
       'Questionnaire_DoctorCongestiveHeartFailure_2.0',
       'Questionnaire_HighBloodPressure_2.0',
       'Questionnaire_LabDietarySupplement_2.0'],
      dtype='object', length=151)
After encoding (35854, 151)


In [12]:
print(X.columns)
print(X.shape)
print('# samples: ', X.shape[0])
print('# positive samples: ', sum(y==1))
print('# negative samples: ', sum(y==0))
print('# features: ', X.shape[1])  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=7)

y_train = np.array(y_train); y_test = np.array(y_test); y_val = np.array(y_val)

Index(['Questionnaire_SelfReportedWeight10YrAgo',
       'Laboratory_WhiteBloodCellCount', 'Laboratory_Cotinine',
       'Laboratory_Sodium', 'Examination_BPDiastolic3',
       'Laboratory_MeanCellVolume', 'Questionnaire_SelfReportedGreatestWeight',
       'Laboratory_CholesterolSI', 'Examination_ExBPMaxInflationLevel',
       'Laboratory_Monocyte',
       ...
       'Examination_BPReading3_2.0', 'Demographics_RaceEthnicity_2.0',
       'Demographics_RaceEthnicity_3.0', 'Demographics_RaceEthnicity_4.0',
       'Demographics_RaceEthnicity_5.0', 'Demographics_Gender_2.0',
       'Questionnaire_MentalHealthProfessional_2.0',
       'Questionnaire_DoctorCongestiveHeartFailure_2.0',
       'Questionnaire_HighBloodPressure_2.0',
       'Questionnaire_LabDietarySupplement_2.0'],
      dtype='object', length=151)
(35854, 151)
# samples:  35854
# positive samples:  3074
# negative samples:  32780
# features:  151


In [13]:
display_name = pd.read_csv('./data/NHANES/NHANES_feature_list_Display_name.csv')
display_col=[]
for col in X.columns:
    display_col.append(list(display_name.loc[display_name['Type_Short_Name']==col, 'Display_Name'])[0])
col_dict = dict(zip(X.columns, display_col))

In [14]:
feature_num_list = [X_train.shape[1], 145, 140, 135, 130, 125, 120, 115, 110, 105, 100, 95, 90, 85, 80, 75, 70, 65, 60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10,9,8,7,6,5,4,3,2,1]
X_train_all = X_train.copy()
X_test_all = X_test.copy()
X_val_all = X_val.copy()

features_ranking_dict = {}
auc_dict = {}
ap_dict = {}
left_auc_dict = {}
right_auc_dict = {}
left_ap_dict = {}
right_ap_dict = {}
ranked_features = X_train.columns

In [None]:
features_input = {}
ranked_features = X_train_all.columns
for feature_num in feature_num_list:
    print('# features: ', feature_num)
    X_train = X_train_all.loc[:, ranked_features[:feature_num]]
    X_test = X_test_all.loc[:, ranked_features[:feature_num]]
    X_val = X_val_all.loc[:, ranked_features[:feature_num]]
    features_input[feature_num] = ranked_features[:feature_num]
    if feature_num > 2:
        xlf = xgboost.XGBClassifier(n_estimators=1000, max_depth=4, subsample=0.5, min_child_weight=3, objective='binary:logistic', random_state=7)
        xlf.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    else:
        xlf = xgboost.XGBClassifier(n_estimators=1000, subsample=0.5, objective='binary:logistic', random_state=7)
        xlf.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    model_train = xlf
    pickle.dump(model_train, open(path+"model_"+str(feature_num)+".pickle.dat", "wb"))
    y_pre = model_train.predict_proba(X_test)[:, 1]
    auc, left_auc, right_auc, ap, left_ap, right_ap = bootstrap_ci(y_pre, y_test, len(y_test), repetitions = 1000, alpha = 0.05)
    auc_dict[feature_num] = auc
    ap_dict[feature_num] = ap
    left_auc_dict[feature_num] = left_auc
    right_auc_dict[feature_num] = right_auc
    left_ap_dict[feature_num] = left_ap
    right_ap_dict[feature_num] = right_ap    
    if len(X_train)>=5000:
        back_data = X_train.sample(n=5000, random_state=428)
    else:
        back_data = X_train
    if len(X_test)>=2000:
        fore_data = X_test.sample(n=2000, random_state=528)
        fore_data_label = pd.DataFrame(y_test).sample(n=2000, random_state=528)
    else:
        fore_data = X_test
        fore_data_label = pd.DataFrame(y_test)

    explainer = shap.TreeExplainer(model_train, data=back_data)
    shap_values = explainer.shap_values(fore_data, check_additivity=False)
    ranked_features = X_train.columns[np.argsort(-np.sum(np.abs(shap_values), axis=0))]
    features_ranking_dict[feature_num] = ranked_features

In [None]:
pickle.dump(auc_dict, open(path+"auc_dict.pickle.dat", "wb"))
pickle.dump(ap_dict, open(path+"ap_dict.pickle.dat", "wb"))
pickle.dump(left_auc_dict, open(path+"left_auc_dict.pickle.dat", "wb"))
pickle.dump(right_auc_dict, open(path+"right_auc_dict.pickle.dat", "wb"))
pickle.dump(left_ap_dict, open(path+"left_ap_dict.pickle.dat", "wb"))
pickle.dump(right_ap_dict, open(path+"right_ap_dict.pickle.dat", "wb"))
pickle.dump(features_ranking_dict, open(path+"features_ranking_dict.pickle.dat", "wb"))
pickle.dump(features_input, open(path+"features_input.pickle.dat", "wb"))