# Training for SepsisML

1. Preprocess
    -  Resample sparse matrix (longitdudinal data) to hourly longitudinal data

In [1]:
# import all libraries

import sepsis_ml as ml
import numpy as np
import pandas as pd
import shap
import os
import shutil
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, accuracy_score
import xgboost as xgb
from hyperopt import STATUS_OK, hp, fmin, tpe
from tqdm import tqdm

In [2]:
# will not print the following columns to this notebook due to PHI
hide = ["csn", "pat_id", "age", "is_Female", "is_asian", "is_white", "is_black", "is_other", "hospital_discharge_date_time", "hospital_admission_date_time"]

## 1. Preprocess

### 1.0 Read Data

In [13]:
merged = pd.read_csv("longitudinal_2021.csv")
stat = pd.read_csv("stat_2021.csv")
merged.loc[:, ~merged.columns.isin(hide)].head(5)

Unnamed: 0,recorded_time,HR,O2Sat,Temp,SBP,DBP,MAP,Resp,EtCO2,o2_flow_rate,...,RBC,RDW-CV,RDW-SD,SaO2,Sodium,TroponinI,WBC,pH,urine_output,gcs_total_score
0,2021-03-08 16:11:00,84.0,,36.8,175.0,,,21.0,,,...,,,,,,,,,,
1,2021-03-08 17:38:00,91.0,99.0,,146.0,,,16.0,,,...,,,,,,,,,,
2,2021-03-08 19:52:00,84.0,,36.7,134.0,,,18.0,,,...,,,,,,,,,,
3,2021-08-03 09:34:00,76.0,,36.7,134.0,,,18.0,,,...,,,,,,,,,,
4,2021-11-29 16:02:00,87.0,96.0,36.8,185.0,,,18.0,,,...,,,,,,,,,,


In [17]:
merged = pd.merge(merged, stat[["csn", "pat_id", "hospital_admission_date_time"]], on = ["csn", "pat_id"], how = "left")

In [18]:
merged["hospital_admission_date_time"] = pd.to_datetime(merged["hospital_admission_date_time"])
merged["recorded_time"] = pd.to_datetime(merged["recorded_time"])

In [19]:
merged.dtypes

pat_id                                 float64
csn                                      int64
recorded_time                   datetime64[ns]
HR                                     float64
O2Sat                                  float64
Temp                                   float64
SBP                                    float64
DBP                                    float64
MAP                                    float64
Resp                                   float64
EtCO2                                  float64
AST                                    float64
Alkalinephos                           float64
BUN                                    float64
BaseExcess                             float64
Bilirubin_total                        float64
Calcium                                float64
Chloride                               float64
Creatinine                             float64
FiO2                                   float64
Glucose                                float64
HCO3         

### 1.1 Clean Data
- Skip due to data extracted from MODS folder

### 1.2 Pivot Columns
- Skip due to data extracted from MODS folder

### 1.3 Resample sparse matrix to hourly longitudinal data

In [None]:
df = ml.resampling(merged)

In [4]:
df.loc[:, ~df.columns.isin(hide)].head(5)

Unnamed: 0,rel_time,HR,O2Sat,Temp,SBP,DBP,MAP,Resp,EtCO2,o2_flow_rate,...,RBC,RDW-CV,RDW-SD,SaO2,Sodium,TroponinI,WBC,pH,urine_output,gcs_total_score
0,1.0,85.0,98.0,36.3,170.0,,,16.0,,,...,,,,,,,,,,
1,2.0,57.0,99.0,,137.0,,,16.0,,,...,,,,,,,,,,
2,3.0,,,,,,,,,,...,,,,,,,,,,
3,4.0,62.0,98.0,,136.0,,99.5,,36.0,,...,,,,,,,,,,
4,5.0,59.0,99.0,,143.0,,101.0,,36.0,,...,,,,,,,,,,


#### (only for Training Dataset) Filter 1-50 hour data

In [5]:
k = df.copy()
k = k[k.rel_time >= 1]
k = k[k.rel_time <= 50]

### 1.4 Aggregate data into 3 hour bins with 6 hour sliding window (median)

In [6]:
variables = list(k.columns)[3:]
print(variables)

['HR', 'O2Sat', 'Temp', 'SBP', 'DBP', 'MAP', 'Resp', 'EtCO2', 'o2_flow_rate', 'height_cm', 'daily_weight_kg', 'o2_supp', 'AST', 'Albumin', 'Alkalinephos', 'Anion_Gap', 'BUN', 'BaseExcess', 'Bilirubin_direct', 'Bilirubin_total', 'Calcium', 'Chloride', 'Creatinine', 'FiO2', 'Fibrinogen', 'Glucose', 'HCO3', 'Hct', 'Hgb', 'INR', 'Lactate', 'MCH', 'MCHC', 'MPV', 'Magnesium', 'PT', 'PTT', 'PaCO2', 'PaO2', 'Phosphate', 'Phosphorus', 'Platelets', 'Potassium', 'Protein', 'RBC', 'RDW-CV', 'RDW-SD', 'SaO2', 'Sodium', 'TroponinI', 'WBC', 'pH', 'urine_output', 'gcs_total_score']


In [None]:
new = k.groupby(["pat_id", "csn"]).apply(lambda v: rolling_overlap(v, 6, variables, 3))
final_df = new.drop(["pat_id", "csn"], axis = 1).reset_index(drop = False).rename(columns = {"level_2" : "LOS"})

### 1.5 Merge Static data

In [None]:
# merge statistical features
final_df = final_df.merge(stat, on = ["pat_id", "csn"], how = "left")

#### (optional) merge vent features

In [None]:
# final_df = final_df.merge(new_vent, on = ["pat_id", "csn"], how = "left")

In [3]:
final_df = pd.read_csv("../Emory_NYU_Sync/final_3hr_sampled.csv")


Columns (63) have mixed types.Specify dtype option on import or set low_memory=False.


In [4]:
final_df.loc[:, ~final_df.columns.isin(hide)].head(5)

Unnamed: 0,LOS,rel_time,HR,O2Sat,Temp,SBP,DBP,MAP,Resp,EtCO2,...,WBC,pH,urine_output,gcs_total_score,In_hospital_death,year,t_sepsis3,is_female,vent_start_time,vent_stop_time
0,0,1.0,85.0,98.0,36.3,170.0,,,16.0,,...,,,,,False,2021.0,,0,,
1,3,4.0,62.0,98.0,36.3,137.0,,99.5,16.0,36.0,...,,,,,False,2021.0,,0,,
2,0,1.0,55.0,92.5,,133.0,,104.0,19.0,,...,,,,,False,2021.0,,0,,
3,3,4.0,51.5,96.5,,117.5,,87.0,17.75,,...,,,,15.0,False,2021.0,,0,,
4,6,7.0,52.0,96.0,36.85,117.0,,85.0,17.0,,...,,,,15.0,False,2021.0,,0,,


#### (only for Training Dataset) Create Sepsis Label & Shifted Sepsis Label (marks sepsis 3 hours earlier)

In [5]:
final_df["hospital_admission_date_time"] = pd.to_datetime(final_df["hospital_admission_date_time"])
final_df["abs_time"] = final_df["hospital_admission_date_time"] + pd.to_timedelta(final_df['rel_time'], unit='h')
final_df["t_sepsis3"] = pd.to_datetime(final_df["t_sepsis3"])
final_df["SepsisLabel"] = (final_df["abs_time"] >= final_df["t_sepsis3"])
final_df["shifted_t_sepsis3"] = final_df["t_sepsis3"] - pd.to_timedelta(6, unit='h')
final_df["ShiftedSepsisLabel"] = (final_df["abs_time"] >= final_df["shifted_t_sepsis3"])

#### (optional) extract MV indicators

In [None]:

# final_df["MV"] = ((final_df["abs_time"] >= final_df["vent_start_time"]) & (final_df["abs_time"] <= (final_df["vent_stop_time"])+pd.to_timedelta(3, unit='h')))
# final_df["MV_min"] = ((final_df["abs_time"] - final_df["vent_start_time"]).dt.total_seconds() / 60)  * final_df["MV"]
# final_df["MV_sub"] = ((final_df["abs_time"] - final_df["vent_stop_time"]).dt.total_seconds() / 60)  * final_df["MV"]
# final_df.loc[final_df["MV_sub"] < 0, "MV_sub"] = 0
# final_df["minute_ventilation"] = final_df["MV_min"] - final_df["MV_sub"]

# final_df = final_df.drop(["MV_min", "MV_sub"], axis = 1)

#### (only for Training Dataset) Select features

In [6]:
sep_index = ['AST', 'Alkalinephos', 'BUN', \
             'BaseExcess',  'Bilirubin_total', 'Calcium',\
             'Chloride', 'Creatinine', 'FiO2', \
             'Glucose', 'HCO3', 'Hct', \
             'Hgb', 'Lactate', 'Magnesium', \
             'PTT', 'PaCO2', 'PaO2', 'Phosphate',\
             'Platelets','Potassium', 'SaO2',\
             'Sodium','WBC', 'pH']

con_index = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2']
dem_index = ['age', 'is_female', 'gcs_total_score']

final_df = final_df.loc[:,final_df.columns.isin(['csn', 'pat_id', 'recorded_time', 'LOS', 'ShiftedSepsisLabel', 'SepsisLabel', 'rel_time'] + sep_index + con_index + dem_index)]

#### (only for Training Dataset) Define and match 10:1 non-sepsis to sepsis patient ratio

In [7]:
# sepsis cases
sepsis_csn_f = final_df[final_df.SepsisLabel == 1].csn.unique()
nosepsis_csn_f = final_df[final_df.SepsisLabel == 0].csn.unique()
nosepsis_csn_f = set(nosepsis_csn_f) - set(sepsis_csn_f)

np.save("sepsis_csn_final.npy", sepsis_csn_f)
np.save("nosepsis_csn_final.npy", nosepsis_csn_f)

np.random.seed(20123)
nosepsis_random_48 = np.random.choice(list(nosepsis_csn_f),(len(sepsis_csn_f)*10), replace = False)

np.save("nosepsis_csn_final_sampled.npy", nosepsis_random_48)

total_csn = list(nosepsis_random_48) + list(sepsis_csn_f)


In [8]:
print("number of sepsis patients from 2021: ", len(sepsis_csn_f))
print("number of non-sepsis patients from 2021: ",len(nosepsis_random_48))
print("total number of encounters in 2021 cohort: ", len(total_csn))

number of sepsis patients from 2021:  1554
number of non-sepsis patients from 2021:  15540
total number of encounters in 2021 cohort:  17094


In [9]:
df = final_df[final_df.csn.isin(total_csn)].copy()
df = df[df.LOS <= 48]

In [10]:
list(df.columns)

['pat_id',
 'csn',
 'LOS',
 'rel_time',
 'HR',
 'O2Sat',
 'Temp',
 'SBP',
 'DBP',
 'MAP',
 'Resp',
 'EtCO2',
 'AST',
 'Alkalinephos',
 'BUN',
 'BaseExcess',
 'Bilirubin_total',
 'Calcium',
 'Chloride',
 'Creatinine',
 'FiO2',
 'Glucose',
 'HCO3',
 'Hct',
 'Hgb',
 'Lactate',
 'Magnesium',
 'PTT',
 'PaCO2',
 'PaO2',
 'Phosphate',
 'Platelets',
 'Potassium',
 'SaO2',
 'Sodium',
 'WBC',
 'pH',
 'gcs_total_score',
 'age',
 'is_female',
 'SepsisLabel',
 'ShiftedSepsisLabel']

### 1.6 Feature extraction/missing value imputation

In [11]:
processed_df = ml.preprocess(df.iloc[:100], vm = False)

  0%|          | 0/11 [00:00<?, ?it/s]

Extracting informative features


100%|██████████| 11/11 [00:06<00:00,  1.61it/s]


Completed Extracting informative features
Extracting Rolling features
Completed Extracting Rolling features
Extracting Score Features
Completed Extracting Score Features
Preprocessing completed with total of 180 features


In [12]:
processed_df.loc[:, ~processed_df.columns.isin(hide)].head(5)

Unnamed: 0,LOS,rel_time,HR,O2Sat,Temp,SBP,DBP,MAP,Resp,EtCO2,...,Resp_dstd,HR_score,Temp_score,Resp_score,MAP_score,Creatinine_score,qsofa,Platelets_score,Bilirubin_score,SIRS
0,0,1.0,,,,,,,,,...,,,,,,,,,,False
1,3,4.0,63.0,97.5,36.8,139.5,,,18.25,,...,,0.0,1.0,0.0,,,0.0,,,False
2,6,7.0,63.0,97.5,36.8,139.5,,,18.25,,...,1.414214,0.0,1.0,0.0,,,0.0,,,False
3,9,10.0,53.75,99.0,36.7,128.25,,92.75,16.25,,...,1.154701,0.0,1.0,0.0,0.0,,0.0,,,False
4,12,13.0,53.75,99.0,36.7,128.25,,92.75,16.25,,...,1.532631,0.0,1.0,0.0,0.0,,0.0,,,False


In [62]:
list(processed_df.columns)

['pat_id',
 'csn',
 'LOS',
 'rel_time',
 'HR',
 'O2Sat',
 'Temp',
 'SBP',
 'DBP',
 'MAP',
 'Resp',
 'EtCO2',
 'AST',
 'Alkalinephos',
 'BUN',
 'BaseExcess',
 'Bilirubin_total',
 'Calcium',
 'Chloride',
 'Creatinine',
 'FiO2',
 'Glucose',
 'HCO3',
 'Hct',
 'Hgb',
 'Lactate',
 'Magnesium',
 'PTT',
 'PaCO2',
 'PaO2',
 'Phosphate',
 'Platelets',
 'Potassium',
 'SaO2',
 'Sodium',
 'WBC',
 'pH',
 'gcs_total_score',
 'age',
 'is_female',
 'SepsisLabel',
 'ShiftedSepsisLabel',
 'HR_interval_f1',
 'HR_interval_f2',
 'HR_diff',
 'O2Sat_interval_f1',
 'O2Sat_interval_f2',
 'O2Sat_diff',
 'Temp_interval_f1',
 'Temp_interval_f2',
 'Temp_diff',
 'SBP_interval_f1',
 'SBP_interval_f2',
 'SBP_diff',
 'MAP_interval_f1',
 'MAP_interval_f2',
 'MAP_diff',
 'DBP_interval_f1',
 'DBP_interval_f2',
 'DBP_diff',
 'Resp_interval_f1',
 'Resp_interval_f2',
 'Resp_diff',
 'EtCO2_interval_f1',
 'EtCO2_interval_f2',
 'EtCO2_diff',
 'AST_interval_f1',
 'AST_interval_f2',
 'AST_diff',
 'Alkalinephos_interval_f1',
 'Alk

In [13]:
print(len(processed_df.columns))

180


## 2. Train Model

In [14]:
id_sepsis = np.load("sepsis_csn_final.npy")
id_nosepsis = np.load("nosepsis_csn_final_sampled.npy")

print("Number of sepsis patients: {}".format(len(id_sepsis)))
print("Number of non-sepsis patients: {}".format(len(id_nosepsis)))

Number of sepsis patients: 1554
Number of non-sepsis patients: 15540


In [15]:
save_model_dir = './xgb_model_/'
os.mkdir(save_model_dir)

FileExistsError: [Errno 17] File exists: './xgb_model_/'

In [16]:
from sklearn.model_selection import train_test_split, KFold

train_nosepsis, test_nosepsis = train_test_split(id_nosepsis, test_size=0.15, random_state=12306)
train_sepsis, test_sepsis = train_test_split(id_sepsis, test_size=0.15, random_state=12306)

test_set = np.append(test_nosepsis, test_sepsis)

#train_nosepsis = np.load("./real_time_data/train_nosepsis_0712.npy")
#train_sepsis = np.load("./real_time_data/train_sepsis_0712.npy")
#train_set = np.load("./real_time_data//test_set.npy")

np.save("train_nosepsis_0712.npy", train_nosepsis)
np.save("train_sepsis_0712.npy", train_sepsis)
np.save("test_set_0712.npy", test_set)

kfold = KFold(n_splits=5, shuffle=True, random_state=np.random.seed(12306))


## Training

In [17]:
def downsample(x):
    
    pos = x[x["SepsisLabel"] == 1]
    neg = x[x["SepsisLabel"] == 0]
    
    if len(pos) < len(neg):
        neg = neg.sample(n=len(pos), replace = False, random_state = 10002)
        
    new = pos.append(neg)
    new = new.sample(frac = 1, replace = False)
    
    return new

In [18]:
def BO_TPE(X_train, y_train, X_val, y_val):
    "Hyperparameter optimization"
    train = xgb.DMatrix(X_train, label=y_train)
    val = xgb.DMatrix(X_val, label=y_val)
    X_val_D = xgb.DMatrix(X_val)

    def objective(params):
        xgb_model = xgb.train(params, dtrain=train, num_boost_round=1000, evals=[(val, 'eval')],
                              verbose_eval=False, early_stopping_rounds=80)
        y_vd_pred = xgb_model.predict(X_val_D, ntree_limit=xgb_model.best_ntree_limit)
        y_val_class = [0 if i <= 0.5 else 1 for i in y_vd_pred]

        acc = accuracy_score(y_val, y_val_class)
        loss = 1 - acc

        return {'loss': loss, 'params': params, 'status': STATUS_OK}

    max_depths = [3, 4]
    learning_rates = [0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.15, 0.2]
    subsamples = [0.5, 0.6, 0.7, 0.8, 0.9]
    colsample_bytrees = [0.5, 0.6, 0.7, 0.8, 0.9]
    reg_alphas = [0.0, 0.005, 0.01, 0.05, 0.1]
    reg_lambdas = [0.8, 1, 1.5, 2, 4]

    space = {
        'max_depth': hp.choice('max_depth', max_depths),
        'learning_rate': hp.choice('learning_rate', learning_rates),
        'subsample': hp.choice('subsample', subsamples),
        'colsample_bytree': hp.choice('colsample_bytree', colsample_bytrees),
        'reg_alpha': hp.choice('reg_alpha', reg_alphas),
        'reg_lambda': hp.choice('reg_lambda', reg_lambdas),
    }

    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20)

    best_param = {'max_depth': max_depths[(best['max_depth'])],
                  'learning_rate': learning_rates[(best['learning_rate'])],
                  'subsample': subsamples[(best['subsample'])],
                  'colsample_bytree': colsample_bytrees[(best['colsample_bytree'])],
                  'reg_alpha': reg_alphas[(best['reg_alpha'])],
                  'reg_lambda': reg_lambdas[(best['reg_lambda'])]
                  }

    return best_param

def train_model(k, X_train, y_train, X_val, y_val, save_model_dir):
  
    print('*************************************************************')
    print('{}th training ..............'.format(k + 1))
    print('Hyperparameters optimization')
    best_param = BO_TPE(X_train, y_train, X_val, y_val)
    print("obtained best_param")
    xgb_model = xgb.XGBClassifier(max_depth = best_param['max_depth'],
                                  eta = best_param['learning_rate'],
                                  n_estimators = 1000,
                                  subsample = best_param['subsample'],
                                  colsample_bytree = best_param['colsample_bytree'],
                                  reg_alpha = best_param['reg_alpha'],
                                  reg_lambda = best_param['reg_lambda'],
                                  objective = "binary:logistic"
                                  )

    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='error',
                  early_stopping_rounds=80, verbose=False)

    y_tr_pred = (xgb_model.predict_proba(X_train, ntree_limit=xgb_model.best_ntree_limit))[:, 1]
    
    train_auc = roc_auc_score(y_train, y_tr_pred)
    print('training dataset AUC: ' + str(train_auc))
    y_tr_class = [0 if i <= 0.5 else 1 for i in y_tr_pred]
    acc = accuracy_score(y_train, y_tr_class)
    print('training dataset acc: ' + str(acc))

    y_vd_pred = (xgb_model.predict_proba(X_val, ntree_limit=xgb_model.best_ntree_limit))[:, 1]

    valid_auc = roc_auc_score(y_val, y_vd_pred)
    print('validation dataset AUC: ' + str(valid_auc))
    y_val_class = [0 if i <= 0.5 else 1 for i in y_vd_pred]
    acc = accuracy_score(y_val, y_val_class)
    print('validation dataset acc: ' + str(acc))
    print('************************************************************')
    # save the model
    
    np.save("y_train" + str(k)+".npy", y_train)
    np.save("y_train_pred" + str(k)+".npy", y_tr_pred)
    np.save("y_val" + str(k)+".npy", y_val)
    np.save("y_val_pred" + str(k)+".npy", y_vd_pred)
    
    save_model_path = save_model_dir + 'model{}.mdl'.format(k + 1)
    xgb_model.get_booster().save_model(fname=save_model_path)


In [19]:
train_sets = []
val_sets = []
ks = []
for (k, (train0_index, val0_index)), (k, (train1_index, val1_index)) in zip(enumerate(kfold.split(train_nosepsis)), enumerate(kfold.split(train_sepsis))):
    train_sets.append(np.append(train_nosepsis[train0_index], train_sepsis[train1_index]))
    val_sets.append(np.append(train_nosepsis[val0_index], train_sepsis[val1_index]))
    ks.append(k)

In [33]:
drop_features = ['pat_id', 'csn', 'LOS', 'rel_time', 'SepsisLabel', 'ShiftedSepsisLabel']
for k in ks:
    print(k)
    train_set = train_sets[k]
    train_set = processed_df.csn.unique()
    case = processed_df[processed_df.csn.isin(train_set)].copy().reset_index(drop= True)
    #case = case.groupby(by = ["Unit1", "Unit2"]).apply(lambda x: downsample(x)).reset_index(drop = True)
    case = downsample(case)
 
    x_train = case.drop(drop_features, axis = 1).values
    y_train = case["ShiftedSepsisLabel"].values

    print(np.shape(x_train))
    print(np.shape(y_train))
    print(sum(y_train))
    
    
    val_set = val_sets[k]
    case = processed_df[processed_df["csn"].isin(val_set)].reset_index(drop= True)
    #case = case.groupby(by = ["Unit1", "Unit2"]).apply(lambda x: downsample(x)).reset_index(drop = True)
    case = downsample(case)
    
    x_val = case.drop(drop_features, axis = 1).values
    y_val = case["ShiftedSepsisLabel"].values

    train_model(k, x_train, y_train, x_val, y_val, save_model_dir = save_model_dir)

    

0
(0, 174)
(0,)
0
*************************************************************
1th training ..............
Hyperparameters optimization
  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

job exception: [17:14:27] /workspace/src/objective/regression_obj.cu:64: Check failed: info.labels_.Size() != 0U (0 vs. 0) : label set cannot be empty
Stack trace:
  [bt] (0) /opt/rh/rh-python36/root/usr/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x24) [0x7f56de3eecb4]
  [bt] (1) /opt/rh/rh-python36/root/usr/xgboost/libxgboost.so(xgboost::obj::RegLossObj<xgboost::obj::LinearSquareLoss>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0xf0) [0x7f56de5f9720]
  [bt] (2) /opt/rh/rh-python36/root/usr/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x345) [0x7f56de488505]
  [bt] (3) /opt/rh/rh-python36/root/usr/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f56de3ebaa5]
  [bt] (4) /lib64/libffi.so.6(ffi_call_unix64+0x4c) [0x7f5731993e2c]
  [bt] (5) /lib64/libffi.so.6(ffi_call+0x1f5) [0x7f5731993755]
  [bt] (6) /opt/rh/rh

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]


XGBoostError: [17:14:27] /workspace/src/objective/regression_obj.cu:64: Check failed: info.labels_.Size() != 0U (0 vs. 0) : label set cannot be empty
Stack trace:
  [bt] (0) /opt/rh/rh-python36/root/usr/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x24) [0x7f56de3eecb4]
  [bt] (1) /opt/rh/rh-python36/root/usr/xgboost/libxgboost.so(xgboost::obj::RegLossObj<xgboost::obj::LinearSquareLoss>::GetGradient(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*)+0xf0) [0x7f56de5f9720]
  [bt] (2) /opt/rh/rh-python36/root/usr/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x345) [0x7f56de488505]
  [bt] (3) /opt/rh/rh-python36/root/usr/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7f56de3ebaa5]
  [bt] (4) /lib64/libffi.so.6(ffi_call_unix64+0x4c) [0x7f5731993e2c]
  [bt] (5) /lib64/libffi.so.6(ffi_call+0x1f5) [0x7f5731993755]
  [bt] (6) /opt/rh/rh-python36/root/usr/lib64/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x651) [0x7f5731ba8071]
  [bt] (7) /opt/rh/rh-python36/root/usr/lib64/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0xeedf) [0x7f5731ba4edf]
  [bt] (8) /opt/rh/rh-python36/root/usr/lib64/libpython3.6m.so.rh-python36-1.0(_PyObject_FastCallDict+0x90) [0x7f573f653190]



## 3. Testing

In [59]:
test_set = np.load('test_set_0712.npy')
test_data_path = preprocessed_df[preprocessed_df["csn"].isin(test_set)]
model_path = '../Emory_NYU_Sync/xgb_model_0813'

result = ml.predict(test_set, test_data_path, model_path, 0.48, vm = False, drop_features = drop_features)
result.to_csv("prediction_results_.csv", index = False)

## 4. Result visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from sklearn.metrics import roc_curve, auc, matthews_corrcoef, f1_score, accuracy_score, roc_auc_score, make_scorer, average_precision_score, recall_score, confusion_matrix, precision_recall_curve


In [None]:
Y_test = test_scores_df["SepsisLabel"].values
ytestpred = test_scores_df['PredictedProbability'].values

test_auc = roc_auc_score(Y_test, ytestpred)
lr_fpr, lr_tpr, _ = roc_curve(Y_test, ytestpred)


In [None]:
fig = plt.figure(figsize=(10, 5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
    
ax1.set_title(f'AUROC Curve, Sepsis')
ax1.set_xlabel('1 - Specificity (FPR)')
ax1.set_ylabel('Sensitivity (TPR)')
ax1.set_xlim(0,1)
ax1.set_ylim(0,1)
    
ax2.set_title(f'Precision Recall Curve, Sepsis')
ax2.set_xlabel('Recall (Sensitivity)')
ax2.set_ylabel('Precision (PPV)')
ax2.set_xlim(0,1)
ax2.set_ylim(0,1)
    
ax1.plot(lr_fpr, lr_tpr, label=f'AUC = {test_auc:.3f}')
ax1.plot([0,1], [0,1], label='Random Choice')



precision, recall, _ = precision_recall_curve(Y_test, ytestpred)
test_avg_prec = auc(recall, precision)
ax2.plot(recall, precision, label=f'Average Precision = {test_avg_prec:.3f}')
    
ax1.grid()
ax1.legend()
ax2.grid()
ax2.legend(loc='lower right')
    
plt.tight_layout()   