#### IV fluid dose prediction model

In [None]:
import pandas as pd
import xgboost as xgb
import shap
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import xgboost as xgb 
from sklearn.linear_model import  LogisticRegression, LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt 
import mlxtend
import imblearn 
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.model_selection import  cross_validate, cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the (imputed) training and validation CSV files and report their shapes.
traindata, testdata = (
    pd.read_csv(csv_path)
    for csv_path in ('../doseprediction/data/sl_train.csv',
                     '../doseprediction/data/sl_val.csv')
)
for split in (traindata, testdata):
    print(split.shape)

In [None]:
# Column names of the base predictor set used by every "-base" model.
# Not used at the moment: 'blood_culture_positive', 'race_white', 'race_black',
# 'race_hispanic', 'race_other'
keyfeatures = [
    'Gender', 'Ventilator', 'diabetes', 'metastatic_cancer',
    'qsofa_sysbp_score', 'qsofa_resprate_score', 'qsofa_gcs_score',
    'Age', 'Weight', 'Height', 'bmi',
    'ANION_GAP', 'APTT', 'Albumine', 'Art_BE', 'Art_PH', 'Bicarbonaat',
    'Calcium', 'Chloride', 'DIA', 'FiO2', 'Glucose', 'HB', 'HeartRate',
    'Ion_Ca', 'Kalium', 'LEU', 'Lactate', 'MAP', 'Magnesium', 'Natrium',
    'PF_ratio', 'PaCO2', 'PaO2', 'RespRate', 'SYS', 'Shock_Index',
    'Sirs_score', 'Sofa_score', 'Temp', 'Trombo',
    'elixhauser', 'elixhauser_hospital', 'qsofa', 'mingcs', 'lods',
    'SpO2', 'Ureum', 'Creat', 'ALAT', 'ASAT', 'Bili', 'INR',
]

# Additional columns appended to `keyfeatures` for the "-base+" models.
fluidfeatures = [
    'cumm_fluid_balance',
    'total_IV_prev',
    'max_VP_prev',
    'Running_total_UP',
    'total_UP',
]

# Outcome columns pulled from the data files.
target = [
    'max_VP',
    'total_IV',
    'discrete_action',
    'discrete_VP',
    'discrete_IV',
]

In [None]:
# Predictor matrices: the base feature set, and the base set plus the
# `fluidfeatures` columns (suffix "_extra").
X_train = traindata[keyfeatures]
X_test = testdata[keyfeatures]
X_train_extra = traindata[keyfeatures + fluidfeatures]
X_test_extra = testdata[keyfeatures + fluidfeatures]

# Targets. Explicit copies are taken because a column is added below; writing
# into a bare column selection of the source frame is a chained assignment
# (SettingWithCopyWarning, and unreliable under pandas copy-on-write).
Y_train = traindata[target].copy()
Y_test = testdata[target].copy()

# Binarize VP as it is highly imbalanced: discrete_VP == 0 stays 0, any other
# level becomes 1.
Y_train['binary_VP'] = np.where(Y_train.discrete_VP == 0, 0, 1)
Y_test['binary_VP'] = np.where(Y_test.discrete_VP == 0, 0, 1)

# Train and validation rows stacked together (used for visualization and for
# the resampling experiments).
# NOTE: row labels are kept as-is, so labels coming from the two files can
# repeat in the combined frames; align on position, not on label.
Y = pd.concat([Y_train, Y_test])
X = pd.concat([X_train, X_test])
X_extra = pd.concat([X_train_extra, X_test_extra])

#### Build train/test split model 

In [None]:
def eval_regression(model, x_train, y_train, x_test, y_test, modelname):
    """Fit a regressor and score it on a held-out set.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train : data the model is fitted on.
    x_test, y_test : data the metrics are computed on.
    modelname : label stored in the ``ModelName`` column of the result.

    Returns
    -------
    tuple
        ``(model, scores)`` where ``model`` is the same, now fitted, estimator
        and ``scores`` is a one-row DataFrame with the columns
        ``ModelName``, ``MAE``, ``RMSE`` and ``R2``.
    """
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    mae = metrics.mean_absolute_error(y_test, predict)
    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` switch of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(metrics.mean_squared_error(y_test, predict))
    r2 = metrics.r2_score(y_test, predict)
    # Named `summary` rather than `eval` to avoid shadowing the builtin.
    summary = {'ModelName': modelname,
               'MAE': mae,
               'RMSE': rmse,
               'R2': r2}
    return model, pd.DataFrame(summary, index=[0])


# DataFrame that accumulates one row of metrics per evaluated model.
performance = pd.DataFrame()

#### Regression models with base features 

In [None]:
# Each model below is fitted on the base features to predict `total_IV`; its
# metrics row is added to `performance` with pd.concat, because
# DataFrame.append no longer exists in pandas >= 2.0.

# Linear Regression base model
modelname = 'LM-base'
model1 = LinearRegression()
model1, per = eval_regression(model1, X_train, Y_train.total_IV,
                              X_test, Y_test.total_IV, modelname)
performance = pd.concat([performance, per], ignore_index=True)

# RandomForest base model
modelname = 'RF-base'
model2 = RandomForestRegressor(random_state=112, n_jobs=-1)
model2, per = eval_regression(model2, X_train, Y_train.total_IV,
                              X_test, Y_test.total_IV, modelname)
performance = pd.concat([performance, per], ignore_index=True)

# XGB base model
modelname = 'XGB-base'
model3 = xgb.XGBRegressor(random_state=112, n_jobs=-1, eval_metric='rmse')
model3, per = eval_regression(model3, X_train, Y_train.total_IV,
                              X_test, Y_test.total_IV, modelname)
performance = pd.concat([performance, per], ignore_index=True)

#### Regression model with base+ features 

In [None]:
# Same models as the base run, now fitted on the base + `fluidfeatures`
# columns. Metrics rows are added with pd.concat, because DataFrame.append no
# longer exists in pandas >= 2.0.

# Linear Regression base+ model
modelname = 'LM-base+'
model4 = LinearRegression()
model4, per = eval_regression(model4, X_train_extra, Y_train.total_IV,
                              X_test_extra, Y_test.total_IV, modelname)
performance = pd.concat([performance, per], ignore_index=True)

# RandomForest base+ model
modelname = 'RF-base+'
model5 = RandomForestRegressor(random_state=112, n_jobs=-1)
model5, per = eval_regression(model5, X_train_extra, Y_train.total_IV,
                              X_test_extra, Y_test.total_IV, modelname)
performance = pd.concat([performance, per], ignore_index=True)



In [None]:
# XGB base+ model: default XGBoost regressor on the base + `fluidfeatures`
# columns.
modelname = 'XGB-base+'
model6 = xgb.XGBRegressor(random_state=112, n_jobs=-1, eval_metric='rmse')
model6, per = eval_regression(model6, X_train_extra, Y_train.total_IV,
                              X_test_extra, Y_test.total_IV, modelname)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
performance = pd.concat([performance, per], ignore_index=True)

In [None]:
# XGB base+ model with hand-set ("optimal") hyper-parameters.
modelname = 'XGB-base+optimize'
model7 = xgb.XGBRegressor(random_state=112, n_jobs=-1,
                          eval_metric='rmse', n_estimators=271,
                          min_child_weight=5, max_depth=8, gamma=1,
                          colsample_bytree=0.96)
model7, per = eval_regression(model7, X_train_extra, Y_train.total_IV,
                              X_test_extra, Y_test.total_IV, modelname)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
performance = pd.concat([performance, per], ignore_index=True)

In [None]:
# Display the metrics table accumulated for the models evaluated so far.
performance

##### Significance test RF and XGB model 

In [None]:
#### Compare results with a paired 5x2cv t-test
from mlxtend.evaluate import paired_ttest_5x2cv
from sklearn.base import clone

### Compare RF-base+ (model 5) and XGB-base+ (model 6)
# paired_ttest_5x2cv calls .fit() on the estimator objects it receives, so
# unfitted clones are handed over; otherwise model5/model6 would end up
# re-trained on resampled halves of the combined train+validation data.
t, p = paired_ttest_5x2cv(estimator1=clone(model5),
                          estimator2=clone(model6),
                          X=X_extra, y=Y.total_IV,
                          random_seed=112,
                          scoring=make_scorer(metrics.mean_squared_error))

print('stat:', t)
print('p-value:', p)

# Interpret the p-value at the 5% significance level.
# NOTE(review): the "proportions of errors" wording comes from classifier
# comparison tests; the statistic compared here is the mean squared error.
alpha = 0.05
if p > alpha:
    # Adjacent literals instead of a backslash continuation, which used to
    # embed the source indentation in the printed sentence.
    print('Same proportions of errors (fail to reject H0), '
          'and may conclude that the performance of the two algorithms is not significantly different.')
else:
    print('Different proportions of errors (reject H0), model are significantly different')
        

### Bootstrap model

In [None]:
# Repeated random 80/20 hold-out evaluation of the default XGB base+ model.
# NOTE: each iteration is a fresh random split (no sampling with replacement),
# i.e. Monte-Carlo cross-validation rather than a bootstrap in the strict sense.
r2_values = []
rmse_values = []
mae_values = []

from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# Feature matrix for the resampling runs. The target column is dropped
# defensively: earlier versions of this cell aliased X_extra and wrote
# 'total_IV' into it, which leaked the target into the predictors.
features = X_extra.drop(columns=['total_IV'], errors='ignore')

# Convenience frame holding features and target side by side. It is a copy,
# so X_extra itself is never modified; values are assigned by position because
# the stacked train/validation frames can carry repeated row labels.
df = features.copy()
df['total_IV'] = Y.total_IV.to_numpy()

n_iteration = 80
n_size = int(len(X) * 0.80)
stats = list()
for i in range(n_iteration):
    # A different seed per iteration gives a different, reproducible split.
    x_train, x_test, y_train, y_test = train_test_split(
        features, Y.total_IV, test_size=0.20, random_state=i)

    model = xgb.XGBRegressor(eval_metric='rmse', n_jobs=-1)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    r2 = metrics.r2_score(y_test, prediction)
    # sqrt(MSE) instead of the deprecated `squared=False` switch.
    rmse = np.sqrt(metrics.mean_squared_error(y_test, prediction))
    mae = metrics.mean_absolute_error(y_test, prediction)
    # Collect the metrics of this iteration.
    r2_values.append(r2)
    mae_values.append(mae)
    rmse_values.append(rmse)


# The first figure is the mean R2 (previously mislabelled as "accuracy").
print('Final average r2', np.mean(r2_values))
print('Final average rmse', np.mean(rmse_values))
print('Final average mae', np.mean(mae_values))



In [None]:
# Distribution of the per-iteration scores. The figure has to be created
# before plotting; calling plt.figure() after plt.hist() opens a new, empty
# figure and leaves the histogram at the default size.
plt.figure(figsize=(10, 5))
plt.hist(rmse_values)
plt.xlabel('RMSE')
plt.ylabel('Number of iterations')
plt.show()

plt.figure(figsize=(10, 5))
plt.hist(r2_values)
plt.xlabel('R2')
plt.ylabel('Number of iterations')
plt.show()


In [None]:
# Percentile confidence interval over the per-iteration R2 scores.
stats = r2_values
a = 0.95  # confidence level (95%)

# Probability mass left outside the interval on each side: (1 - a) / 2 = 2.5%.
half_tail = (1.0 - a) / 2.0
lower_pct = half_tail * 100   # 2.5th percentile
p = (a + half_tail) * 100     # 97.5th percentile

lower_raw, upper_raw = np.percentile(stats, [lower_pct, p])
# The reported bounds are clamped to the [0, 1] range.
lower = max(0.0, lower_raw)
upper = min(1.0, upper_raw)
print('%.1f confidence interval %.1f%% and %.1f%%' % (a * 100, lower * 100, upper * 100))

In [None]:
### Persist the per-iteration metric lists with joblib
import joblib

iv_bootstrap = dict(
    r2_boot=r2_values,
    mae_boot=mae_values,
    rmse_boot=rmse_values,
)
joblib.dump(iv_bootstrap, '../doseprediction/Final_IV_bootstrap_results.pkl')

##### Hyperparameter tuning of the final best models

In [None]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Search space handed to hyperopt: one sampling expression per XGBoost
# hyper-parameter, keyed by the parameter name.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 9, 2),
    gamma=hp.uniform('gamma', 1, 9),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=hp.randint('n_estimators', 50, 300),
)


def hyperparameter_tuning_reg(space):
    """Hyperopt objective: fit one XGBoost regressor and report its RMSE.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Reads the keys
        ``n_estimators``, ``max_depth``, ``gamma``, ``min_child_weight``,
        ``colsample_bytree`` and, when present, ``reg_lambda``.

    Returns
    -------
    dict
        ``loss`` (RMSE on the validation split, the value hyperopt minimizes),
        ``r2_score``, ``status`` (``STATUS_OK``) and the fitted ``model``.

    Notes
    -----
    NOTE(review): the validation split drives both the early stopping and the
    reported loss, so scores of the tuned model on that same split are
    optimistic.
    """
    import inspect

    x_train = X_train_extra
    y_train = Y_train.total_IV
    x_test = X_test_extra
    y_test = Y_test.total_IV

    # Early-stopping settings moved from fit() to the constructor in newer
    # XGBoost releases; pick whichever place the installed version supports so
    # early stopping is neither rejected nor silently dropped.
    early_stop = {'eval_metric': 'rmse', 'early_stopping_rounds': 10}
    legacy_fit_api = ('early_stopping_rounds'
                      in inspect.signature(xgb.XGBRegressor.fit).parameters)
    ctor_kwargs = {} if legacy_fit_api else early_stop
    fit_kwargs = early_stop if legacy_fit_api else {}

    model = xgb.XGBRegressor(
        # hp.randint / hp.quniform hand back numpy ints / floats; XGBoost
        # needs plain integers for these two.
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        # Previously sampled but never passed to the model; falls back to
        # 1.0 (XGBoost's default L2 penalty) when the key is absent.
        reg_lambda=space.get('reg_lambda', 1.0),
        min_child_weight=space['min_child_weight'],
        colsample_bytree=space['colsample_bytree'],
        **ctor_kwargs)

    # The last entry of the evaluation list is the one early stopping monitors.
    evaluation = [(x_train, y_train), (x_test, y_test)]
    model.fit(x_train, y_train,
              eval_set=evaluation, verbose=False, **fit_kwargs)

    pred = model.predict(x_test)
    # sqrt(MSE) instead of the deprecated `squared=False` switch.
    rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
    r2 = metrics.r2_score(y_test, pred)
    print('r2 ', r2, 'rmse ', rmse)
    return {'loss': rmse, 'r2_score': r2, 'status': STATUS_OK, 'model': model}


# Run 50 TPE-guided evaluations of the objective; every trial is recorded in
# `iv_trials`, and `best_iv_params` is what fmin returns for the lowest loss.
iv_trials = Trials()
search_settings = dict(
    fn=hyperparameter_tuning_reg,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=iv_trials,
)
best_iv_params = fmin(**search_settings)

print('The best parameters for total IV prediction are', '\n')
print(best_iv_params)

#### Checked, but the optimized model does not give the best performance 