In [1]:
#import libraries
import numpy as np
import pandas as pd
import csv
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
import sklearn.metrics as metrics
import sys
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
#reading training and test data
train_features=pd.read_csv("train_features.csv")
train_labels=pd.read_csv("train_labels.csv")
test_features=pd.read_csv("test_features.csv")

In [3]:
t=np.array([1,2,3,4,5,6,7,8,9,10,11,12])
t_array=np.tile(t,18995)
time1=pd.DataFrame(data=t_array,columns={'Time_std'})

In [4]:
#data_cleaning for the training features
#replacing the time value as 1 to 12 uniformly for all patients
train_features['Time'] = time1['Time_std'].values
#making the time series values horizontal, thereby creating single row for each patient
train_features=train_features.pivot(index='pid',columns='Time')
#removing the 12 duplicates for the age variable and replacing by a single column
age=pd.DataFrame(data=np.array(train_features['Age'][1]),columns={'Age'})
train_features=train_features.drop(['Age'],axis=1)
train_features['Age'] = age['Age'].values

In [5]:
#performing the above set of data cleaning for the test features
t_array_test=np.tile(t,12664)
time1_test=pd.DataFrame(data=t_array_test,columns={'Time_std'})
test_features['Time'] = time1_test['Time_std'].values
test_features=test_features.pivot(index='pid',columns='Time')
age_test=pd.DataFrame(data=np.array(test_features['Age'][1]),columns={'Age'})
test_features=test_features.drop(['Age'],axis=1)
test_features['Age'] = age_test['Age'].values

In [6]:
#sorting the training labels by pid 
train_labels=train_labels.sort_values(by=['pid'])
train_labels=train_labels.set_index('pid')

In [29]:
#Adding median/max/minimum (depending on the feature) across the 12 hours, as new predictor columns 
train_features['EtCO2_impute']=np.nanmedian(train_features['EtCO2'],axis=1)
rows=np.array([1,1,2,3,5,6,7,9,10,11,12,13,14,15,16,17,20,23,24,26,27,28,30,31,33])
names=np.array(['EtCO2','PTT','BUN','Lactate','Hgb','HCO3','BaseExcess','Fibrinogen','Phosphate','WBC','Creatinine','PaCO2','AST','FiO2','Platelets','SaO2','Magnesium','Calcium','Alkalinephos','Bilirubin_direct','Chloride','Hct','Bilirubin_total','TroponinI','pH','consol.'])
for i in range(1,25):
    b=12*rows[i]
    e=b+12
    s="_"
    s=s.join([names[i],names[25]])
    train_features[s]=np.nanmedian(train_features.iloc[:,b:e],axis=1)
train_features['Temp_impute']=np.nanmax(train_features['Temp'],axis=1)
train_features['Glucose_min']=np.nanmin(train_features['Glucose'],axis=1)
train_features['Glucose_max']=np.nanmax(train_features['Glucose'],axis=1)
train_features['Pot_min']=np.nanmin(train_features['Potassium'],axis=1)
train_features['Pot_max']=np.nanmax(train_features['Potassium'],axis=1)
train_features['ABPd_impute']=np.nanmax(train_features['ABPd'],axis=1)
train_features['ABPs_impute']=np.nanmax(train_features['ABPs'],axis=1)
train_features['RRate_impute']=np.nanmedian(train_features['RRate'],axis=1)
train_features['ABPm_impute']=np.nanmedian(train_features['ABPm'],axis=1)
train_features['SpO2_impute']=np.nanmedian(train_features['SpO2'],axis=1)
train_features['Heartrate_impute']=np.nanmedian(train_features['Heartrate'],axis=1)

  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  train_features['Temp_impute']=np.nanmax(train_features['Temp'],axis=1)
  train_features['Glucose_min']=np.nanmin(train_features['Glucose'],axis=1)
  train_features['Glucose_max']=np.nanmax(train_features['Glucose'],axis=1)
  train_features['Pot_min']=np.nanmin(train_features['Potassium'],axis=1)
  train_features['Pot_max']=np.nanmax(train_features['Potassium'],axis=1)
  train_features['ABPd_impute']=np.nanmax(train_features['ABPd'],axis=1)
  train_features['ABPs_impute']=np.nanmax(train_features['ABPs'],axis=1)


In [30]:
#Adding median/max/minimum (depending on the feature) across the 12 hours, as new predictor columns 
test_features['EtCO2_impute']=np.nanmedian(test_features['EtCO2'],axis=1)
rows=np.array([1,1,2,3,5,6,7,9,10,11,12,13,14,15,16,17,20,23,24,26,27,28,30,31,33])
names=np.array(['EtCO2','PTT','BUN','Lactate','Hgb','HCO3','BaseExcess','Fibrinogen','Phosphate','WBC','Creatinine','PaCO2','AST','FiO2','Platelets','SaO2','Magnesium','Calcium','Alkalinephos','Bilirubin_direct','Chloride','Hct','Bilirubin_total','TroponinI','pH','consol.'])
for i in range(1,25):
    b=12*rows[i]
    e=b+12
    s="_"
    s=s.join([names[i],names[25]])
    test_features[s]=np.nanmedian(test_features.iloc[:,b:e],axis=1)
test_features['Temp_impute']=np.nanmax(test_features['Temp'],axis=1)
test_features['Glucose_min']=np.nanmin(test_features['Glucose'],axis=1)
test_features['Glucose_max']=np.nanmax(test_features['Glucose'],axis=1)
test_features['Pot_min']=np.nanmin(test_features['Potassium'],axis=1)
test_features['Pot_max']=np.nanmax(test_features['Potassium'],axis=1)
test_features['ABPd_impute']=np.nanmax(test_features['ABPd'],axis=1)
test_features['ABPs_impute']=np.nanmax(test_features['ABPs'],axis=1)
test_features['RRate_impute']=np.nanmedian(test_features['RRate'],axis=1)
test_features['ABPm_impute']=np.nanmedian(test_features['ABPm'],axis=1)
test_features['SpO2_impute']=np.nanmedian(test_features['SpO2'],axis=1)
test_features['Heartrate_impute']=np.nanmedian(test_features['Heartrate'],axis=1)

  test_features['Temp_impute']=np.nanmax(test_features['Temp'],axis=1)
  test_features['Glucose_min']=np.nanmin(test_features['Glucose'],axis=1)
  test_features['Glucose_max']=np.nanmax(test_features['Glucose'],axis=1)
  test_features['Pot_min']=np.nanmin(test_features['Potassium'],axis=1)
  test_features['Pot_max']=np.nanmax(test_features['Potassium'],axis=1)
  test_features['ABPd_impute']=np.nanmax(test_features['ABPd'],axis=1)
  test_features['ABPs_impute']=np.nanmax(test_features['ABPs'],axis=1)


In [31]:
#creating the test_label data frame
column_names=['LABEL_BaseExcess','LABEL_Fibrinogen','LABEL_AST','LABEL_Alkalinephos','LABEL_Bilirubin_total','LABEL_Lactate','LABEL_TroponinI','LABEL_SaO2','LABEL_Bilirubin_direct','LABEL_EtCO2','LABEL_Sepsis','LABEL_RRate','LABEL_ABPm','LABEL_SpO2','LABEL_Heartrate']
test_label=pd.DataFrame(columns=column_names)
test_label['pid']=test_features.index.values
test_label=test_label.set_index('pid')
train_features_class=train_features
test_features_class=test_features

In [32]:
#defining a function for predicting labels based on specified model
def modelfit(alg,train_features_class,train_label,test_features_class):
    #if useTrainCV:
    xgb_param=alg.get_xgb_params()
    xgtrain= xgb.DMatrix(train_features_class, label=train_label)
        #cvresult=xgb.cv(xgb_param,xgtrain,nfold=cv_folds,metrics='auc',early_stopping_rounds=early_stopping_rounds)
        #alg.set_params(n_estimators=cvresult.shape[0])
    alg.fit(train_features_class,train_label)
    preds=alg.predict(train_features_class)
    prob=alg.predict_proba(test_features_class)[:,1]
    test_label.iloc[:,i]=prob
    #print(cvresult.shape[0])
    print("\nModel Report")
    print("Precision = {}".format(metrics.precision_score(train_label, preds)))
    print("Recall = {}".format(metrics.recall_score(train_label, preds)))
    print("Accuracy = {}".format(metrics.accuracy_score(train_label, preds)))
    print("AUC = {}".format(metrics.roc_auc_score(train_label, preds)))   
#feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
#feat_imp.plot(kind='bar', title='Feature Importances')
#plt.ylabel('Feature Importance Score')

In [33]:
#tuning hyperparameters for each target variable individually in multiple steps of grid search
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2),
 'reg_alpha':[0.01,0.1,1,10,100]
}
for i in range(10,11):
    train_label=pd.DataFrame(data=train_labels.iloc[:,i],index=train_labels.index.values)
    train_label=train_label.values.ravel()
    gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=100, max_depth=5,
    min_child_weight=1, gamma=0.3, subsample=0.9, colsample_bytree=0.9,
    objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
    param_grid = param_test1, scoring='roc_auc',n_jobs=4,cv=5)
    gsearch.fit(train_features_class,train_label) 
    print(gsearch.best_params_)
    print(gsearch.best_score_)

{'max_depth': 5, 'min_child_weight': 1, 'reg_alpha': 10}
0.7403860512712781


In [37]:
#running a loop to predict labels for subtask 1 and 2 
names=np.array(['BaseExcess','Fibrinogen','AST','Alkalinephos','Bilirubin_total','Lactate','TroponinI','SaO2','Bilirubin_direct','EtCO2','Sepsis','LABEL'])
max_depth_=np.array([9,5,7,5,9,5,5,5,3,3,3])
min_child_=np.array([5,5,3,1,5,5,1,5,1,5,1])
for i in range(0,11):
    s="_"
    s=s.join([names[11],names[i]])
    train_label=pd.DataFrame(data=train_labels.iloc[:,i],index=train_labels.index.values)
    train_label=train_label.values.ravel()
    xgb1=XGBClassifier(
    learning_rate=0.1,
    max_depth=max_depth_[i],
    min_child_weight=min_child_[i])
    print(s)
    modelfit(xgb1,train_features_class,train_label,test_features_class)

LABEL_BaseExcess

Model Report
Precision = 0.9856054191363252
Recall = 0.9136577708006279
Accuracy = 0.9732561200315872
AUC = 0.9543826662478568
LABEL_Fibrinogen

Model Report
Precision = 0.967479674796748
Recall = 0.425
Accuracy = 0.9565675177678337
AUC = 0.7119316567206593
LABEL_AST

Model Report
Precision = 0.9864982578397212
Recall = 0.497364953886693
Accuracy = 0.8778625954198473
AUC = 0.7476091440716617
LABEL_Alkalinephos

Model Report
Precision = 0.9365798414496036
Recall = 0.3686204591040784
Accuracy = 0.8449591997894182
AUC = 0.6804502902082289
LABEL_Bilirubin_total

Model Report
Precision = 0.9935691318327974
Recall = 0.6085339168490154
Accuracy = 0.9048697025533035
AUC = 0.8036430416134158
LABEL_Lactate

Model Report
Precision = 0.9161877394636015
Recall = 0.5030239284775178
Accuracy = 0.8912871808370624
AUC = 0.7457523539175372
LABEL_TroponinI

Model Report
Precision = 0.8848207475209764
Recall = 0.6121372031662269
Accuracy = 0.9533561463543038
AUC = 0.8016533969047508
LABE

In [35]:
#running a loop to predict values for subtask 3 after parameter tuning
names=np.array(['RRate','ABPm','SpO2','Heartrate'])
alpha_=np.array([25,75,25,25])
learn_rate = [0.08,0.07,0.08,0.06]
for i in range(0,4):
    train_label=pd.DataFrame()
    train_label=train_labels.iloc[:,(i+11)]
    train_label=train_label.values.ravel()
    #param={'learning_rate':0.01,'objective':'reg:squarederror','seed':1,'max_depth': 5,'min_child_weight':1,'reg_alpha':alpha_[i],'metrics':'rmse'}
    xgtrain= xgb.DMatrix(train_features_class, label=train_label)
    #cvresult=xgb.cv(dtrain=xgtrain,params=param,num_boost_round=1000,nfold=5,stratified=False)
    #best_round=cvresult.shape[0]
    xgb2 = xgb.XGBRegressor(learning_rate=learn_rate[i])
    model=xgb2.fit(train_features_class,train_label)
    pred=xgb2.predict(train_features_class)
    #print(cvresult.shape[0])
    print(metrics.r2_score(train_label,pred))
    pred_reg=xgb2.predict(test_features_class)
    test_label.iloc[:,(i+11)]=pred_reg

0.642518102240846
0.7467815072105362
0.6666123197548975
0.7706687594300767


In [36]:
#writing the final predictions and saving as a zip file
compression_opts = dict(method='zip',
                        archive_name='prediction_final.csv')  
test_label.to_csv('prediction_final.zip', index=True,float_format='%.3f',compression=compression_opts)  