In [None]:
import pandas as pd
import timeit
import csv
from timeit import default_timer as timer
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree.export import export_graphviz
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedShuffleSplit
import warnings; warnings.simplefilter('ignore')
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
import pyodbc
from datetime import timedelta
import ast
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.calibration import calibration_curve
%matplotlib inline
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
from sklearn.calibration import CalibratedClassifierCV
from scipy import stats
import lightgbm as lgb
from matplotlib import pyplot
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from scipy import interp
import pickle
from math import sqrt
from hyperopt import fmin
from hyperopt import Trials
from hyperopt import tpe
from hyperopt import STATUS_OK
from hyperopt import hp

# Functions

In [None]:
def _pct_or_nan(count,total):
    """Return count/total as a percentage, or NaN when total is zero."""
    return count/total*100 if total else float('nan')


def getscores(model,inputx,inputdata,threshold,same):
    """Encounter-level sensitivity/specificity/PPV, prediction timing and calibration data.

    Parameters
    ----------
    model : fitted model exposing ``predict_proba``.
    inputx : data to be predicted (rows aligned with ``inputdata``).
    inputdata : the same rows plus the 'EncounterID', 'time' and 'ards' columns.
    threshold : a row is predicted 1 when its predicted probability >= threshold.
    same : same meaning as the ``same`` parameter of nested(); selects the
        global ``bintrain`` (True) or ``bintest`` (False) as the bin width in hours.

    Also reads the module-level ``ards`` frame ('EncounterID', 'ards_time').

    Returns
    -------
    (sensitivity %, specificity %, ppv %, avg_diff, cali_y, cali_x, timecurve) where
    avg_diff = (early %, median early hours, late %, median late hours) and
    timecurve is a DataFrame with one row per hour offset in [-48, 48].
    Percentages are NaN (instead of raising ZeroDivisionError) when there is
    no encounter to compute them on.
    """
    x=inputx.copy()
    data=inputdata.copy()
    data['predicted_prob']=model.predict_proba(x)[:,1]

    #get calibration curve
    cali_y, cali_x = calibration_curve(data['ards'],data['predicted_prob'],n_bins=10, normalize=True)

    #get predicted label
    data.loc[data['predicted_prob']>=threshold,'predicted']=1
    data.loc[data['predicted_prob']<threshold,'predicted']=0

    #ENCOUNTER level prediction: an encounter is positive if any of its bins is
    aggregation_functions={'predicted':'max','ards':'max'}
    dataenc = data.groupby(['EncounterID']).aggregate(aggregation_functions).reset_index(drop=False)
    #encounters whose predicted label and true label are both 1
    agree=dataenc[(dataenc['ards']==1 )&(dataenc['predicted']==1)].EncounterID.unique().tolist()

    #parse timestamps BEFORE sorting so the order is chronological, not lexicographic
    data['time']=pd.to_datetime(data['time'])
    data=data.sort_values(['EncounterID','time'], ascending=[True,True])
    data_hours=data[data['EncounterID'].isin(agree)]
    data_hours=data_hours[data_hours['predicted']==1]

    #first positive bin of each agreeing encounter, joined with its ards_time
    data_hours=data_hours.groupby(['EncounterID'])['time'].first().reset_index(drop=False)
    data_hours=pd.merge(data_hours,ards[['EncounterID','ards_time']],how='left',on='EncounterID')
    data_hours['ards_time']=pd.to_datetime(data_hours['ards_time'])

    #hours between the end of the first positive bin and ards_time (<=0 means early)
    bin_hours=bintrain if same else bintest
    data_hours['diff']=((data_hours['time']+timedelta(hours=bin_hours))-data_hours['ards_time'])/ np.timedelta64(1, 'h')

    diff=data_hours['diff']
    early=diff[diff<=0].abs()
    late=diff[diff>0].abs()
    #median time difference when the prediction was made earlier / later than ards_time
    early_avg_diff=np.nanmedian(early) if len(early) else np.nan
    late_avg_diff=np.nanmedian(late) if len(late) else np.nan
    #percentage of early / late predictions among the agreeing encounters
    earlypct=_pct_or_nan(len(early),len(data_hours))
    latepct=_pct_or_nan(len(late),len(data_hours))

    avg_diff=(round(earlypct,2),round(early_avg_diff,2),round(latepct,2),round(late_avg_diff,2))

    #save the data used to plot time curve
    n_positive_encounters=len(dataenc[dataenc['ards']==1])
    rows_list = []
    for i in range(-48,49):
        rows_list.append({
            'Time to ards_time(hours)': i,
            'Encounter(%)': _pct_or_nan(len(data_hours[data_hours['diff']<=i]),n_positive_encounters),
        })
    timecurve = pd.DataFrame(rows_list)

    #get sensitivity, specificity and ppv
    #labels=[0,1] keeps the matrix 2x2 even if only one class is present
    CM = confusion_matrix(dataenc['ards'],dataenc['predicted'],labels=[0,1])
    tn, fp, fn, tp =CM.ravel()

    sensitivity=tp/(tp+fn)
    specificity=tn/(tn+fp)
    ppv=tp/(tp+fp)

    return round(sensitivity*100,1), round(specificity*100,1),round(ppv*100,1),avg_diff,cali_y,cali_x,timecurve

def getroc(model,y,x):
    """Bin-level ROC AUC and precision-recall AUC of ``model`` on (x, y).

    Parameters
    ----------
    model : fitted model exposing ``predict_proba``.
    y : true labels.
    x : data to be predicted.

    Returns
    -------
    (roc_auc, pr_auc), each rounded to 3 decimals.
    """
    #score once and reuse for both curves (the model was previously run twice)
    positive_prob = model.predict_proba(x)[:,1]

    fpr, tpr, threshold = metrics.roc_curve(y, positive_prob)
    roc_auc = metrics.auc(fpr, tpr)

    #prc
    precision, recall, thresholds = precision_recall_curve(y, positive_prob)
    pr_auc = metrics.auc(recall, precision)

    return round(roc_auc,3),round(pr_auc,3)

def getroc_encounter(model,inputx,inputdata,outputname=0):
    """Encounter-level ROC AUC and precision-recall AUC.

    Each encounter is scored with the maximum predicted probability over its
    rows and labelled with the maximum of 'ards'.  When ``outputname`` is not 0
    the ROC and PR curve points are written to
    PATH4+outputname+'_rocauc.csv' and PATH4+outputname+'_prc.csv'.
    A precision/recall-vs-threshold chart is always shown.

    Returns (roc_auc, pr_auc), each rounded to 3 decimals.
    """
    features=inputx.copy()
    scored=inputdata.copy()
    scored['predicted']=model.predict_proba(features)[:,1]

    #collapse rows to one (max score, max label) pair per encounter
    per_encounter=(
        scored.groupby(['EncounterID'])
        .aggregate({'predicted':'max','ards':'max'})
        .reset_index(drop=False)
    )
    save_curves = outputname!=0

    fpr, tpr, threshold = metrics.roc_curve(per_encounter['ards'],per_encounter['predicted'])
    roc_auc = metrics.auc(fpr, tpr)

    #save the data to plot rocauc curve
    if save_curves:
        pd.DataFrame({'fpr':fpr,'tpr':tpr}).to_csv(PATH4+outputname+'_rocauc.csv',index=False)

    precision, recall, thresholds = precision_recall_curve(per_encounter['ards'], per_encounter['predicted'])
    pr_auc = metrics.auc(recall, precision)

    #save the data to plot precision recall curve
    if save_curves:
        pd.DataFrame({'precision':precision,'recall':recall}).to_csv(PATH4+outputname+'_prc.csv',index=False)

    #plot Precision-Recall vs Threshold Chart
    #(the final precision/recall point has no threshold, hence the [:-1])
    plt.title("Precision-Recall vs Threshold Chart")
    for values, fmt, legend_name in ((precision, "b--", "Precision"), (recall, "r--", "Recall")):
        plt.plot(thresholds, values[: -1], fmt, label=legend_name)
    plt.ylabel("Precision, Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="lower left")
    plt.ylim([0,1])
    plt.show()

    return round(roc_auc, 3),round(pr_auc,3)



In [None]:
def sortdf(data,temp,colstosort):
    """Reorder the rows of ``data`` to follow ``temp`` sorted by ``colstosort``.

    ``temp`` is a (typically narrow) frame sharing ``data``'s index, e.g. just
    the sort-key columns.  It is sorted ascending on every column in
    ``colstosort`` and the resulting index order is applied to ``data``.
    """
    ascending_flags=[True for _ in colstosort]
    sorted_index=temp.sort_values(colstosort, ascending=ascending_flags).index
    return data.reindex(sorted_index)

In [None]:
def crossvalidate(model,inputtraindata,inputtraindata2,lda=False,useweight=True):
    """Patient-stratified 5-fold cross validation; returns the mean bin-level ROC AUC.

    Parameters
    ----------
    model : sklearn-style model with defined hyperparameters (refit in every fold).
    inputtraindata : the data used to train the model (binned every 6 hours).
    inputtraindata2 : the data used to validate the model (binned every 2 hours).
    lda : True if the model is lda (it is then fitted without sample weights).
    useweight : True if we want to pass the 'sampleweight' column as sample weights.

    Both frames must contain 'time', 'EncounterID', 'PatientID', 'ards' and
    'sampleweight'; every other column is used as a feature.
    """
    n_splits=5

    #drop() returns a new frame, so the inputs are left untouched
    traindata=inputtraindata.drop(columns='time')
    traindata2=inputtraindata2.drop(columns='time')

    #split the data by patientid; a patient is positive if any of its rows is
    split=traindata[['PatientID','ards']].groupby(['PatientID']).sum().reset_index()
    split.loc[split['ards']>0,'ards']=1

    skf = StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=1)

    X,y=split['PatientID'], split['ards']
    final=0
    #inner cross validation
    for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
        print('inner',fold)
        #skf yields positions, so index positionally
        trainset, testset = X.iloc[train_index], X.iloc[test_index]
        in_train=traindata['PatientID'].isin(trainset)
        in_val=traindata2['PatientID'].isin(testset)

        X_train=traindata.loc[in_train].drop(columns=['EncounterID','PatientID','ards'])
        sampleweight=X_train['sampleweight']
        X_train=X_train.drop(columns='sampleweight')
        y_train=traindata.loc[in_train,'ards']

        X_val=traindata2.loc[in_val].drop(columns=['EncounterID','PatientID','ards','sampleweight'])
        y_val=traindata2.loc[in_val,'ards']

        #scaler is fitted on the training fold only
        sc = StandardScaler()
        X_train_sc= sc.fit_transform(X_train)
        X_val_sc= sc.transform (X_val)

        if useweight and not lda:
            model.fit(X_train_sc, y_train,sample_weight=sampleweight)
        else:
            model.fit(X_train_sc, y_train)

        roc,_=getroc(model,y_val,X_val_sc)
        final=final+roc

    final=final/n_splits
    print(final)

    return final

def get_threshold(model,inputx,inputdata,target_recall=0.85):
    """Pick the encounter-level decision threshold that keeps recall >= target_recall.

    Parameters
    ----------
    model : fitted model exposing ``predict_proba``.
    inputx : data to be predicted.
    inputdata : data to be predicted + true labels ('EncounterID', 'ards').
    target_recall : minimum encounter-level recall to keep (default 0.85).

    Returns
    -------
    The highest threshold from ``precision_recall_curve`` whose recall is still
    >= target_recall, or 0 when no threshold qualifies.  Because recall does
    not increase with the threshold, this is the qualifying point with the
    lowest recall; choosing the highest threshold makes the choice
    deterministic when several thresholds share that recall.
    """
    x=inputx.copy()
    data=inputdata.copy()
    #get predicted probability
    data['predicted_prob']=model.predict_proba(x)[:,1]

    #get each encounter's max predicted probability and label
    #ards==1 if the encounter had ards, otherwise 0
    aggregation_functions={'predicted_prob':'max','ards':'max'}
    data = data.groupby(['EncounterID']).aggregate(aggregation_functions).reset_index(drop=False)

    #get list of precision, recall, and corresponding thresholds
    precision, recall, thresholds = precision_recall_curve(data['ards'], data['predicted_prob'])

    #the final (precision, recall) point has no threshold, hence recall[:-1]
    reaches_target = recall[:-1] >= target_recall
    if reaches_target.any():
        return thresholds[reaches_target].max()
    return 0

def nested(model,inputdata,inputdata2,same, outputname,isintubated=False,useweight=True,calibrate=False,inputintubated=0):
    """Nested cross validation: hyperopt search inside each of 5 outer folds.

    Parameters
    ----------
    model : which model to use: 'logistic regression', 'random forest',
        'lightgbm' or 'lda'.  Anything else raises ValueError.
    inputdata : the data used to train the model (binned every 6 hours).
    inputdata2 : the data used to test the model (binned every 2 hours).
    same : True if inputdata==inputdata2 (forwarded to getscores()).
    outputname : prefix of the output files written under PATH4; they are read
        back later for plotting all the models together.
    isintubated : True to test on intubated patients only.  The intubated
        patients are then split into the 5 folds; everybody outside the
        current fold (intubated or not) is used for training.
    useweight : True to fit with the 'sampleweight' column (never for 'lda').
    calibrate : True to wrap the classifier in CalibratedClassifierCV.
    inputintubated : subset of the input data containing intubated patients
        only; ignored (may stay 0) when isintubated is False.

    Uses the module-level names objective, space1..space4, MAX_EVALS, PATH4,
    file and choose_model, and rebinds the globals trainset, trainset2,
    ITERATION, tpe_algorithm and bayes_trials.  The trial-history file is
    located through the global ``choose_model`` rather than ``model``.

    Returns
    -------
    bestparam, bin ROC AUCs, encounter ROC AUCs, bin PR AUCs, encounter PR AUCs,
    sensitivities, specificities, PPVs, time differences, train bin ROC AUCs,
    train encounter ROC AUCs -- each a list with one entry per outer fold.
    """
    global trainset
    global trainset2
    global ITERATION, tpe_algorithm, bayes_trials

    #search space for the requested model (fail early on an unknown name)
    if model=='logistic regression':
        tempspace=space1
    elif model=='random forest':
        tempspace=space2
    elif model=='lightgbm':
        tempspace=space3
    elif model=='lda':
        tempspace=space4
    else:
        raise ValueError('unknown model %r' % (model,))

    n_splits=5

    data=inputdata.copy()
    data2=inputdata2.copy()

    #patients to split into folds: intubated ones only, or everybody
    if isintubated:
        if not isinstance(inputintubated,pd.DataFrame):
            raise ValueError('inputintubated must be a DataFrame when isintubated=True')
        split_source=inputintubated.copy()
    else:
        split_source=data
    split=split_source[['PatientID','ards']].groupby(['PatientID']).sum().reset_index()
    split.loc[split['ards']>0,'ards']=1

    skf = StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=1)
    X,y=split['PatientID'], split['ards']

    #lists to store the outputs when looping through each fold
    #The outer cross validation has 5 folds so the length of each list is 5
    final1=[] #bin rocauc
    final2=[] #enc rocauc
    final11=[] #bin prauc
    final22=[] #enc prauc
    final3=[] #sensitivity
    final4=[] #specificity
    final5=[] #ppv
    final6=[] #time diff (perc of early prediction, median timediff in hours,perc of late prediction, median timediff in hours)
    final7=[] #calibration plot y axis
    final8=[] #calibration plot x axis
    final9=[] #dataframe used to plot the timecurve
    bestparam=[] #best set of hyperparameters for current training set
    trainauc1=[] #train set's bin rocauc
    trainauc2=[] #train set's enc rocauc

    ###variables needed for plotting rocauc curve
    n_classes=n_splits
    fpr={}
    tpr={}
    precision={}
    recall={}

    feature_drop=['EncounterID','PatientID','ards','time','sampleweight']

    i=0
    #outer cross validation
    for train_index, test_index in skf.split(X, y):
        print('************outer',i)

        #patientIDs held out for this outer fold (skf yields positions)
        testindex = X.iloc[test_index]

        #trainset is used for training in inner cross validation
        trainset=data[~(data['PatientID'].isin(testindex))]
        #trainset2 is used for testing in inner cross validation
        trainset2=data2[~(data2['PatientID'].isin(testindex))]
        #testset is used for testing in outer cross validation
        testset=data2[data2['PatientID'].isin(testindex)]
        #We always train on every 6 hours and test on every 2 hours, whether it's inner or outer cross validation

        #inner cross validation: run Bayesian optimization.  The result of
        #fmin is not used; objective() logs every trial to the history file
        #and the best row is read back from there.
        fmin(fn = objective, space = tempspace, algo = tpe_algorithm ,
             max_evals = MAX_EVALS, trials = bayes_trials)

        historydata=pd.read_csv( PATH4+'trials_'+file+'_'+choose_model+'_nested.csv')
        historydata=historydata.sort_values(by='loss',ascending=True).reset_index(drop=True)

        #hyperparameters with the lowest loss, stored as the repr of a dict
        best=historydata['params'].iloc[0]
        best_params=ast.literal_eval(best)

        sampleweight=trainset['sampleweight']

        #standardize (scaler fitted on the training part only)
        sc = StandardScaler()
        X_train_sc= sc.fit_transform(trainset.drop(columns=feature_drop))
        X_test_sc= sc.transform (testset.drop(columns=feature_drop))

        #define models
        if model=='logistic regression':
            clf = LogisticRegression(**best_params)
        elif model=='lda':
            clf = LinearDiscriminantAnalysis(**best_params)
        elif model=='random forest':
            clf = RandomForestClassifier(**best_params)
        elif model=='lightgbm':
            clf = lgb.LGBMClassifier(**best_params)

        #True if we want to calibrate the model
        if calibrate:
            clf=CalibratedClassifierCV(clf, method='sigmoid', cv=5)

        #Fit the model to the full training set, with sample weights if requested
        if useweight and model!='lda':
            clf.fit(X_train_sc, trainset['ards'],sample_weight=sampleweight)
        else:
            clf.fit(X_train_sc, trainset['ards'])

        #get the test scores
        rocauc,prauc=getroc(clf,testset['ards'],X_test_sc)
        rocauc_encounter,prauc_encounter=getroc_encounter(clf,X_test_sc,testset,outputname=outputname)
        #get the train scores
        rocauc_train,prauc_train=getroc(clf,trainset['ards'],X_train_sc)
        rocauc_encounter_train,prauc_encounter_train=getroc_encounter(clf,X_train_sc,trainset)

        #save each fold's fpr,tpr,precision,recall to dictionaries
        #will be used later to get averaged rocauc curve and precision recall curve
        temp=pd.read_csv(PATH4+outputname+'_rocauc.csv')
        fpr[i], tpr[i]=temp['fpr'],temp['tpr']

        temp=pd.read_csv(PATH4+outputname+'_prc.csv')
        precision[i], recall[i]=temp['precision'],temp['recall']

        i+=1

        #get the thresholds that keep sensitivity at >=85% on the test / train set
        final_threshold_test=get_threshold(clf,X_test_sc,testset)
        final_threshold_train=get_threshold(clf,X_train_sc,trainset)

        print('final_threshold',final_threshold_test)
        #get the test scores
        sensitivity, specificity, ppv,timediff,caliy,calix,timecurve=getscores(clf,X_test_sc,testset,final_threshold_test,same)
        #get the train scores
        sensitivity_train, specificity_train, ppv_train,timediff_train,caliy_train,calix_train,timecurve_train=getscores(clf,X_train_sc,trainset,final_threshold_train,same)

        bestparam.append(best)
        final1.append(rocauc)
        final2.append(rocauc_encounter)
        final11.append(prauc)
        final22.append(prauc_encounter)
        final3.append(sensitivity)
        final4.append(specificity)
        final5.append(ppv)
        final6.append(timediff)
        final7.append(caliy)
        final8.append(calix)
        final9.append(timecurve)
        trainauc1.append(rocauc_train)
        trainauc2.append(rocauc_encounter_train)

        print('*************',best,rocauc,rocauc_encounter,prauc,prauc_encounter,sensitivity, specificity, ppv,timediff)
        print('*************train',rocauc_train,rocauc_encounter_train,prauc_train,prauc_encounter_train,sensitivity_train, specificity_train, ppv_train,timediff_train)

        #reset hyperopt state so the next outer fold starts a fresh search
        ITERATION = 0
        #Optimization Algorithm
        tpe_algorithm = tpe.suggest
        #Trials object to track progress
        bayes_trials = Trials()

        #truncate the trial-history file and write the header again
        out_file = PATH4+'trials_'+file+'_'+choose_model+'_nested.csv'
        with open(out_file, 'w', newline='') as of_connection:
            writer = csv.writer(of_connection)
            writer.writerow(['loss', 'params', 'iteration','train_time'])

    #plot averaged calibration curve
    try:
        print("################averaged calibration plot")
        #calibration plot; the element-wise mean needs 10 points from every
        #fold, otherwise the addition raises and is reported below
        y=np.zeros(shape=(10))
        x=np.zeros(shape=(10))
        for i in range(n_splits):
            y=y+np.array(final7[i])/n_splits
            x=x+np.array(final8[i])/n_splits

        fig = plt.figure(1, figsize=(10, 10))
        ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)

        ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

        ax1.plot(x, y, "s-",
                 label="%s" % (model))

        ax1.set_ylabel("Fraction of positives")
        ax1.set_ylim([-0.05, 1.05])
        ax1.legend(loc="lower right")
        ax1.set_title('Calibration plots  (reliability curve)')
        plt.show()
        fig.savefig(PATH4+outputname+"_calibration_bin.pdf", bbox_inches='tight')

    except Exception as err:
        #best effort: the calibration plot is optional, but say why it failed
        print('calibration plot error',err)

    #plot averaged time curve
    print("################averaged time curve")
    #time curve
    timex=np.array(final9[0]['Time to ards_time(hours)'])
    timey=np.array(final9[0]['Encounter(%)'])
    for i in range(1,len(final9)):
        timex=np.array(final9[i]['Time to ards_time(hours)'])+timex
        timey=np.array(final9[i]['Encounter(%)'])+timey

    timex=timex/n_splits
    timey=timey/n_splits

    output=pd.DataFrame(columns=['Time to ards_time(hours)','Encounter(%)'])
    output['Time to ards_time(hours)']=timex
    output['Encounter(%)']=timey
    output.to_csv(PATH4+outputname+'_timecurve.csv',index=False)

    fig = plt.figure()
    ax = fig.gca()
    ax.set_xticks(np.arange(-48, 48, 12))
    ax.set_yticks(np.arange(0, 100, 10))
    plt.xlabel('Time to ards_time(hours)')
    plt.ylabel('Encounter(%)')
    plt.plot(timex, timey)
    plt.grid()
    plt.show()

    ###save averaged rocauc curve and precision recall curve

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it
    mean_tpr /= n_classes

    output=pd.DataFrame(columns=['fpr','tpr'])
    output['fpr'] = all_fpr
    output['tpr'] = mean_tpr
    output.to_csv(PATH4+outputname+'_rocauc_final.csv')

    ##prc
    all_precision = np.unique(np.concatenate([precision[i] for i in range(n_classes)]))

    # interpolate all curves at these points
    # NOTE(review): np.interp expects increasing x-coordinates; a fold's
    # precision values are not guaranteed to be monotonic -- confirm.
    mean_recall = np.zeros_like(all_precision)
    for i in range(n_classes):
        mean_recall += np.interp(all_precision, precision[i], recall[i])

    # Finally average it
    mean_recall /= n_classes

    output=pd.DataFrame(columns=['precision','recall'])
    output['precision'] = all_precision
    output['recall'] = mean_recall
    output.to_csv(PATH4+outputname+'_prc_final.csv')

    return bestparam,final1,final2,final11,final22,final3,final4,final5,final6,trainauc1,trainauc2


# Set parameters

In [None]:
# Base directories on the shared Z: drive.  Every backslash is escaped
# explicitly: sequences such as "\p", "\A" and "\m" are invalid string escapes
# (a warning today, an error in future Python versions), and a raw string
# cannot end with a backslash.
PATH1='Z:\\patient-adjudication-results\\'
PATH2='Z:\\project-datasets\\ARDS\\ml_algorithms\\final_datasets_alternative\\'
PATH3='Z:\\project-datasets\\ARDS\\ml_algorithms\\'
PATH4='Z:\\project-datasets\\ARDS\\ml_algorithms\\model_outputs_testing\\'

In [None]:
# Experiment switches read by the cells below.
choose_model='logistic regression'
#what data we want to use: structured, structured+unstructured, structured+unstructured+order, or structured+unstructured+clinicalnotes
file='structured'
test_on_intubated=True
use_sample_weight=True
ifcalibrate=False
classweight=None #None or 'balanced'
# hyperopt evaluations per outer fold: fewer for logistic regression
MAX_EVALS = 20 if choose_model=='logistic regression' else 50

In [None]:
#configure hyperopt
# (module-level names are already global; no `global` statement is needed)
# Counter of objective() evaluations
ITERATION = 0
#Optimization Algorithm
tpe_algorithm = tpe.suggest

# Trials object to track progress
bayes_trials = Trials()

#save result history
# File to save first results
out_file = PATH4+'trials_'+file+'_'+choose_model+'_nested.csv'

# Write the headers to the file.  newline='' is what the csv module asks for,
# and the with-block guarantees the file is flushed and closed.
with open(out_file, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(['loss', 'params', 'iteration','train_time'])

In [None]:
###hyperopt
#objective function
# Placeholders for the inner-CV data; nested() rebinds both globals for every
# outer fold before it hands objective() to fmin.
trainset=0
trainset2=0
def objective(params):
    """hyperopt objective: 1 - mean inner-CV bin ROC AUC for ``params``.

    Builds the model selected by the global ``choose_model`` with ``params``,
    cross-validates it on the globals ``trainset`` / ``trainset2``, appends
    (loss, params, iteration, train_time) to the global ``out_file`` and
    returns the result dictionary hyperopt expects.

    Raises ValueError when ``choose_model`` is not a supported model name.
    """
    # Keep track of evals
    global ITERATION
    ITERATION += 1

    model_classes={'logistic regression':LogisticRegression,
                   'random forest':RandomForestClassifier,
                   'lda':LinearDiscriminantAnalysis,
                   'lightgbm':lgb.LGBMClassifier}
    if choose_model not in model_classes:
        raise ValueError('unknown choose_model %r' % (choose_model,))

    start = timer()

    # Perform 5-fold cross validation.  lda must be flagged so crossvalidate()
    # does not pass sample_weight to a fit() that does not accept it.
    score =crossvalidate(model_classes[choose_model](**params),trainset,trainset2,
                         lda=(choose_model=='lda'),useweight=use_sample_weight)

    run_time = timer() - start

    # Loss must be minimized
    loss = 1-score

    # Append to the csv file; the with-block flushes and closes it so that
    # nested() reads a complete history afterwards
    with open(out_file, 'a', newline='') as of_connection:
        csv.writer(of_connection).writerow([loss, params, ITERATION,run_time])

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'train_time': run_time, 'status': STATUS_OK}


# Define the search space
def uniform_int(name, lower, upper):
    """Integer-valued uniform hyperopt dimension on [lower, upper].

    `quniform` returns round(uniform(low, high) / q) * q, which is a float
    (e.g. 5.0).  The draw is therefore cast with hyperopt's `scope.int` so the
    estimator receives a real int.
    """
    from hyperopt.pyll import scope
    return scope.int(hp.quniform(name, lower, upper, q=1))

def loguniform_int(name, lower, upper):
    """Integer-valued log-uniform hyperopt dimension on [lower, upper].

    `qloguniform` takes its bounds in log space, so the logarithm of the lower
    and upper bounds is applied here.  The rounded draw is still a float, so
    it is cast with hyperopt's `scope.int`.
    """
    from hyperopt.pyll import scope
    return scope.int(hp.qloguniform(name, np.log(lower), np.log(upper), q=1))

#logistic regression
# The solver is pinned because the 'l1' penalty is not supported by every
# LogisticRegression solver; 'liblinear' supports it.
space1={'penalty' : hp.choice('penalty', ['l1']),
 'solver':hp.choice('solver', ['liblinear']),
 'C':hp.uniform('C', 0.001,0.2),
 'class_weight':hp.choice('class_weight', [classweight]),
 'n_jobs':hp.choice('n_jobs', [-1]),
 'random_state':hp.choice('random_state', [1234])}

#random forest

space2 = {'bootstrap': hp.choice('bootstrap', [True]),
'max_depth': uniform_int('max_depth', 2, 30),
'max_features': hp.choice('max_features', ['auto']),
'min_samples_leaf': uniform_int('min_samples_leaf', 2, 30),
'min_samples_split': uniform_int('min_samples_split',10, 200),
'n_estimators': hp.choice('n_estimators', [200,400]),
'class_weight':hp.choice('class_weight', [classweight]),
'n_jobs':hp.choice('n_jobs', [-1]),
'random_state':hp.choice('random_state', [1234])}

#lightgbm

space3 = {
        'class_weight':hp.choice('class_weight', [classweight]),
        'num_leaves': uniform_int('num_leaves', 4, 32),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.1)),
        'subsample': hp.uniform('subsample', 0.2, 0.6),
        # qloguniform takes log-space bounds; passing 10 and 200 directly
        # sampled values between e**10 and e**200.  loguniform_int applies
        # np.log to the bounds, giving the intended 10..200 range.
        'min_data_in_leaf': loguniform_int('min_data_in_leaf', 10, 200),
        'reg_alpha': hp.uniform('reg_alpha', 0.1, 0.6), #L1 regularization
        'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 0.5),
        'max_depth': uniform_int('max_depth', 2, 30),
        'objective':'binary',
        'silent':False,
        'n_estimators': hp.choice('n_estimators', [200,400]),
        'random_state':hp.choice('random_state', [1234]),
        'n_jobs':hp.choice('n_jobs', [-1])

    }



#lda
space4={'solver':'lsqr',
            'shrinkage': hp.uniform('shrinkage', 0, 0.9),
            'n_components':None}

# Read in data

In [None]:
# Adjudication results: one 'ards_time' per EncounterID; '.' marks a missing time.
ards=(
    pd.read_csv(PATH1+'current-ards-review-results_2_25_2020.csv',dtype={'mrn': str})
    .rename(columns={'encounterid':'EncounterID'})
)
# blank out the '.' placeholders, then parse the column as datetimes
ards['ards_time']=pd.to_datetime(ards['ards_time'].mask(ards['ards_time']=='.'))

In [None]:
#We've settled on using data binned every 6h for training, and data binned every 2h for validation and testing
#This script can run on any training data in the format of 'EncounterID','PatientID','ards','time','sampleweight',features
if 'only' in file:
    # un-binned dataset: one file serves both roles and the bin width is 0 hours
    bintrain=bintest=0
    filename1=filename2=PATH2+file+'_train.csv'
else:
    # bin widths in hours; they also name the input files
    bintrain,bintest=6,2
    filename1=PATH2+file+'_%dHtrain.csv' % bintrain
    filename2=PATH2+file+'_%dHtrain.csv' % bintest

train_str,train_str2=pd.read_csv(filename1),pd.read_csv(filename2)

In [None]:
#get list of encounters that have been intubated
structured=pd.read_csv(PATH2+'structured_data.csv')
# reviewed 2016 cohort encounters with an adjudicated pt_ards value
# (note: this rebinds the global `trainset` that nested() later overwrites)
in_reviewed_cohort=(
    (ards['year']==2016)
    &(ards['not_reviewed']==0)
    &(ards['not_cohort']==0)
    &(pd.notnull(ards['pt_ards']))
)
trainset=ards.loc[in_reviewed_cohort,'EncounterID'].unique().tolist()
# rows of those encounters recorded with invasive support
intubated=structured[structured['EncounterID'].isin(trainset)&(structured['support']=='invasive')]
intubated_encounters=intubated['EncounterID'].unique().tolist()

# Nested cross validation

In [None]:
# Run the nested cross validation.  train_str is passed as both the training
# and the evaluation frame, hence same=True.
(bestparam,
 scores1, scores2, scores11, scores22,
 scores3, scores4, scores5, scores6,
 trainauc1, trainauc2) = nested(
    choose_model,
    train_str,
    train_str,
    same=True,
    outputname=file+' '+choose_model+'_nested',
    isintubated=test_on_intubated,
    useweight=use_sample_weight,
    calibrate=ifcalibrate,
    inputintubated=train_str[train_str['EncounterID'].isin(intubated_encounters)],
)

In [None]:
# Hyperparameter string selected in each outer fold, as returned by nested()
bestparam

In [None]:
# Test-set AUCs averaged over the outer folds
mean_bin_rocauc=np.mean(scores1,axis=0)
mean_enc_rocauc=np.mean(scores2,axis=0)
mean_bin_prcauc=np.mean(scores11,axis=0)
mean_enc_prcauc=np.mean(scores22,axis=0)
print('bin rocauc:',mean_bin_rocauc,
      'enc rocauc:',mean_enc_rocauc,
      'bin prcauc',mean_bin_prcauc,
      'enc prcauc',mean_enc_prcauc)

In [None]:
# Train-set ROC AUCs averaged over the outer folds
mean_train_bin_rocauc=np.mean(trainauc1,axis=0)
mean_train_enc_rocauc=np.mean(trainauc2,axis=0)
print('train bin rocauc:',mean_train_bin_rocauc,
      'train enc rocauc:',mean_train_enc_rocauc)

In [None]:
# Thresholded encounter-level metrics averaged over the outer folds.
# scores6 holds one 4-tuple per fold, so its mean is taken component-wise.
mean_sensitivity=np.mean(scores3,axis=0)
mean_specificity=np.mean(scores4,axis=0)
mean_ppv=np.mean(scores5,axis=0)
mean_timediff=np.mean(scores6,axis=0)
print('sensitivity:',mean_sensitivity,
      'specificity:',mean_specificity,
      'ppv:',mean_ppv,
      'timediff:',mean_timediff)

In [None]:
###time curve
# One line per saved '<outputname>_timecurve.csv' (the fold-averaged curve
# written by nested()); the legend label is the file name up to the first '_'.
filenames=[PATH4+file+' '+choose_model+'_nested'+'_timecurve.csv']
n_classes=len(filenames)
colors=['red','green','blue','orange','purple','gray']

fig = plt.figure()
ax = fig.gca()
ax.set_xticks(np.arange(-48, 48, 12))
ax.set_yticks(np.arange(0, 100, 10))
ax.set_xlabel('Time to ards_time(hours)')
ax.set_ylabel('Encounter(%)')
for i, curve_path in enumerate(filenames):
    temp=pd.read_csv(curve_path)
    curve_label=curve_path.split('\\')[-1].split('_')[0]
    ax.plot(temp['Time to ards_time(hours)'], temp['Encounter(%)'],
            label=curve_label,color=colors[i],linewidth=0.8,linestyle='-')
    ax.legend(loc="lower right")

ax.grid()
plt.show()
fig.savefig(PATH4+file+' '+choose_model+'_nested'+"_timecurve.pdf", bbox_inches='tight')

In [None]:
#roc auc
# NOTE(review): '<outputname>_rocauc.csv' is rewritten by getroc_encounter() in
# every outer fold, so it holds the last fold only; the fold-averaged curve is
# in '<outputname>_rocauc_final.csv' -- confirm which one is meant here.
filenames=[PATH4+file+' '+choose_model+'_nested'+'_rocauc.csv']
n_classes=len(filenames)
colors=['red','green','blue','orange','purple','gray']
fpr={}
tpr={}
roc_auc ={}

# Plot all ROC curves
fig=plt.figure()
for i, curve_path in enumerate(filenames):
    temp=pd.read_csv(curve_path)
    fpr[i], tpr[i]=temp['fpr'],temp['tpr']
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])
    curve_label=curve_path.split('\\')[-1].split('_')[0]+'(area = {0:0.4f})'.format(roc_auc[i])
    plt.plot(fpr[i], tpr[i],
             label=curve_label,
             color=colors[i],  linewidth=0.8,linestyle='-')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC AUC Curve')
plt.legend(loc="best")
plt.show()
fig.savefig(PATH4+file+' '+choose_model+'_nested'+"_rocauc.pdf", bbox_inches='tight')

In [None]:
#prc
# NOTE(review): '<outputname>_prc.csv' is rewritten by getroc_encounter() in
# every outer fold, so it holds the last fold only; the fold-averaged curve is
# in '<outputname>_prc_final.csv' -- confirm which one is meant here.
filenames=[PATH4+file+' '+choose_model+'_nested'+'_prc.csv']
n_classes=len(filenames)
colors=['red','green','blue','orange','purple','gray']
precision={}
recall={}
prc ={}

# Plot all precision-recall curves
fig=plt.figure()
for i, curve_path in enumerate(filenames):
    temp=pd.read_csv(curve_path)
    precision[i], recall[i]=temp['precision'],temp['recall']
    prc[i] = metrics.auc( recall[i],precision[i])
    curve_label=curve_path.split('\\')[-1].split('_')[0]+'(area = {0:0.4f})'.format(prc[i])
    plt.plot(recall[i],precision[i],
             label=curve_label,
             color=colors[i],  linewidth=0.8,linestyle='-')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve')
plt.legend(loc="best")
plt.show()
fig.savefig(PATH4+file+' '+choose_model+'_nested'+"_prc.pdf", bbox_inches='tight')