In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta

# Functions

In [None]:
# Assuming the data has more than two columns, you can read in just the
# columns to sort by, sort them, and save the index of the sorted dataframe.
# Then you can read in the original data and rearrange it using that
# sorted index.
def sortdf(data,temp,colstosort):
    """Reorder ``data`` to follow the ascending sort order of ``temp``.

    Parameters
    ----------
    data : DataFrame to rearrange.
    temp : DataFrame containing at least the ``colstosort`` columns; its index
        labels are used to look up rows of ``data``.
    colstosort : list of column names in ``temp`` to sort by (all ascending).

    Returns
    -------
    ``data`` reindexed by the index labels of the sorted ``temp``.
    """
    # one ascending flag per sort column
    directions=[True for _ in range(len(colstosort))]

    # only the index of the sorted key frame is needed to rearrange the full data
    sorted_labels=temp.sort_values(colstosort, ascending=directions).index

    return data.reindex(sorted_labels)

In [None]:
#Fill in the values that are still missing
#get dummy variables
#convert ards_scale_1 to sample weight
def process(data,trainset,testset):
    """Impute, weight and one-hot encode a dataset, then split it by encounter.

    Steps, in order:
      1. +/-inf values are treated as missing.
      2. Within each EncounterID every column is carried forward by at most one row.
      3. Values still missing are filled with the column median (numeric columns),
         and whatever is left is filled with the string 'unknown'.
      4. ``ards_scale_1`` is converted to ``sampleweight``: values in [0.5, 1] are
         kept, values below 0.5 become ``1 - value``; anything else gets no weight
         (NaN). ``ards_scale_1`` itself is dropped.
      5. Non-numeric feature columns are one-hot encoded with ``pd.get_dummies``;
         PatientID, ards, time and EncounterID are re-attached as the last columns.

    Parameters
    ----------
    data : DataFrame with at least the columns EncounterID, PatientID, ards,
        time and ards_scale_1. It is not modified.
    trainset, testset : collections of EncounterID values selecting the rows of
        the training and the test frame.

    Returns
    -------
    (train_str, test_str) : the processed rows whose EncounterID is in
        ``trainset`` / ``testset``.

    Notes
    -----
    The earlier implementation only ran on old pandas releases: it expected
    ``groupby().ffill()`` to return the grouping column, expected a ``level_2``
    column to come out of ``groupby().fillna().reset_index()``, called
    ``DataFrame.median()`` on a mixed-dtype frame and passed ``axis`` positionally
    to ``drop``. This version avoids all of those. To stay consistent with that
    grouped fill, rows without an EncounterID or PatientID are dropped and rows
    are ordered by (EncounterID, PatientID) with their relative order kept
    -- NOTE(review): confirm this matches the outputs generated previously.
    """
    id_cols=['EncounterID','PatientID','ards','time']

    # work on a fresh frame with a clean positional index; inf is treated as missing
    structured_bin=data.replace([np.inf, -np.inf], np.nan).reset_index(drop=True)
    column_order=list(structured_bin.columns)

    #if data is still missing, carry forward 1 bin
    # (the grouping column is re-attached by hand because newer pandas does not return it)
    carry_cols=[c for c in column_order if c!='EncounterID']
    carried=structured_bin.groupby('EncounterID')[carry_cols].ffill(limit=1)
    carried['EncounterID']=structured_bin['EncounterID']
    structured_bin=carried[column_order]

    #Fill in the values that are still missing
    # medians are taken over the whole frame, not per encounter
    medians=structured_bin.median(numeric_only=True)
    structured_bin=structured_bin.dropna(subset=['EncounterID','PatientID'])
    structured_bin=structured_bin.sort_values(['EncounterID','PatientID'],kind='mergesort')
    structured_bin=structured_bin.fillna(medians).reset_index(drop=True)

    # anything without a median (e.g. text columns) becomes the category 'unknown';
    # only columns that still contain gaps are touched
    for col in structured_bin.columns[structured_bin.isnull().any()]:
        structured_bin[col]=structured_bin[col].fillna('unknown')

    #convert ards_scale_1 to sample weight
    scale=structured_bin['ards_scale_1']
    weight=scale.where((scale<=1) & (scale>=0.5))
    weight=weight.mask(scale<0.5,1-scale)
    structured_bin['sampleweight']=weight
    structured_bin=structured_bin.drop(columns=['ards_scale_1'])

    #get dummy variables
    ids=structured_bin[id_cols].copy()
    structured_bin=pd.get_dummies(structured_bin.drop(columns=id_cols))
    for col in ['PatientID','ards','time','EncounterID']:
        structured_bin[col]=ids[col]

    #train test split
    test_str=structured_bin[structured_bin['EncounterID'].isin(testset)]
    train_str=structured_bin[structured_bin['EncounterID'].isin(trainset)]

    return train_str,test_str

In [None]:
def merge_large(data1,data2,how,mergeon,size=100):
    """Merge two large frames in chunks of encounters, staging the result on disk.

    The encounters of ``data1`` are processed ``size`` at a time: the matching
    rows of both frames are merged, sorted by (EncounterID, time) and appended
    to the scratch file ``PATH5 + 'temp.csv'``, which is read back at the end.

    Parameters
    ----------
    data1, data2 : DataFrames that both have an ``EncounterID`` column. The
        merged result must contain a ``time`` column.
    how, mergeon : passed to ``pd.merge`` as ``how`` and ``on``.
    size : number of encounters merged per chunk (default 100, the value that
        used to be hard-coded). Must be positive.

    Returns
    -------
    The merged DataFrame as re-read from the scratch CSV, with ``time`` parsed
    as datetime. Because of the CSV round trip, all other columns have
    whatever dtype ``pd.read_csv`` infers.

    Notes
    -----
    Only encounters present in ``data1`` are processed, so rows of ``data2``
    belonging to other encounters never appear in the result, even for
    ``how='outer'``.
    """
    scratch_file=PATH5+'temp.csv'
    encounterids=data1.EncounterID.unique().tolist()

    # Always process at least one (possibly empty) chunk so the scratch file is
    # rewritten on every call; otherwise an empty data1 would read back a stale
    # temp.csv left by an earlier call (or fail because the file is missing).
    list_of_encounter=[encounterids[i:i+size] for i in range(0, len(encounterids),size)] or [[]]

    for chunk_no,chunk in enumerate(list_of_encounter):
        piece=pd.merge(data1[data1.EncounterID.isin(chunk)],data2[data2.EncounterID.isin(chunk)],how=how,on=mergeon)
        piece=sortdf(piece,piece[['EncounterID','time']],['EncounterID','time'])
        # the first chunk creates the file and writes the header, later chunks append rows only
        is_first=chunk_no==0
        piece.to_csv(scratch_file,mode='w' if is_first else 'a',header=is_first,index=False)
        del piece

    merged=pd.read_csv(scratch_file)
    merged.time=pd.to_datetime(merged.time)
    return merged

In [None]:
def carryhours(data,col,hours):
    """Carry ``col`` forward within each encounter for at most ``hours`` hours.

    A missing value is replaced by the most recent observed value of the same
    EncounterID, unless that observation is more than ``hours`` hours older
    than the row's ``time``; in that case the value stays missing.

    Parameters
    ----------
    data : DataFrame with ``EncounterID``, a datetime ``time`` column and
        ``col``. Rows are expected to be in time order within each encounter
        (the fill follows row order). The frame is not modified.
    col : name of the column to carry forward.
    hours : maximum age, in hours, of a carried value.

    Returns
    -------
    A new DataFrame with the same columns as ``data`` and ``col`` replaced.
    Earlier versions also overwrote ``col`` on the passed frame and left a
    ``<col>carry`` helper column on it; callers should use the return value.
    """
    encounter=data['EncounterID']

    # time of the latest real observation of `col` seen so far in the encounter
    last_seen=data['time'].where(data[col].notnull()).groupby(encounter).ffill()

    carried=data.groupby(['EncounterID'])[col].ffill()

    # rows with no earlier observation compare against NaT, which is never "expired"
    expired=(data['time']-last_seen)>timedelta(hours=hours)

    return data.assign(**{col:carried.mask(expired)})

In [None]:
def preprocessing(include_unstructured,include_othernotes_merge,include_orders,clinicalnotes_only,ctakes_only,unigrambigram_only,unstructured_only,medication_only,include_radiology50,bintrain,bintest):
    """Build one train/test dataset for a given feature configuration and write it to CSV.

    The function reads the ARDS review results from PATH1 and the feature files
    from PATH2, labels every row with ``ards`` (1 from ``ards_time`` onwards,
    otherwise 0), derives ``ards_scale_1`` (later turned into a sample weight by
    ``process``), splits encounters into train (year 2016) and test (year 2017)
    and writes the resulting CSV files to PATH5.

    It runs in one of two modes:

    * "only" mode -- taken when any of ``clinicalnotes_only``, ``ctakes_only``,
      ``unigrambigram_only``, ``unstructured_only`` or ``medication_only`` is
      true. A single feature file is loaded, combined with ``pf`` (skipped for
      ``medication_only``) and ``support`` from the structured data, and written
      as ``<name>_train.csv`` / ``<name>_test.csv``. No binning is done and the
      ``include_*`` / ``bin*`` arguments are not used.
    * structured mode -- otherwise. The structured data is optionally extended
      with other sources (``include_*`` flags), missing values are carried
      forward according to ``carrydic``, rows are aggregated into ``bintrain``-
      and ``bintest``-hour bins, and three files are written: the train set at
      both bin sizes and the test set at the ``bintest`` bin size.

    Parameters
    ----------
    include_unstructured : merge the columns selected from
        'unstructured_reports_no_unigram_updated.csv' into the structured data.
    include_othernotes_merge : merge 'clinical_notes_ctakes_250_split1.csv'.
    include_orders : merge 'treatments_orders_split1.csv' (carried forward 72 h).
    include_radiology50 : merge 'chest_xray_ctakes_50_split1.csv'; its columns
        get the suffix 'xray'.
    clinicalnotes_only, ctakes_only, unigrambigram_only, unstructured_only,
    medication_only : select the "only" mode and the feature file to use.
    bintrain, bintest : bin widths in hours for the training and test frames.

    Returns
    -------
    The string 'finished' in both modes.

    Review notes
    ------------
    * NOTE(review): ``drop(..., 1)`` passes ``axis`` positionally throughout;
      newer pandas releases only accept ``axis=1`` / ``columns=`` -- confirm the
      pandas version this notebook is pinned to.
    * NOTE(review): if several ``*_only`` flags are true, the input file is
      chosen by the first matching ``elif`` while the output name is chosen by
      the last matching ``if``, so the two can disagree.
    * NOTE(review): ``maxcols`` lists 'ddimer' while ``carrydic`` uses 'd-dimer',
      and lists 'paco2' twice; a 'd-dimer' column therefore falls through to the
      generic float/'last' branches of the aggregation setup.
    * NOTE(review): when ``include_unstructured`` and ``include_radiology50`` are
      both true, ``unstructuredcols`` is overwritten by the radiology block, so
      only the radiology columns are matched by the 'max' aggregation rule.
    """
    #read in data

    ards=pd.read_csv(PATH1+'current-ards-review-results_2_25_2020.csv',dtype={'mrn': str})
    ards.rename(columns={'encounterid':'EncounterID'},inplace=True)
    # '.' is used as the missing-value marker for admitdate in the review file
    ards.loc[ards['admitdate']=='.','admitdate']=np.nan
    ards.admitdate=pd.to_datetime(ards.admitdate)

    structured=pd.read_csv(PATH2+'structured_data.csv')
    # EncounterID -> PatientID lookup, used to attach PatientID to the other feature files
    pid=structured[['EncounterID','PatientID']].drop_duplicates(keep='first')
    structured.time=pd.to_datetime(structured.time)
    structured=structured.drop(['fio2_carry','mairp_carry','invasive', 'noninvasive', 'supl','hfnc','ra',
                               'vent_start', 'msc_rt',
                                'niv_mode', 'niv_start'],1) 
    
    structured=sortdf(structured,structured[['EncounterID','time']],['EncounterID','time'])
    
    #calculate percentage of missing value

    # number of missing cells per row
    structured['null_percent']=structured.isnull().sum(axis=1)

    # NOTE(review): at this point the column count already includes 'null_percent',
    # so the divisor is one more than the number of columns other than
    # EncounterID/PatientID/time -- confirm that this is intended
    structured['null_percent']=structured['null_percent']/(len(structured.columns)-3)
    
    #categorical variable mapping classes
    #carry forward vent_mode and resp_support while the support type is unchanged
    structured['vent_mode']=structured.groupby(['EncounterID','support'])['vent_mode'].ffill()
    structured['resp_support']=structured.groupby(['EncounterID','support'])['resp_support'].ffill()
    mapping=pd.read_csv(PATH2+'resp_map.csv')

    # recode resp_support: the third '_'-separated token of mapping['old'] is replaced by mapping['new']
    for index,row in mapping.iterrows():
        row['old']=row['old'].split('_')[2]
        if row['old']!='unknown':
            structured.loc[structured['resp_support']==row['old'],'resp_support']=row['new']
        else:
            structured.loc[structured['resp_support']==row['old'],'resp_support']='unknown'
        
    #combine support and resp_support
    # rows with support 'supl' are refined by resp_support; any 'supl' left over becomes 'unknown'
    structured.loc[(structured['resp_support']=='nc') & (structured['support']=='supl'),'support']='supl_nc'
    structured.loc[(structured['resp_support']=='mask')& (structured['support']=='supl'),'support']='supl_mask'
    structured.loc[(structured['resp_support']=='nrb')& (structured['support']=='supl'),'support']='supl_nrb'
    structured.loc[(structured['resp_support']=='trach')& (structured['support']=='supl'),'support']='supl_trach'
    structured.loc[(structured['resp_support']=='ra') & (structured['support']=='supl'),'support']='ra'
    structured.loc[(structured['resp_support']=='hhfnc') & (structured['support']=='supl'),'support']='hfnc'
    structured.loc[(structured['resp_support']=='unknown')& (structured['support']=='supl'),'support']='unknown'
    structured.loc[(structured['support']=='supl'),'support']='unknown'
    
    structured=structured.drop('resp_support',1)
    
    structured['support']=structured.groupby(['EncounterID'])['support'].ffill()


    #get structured data's column names
    # (used later to drop rows that only carry data from the other sources)
    structured_cols=structured.drop(['EncounterID','PatientID','time'],1).columns.tolist()


    # ---- "only" mode: build a dataset from a single feature file and return ----
    if clinicalnotes_only or ctakes_only or unigrambigram_only or unstructured_only or medication_only:
        if unstructured_only:
            data=pd.read_csv(PATH2+'unstructured_reports_no_unigram_updated.csv')
            data=data[['EncounterID','time', 'azzam_neg',
               'herasevich_neg','ards_neg', 'edema_neg', 'infiltrate_neg',
               'pneumonia_neg', 'airspace_neg', 'aspiration_neg', 'opacity_neg']]
            
        elif ctakes_only:
            data=pd.read_csv(PATH2+'chest_xray_ctakes_1000_split1.csv')
        
        elif unigrambigram_only:
            data=pd.read_csv(PATH2+'unigram+bigram.csv')
        
        elif clinicalnotes_only:
            data=pd.read_csv(PATH2+'clinical_notes_ctakes_1000_split1.csv') 
            
        elif medication_only:
            data=pd.read_csv(PATH2+'treatments_orders_split1.csv')
        
        data.time=pd.to_datetime(data.time)
        
        # attach PatientID
        data=merge_large(data,pid,'left','EncounterID')

        if not medication_only:
            #merge with pf
            pf=structured[['EncounterID','time','pf']]
            data=merge_large(data,pf,'outer',['EncounterID','time'])
            #data=pd.merge(data,pf,how='outer',on=['EncounterID','time'])
            cols=data.drop(['EncounterID','PatientID','time'],1).columns.tolist()
            #carry forward 24h
            k=0
            for col in cols:
                #print(k)
                k+=1
                data=carryhours(data,col,24)
                #data[col]=data.groupby(['EncounterID',pd.Grouper(key='time', freq='24H')])[col].ffill()

            # keep only rows where every column has a value
            data=data.dropna(how='any')
            
            #print(data.shape)
        
        #merge with ards_scale
        data=merge_large(data,ards[['EncounterID','ards_scale_1']],'left','EncounterID')
        #data=pd.merge(data,ards[['EncounterID','ards_scale_1']],how='left',on='EncounterID')
        
        #merge with support
        data=merge_large(data,structured[['EncounterID','time','support']],'outer',['EncounterID','time'])
        cols=data.drop(['EncounterID','PatientID','time'],1).columns.tolist()
        #carry forward within each encounter (no time limit)
        k=0
        for col in cols:
            #print(k)
            k+=1
            data[col]=data.groupby(['EncounterID'])[col].ffill()

        data=data.dropna(how='any')

        #add labels
        #replace missing value with nan
        ards.loc[ards['ards_time']=='.','ards_time']=np.nan
        ards.ards_time=pd.to_datetime(ards.ards_time)

        #merge with ards_time
        data=merge_large(data,ards[['EncounterID','ards_time']],'left','EncounterID')
        # merge_large round-trips through CSV, so ards_time has to be parsed again
        data.ards_time=pd.to_datetime(data.ards_time)
        #data=pd.merge(data,ards[['EncounterID','ards_time']],how='left',on='EncounterID')

        #if there is no ards_time, the patient had never had ards so the label will be 0
        data.loc[pd.isnull(data['ards_time']),'ards']=0
        #if the patient had ards,records at or after ards_time labeled as 1
        data.loc[(data['ards_time']-timedelta(hours=0))<=data['time'] ,'ards']=1
        #if the patient had ards,records before ards_time labeled as 0
        data.loc[pd.isnull(data['ards']),'ards']=0

        data=data.drop(['ards_time'],1)
        
        #set ards_scale_1
        # rows not matched by any rule below keep the ards_scale_1 value merged from the review file
        #if pt_ards==0
        data.loc[(data['EncounterID'].isin(ards[ards['pt_ards']==0].EncounterID.unique()))&(~data['support'].isin(['invasive','noninvasive','hfnc'])),'ards_scale_1']=1

        #if pt_ards==1
        data.loc[(data['EncounterID'].isin(ards[ards['pt_ards']==1].EncounterID.unique()))&(~data['support'].isin(['invasive','noninvasive','hfnc']))&(data['ards']==0),'ards_scale_1']=1
        data.loc[(data['EncounterID'].isin(ards[ards['pt_ards']==1].EncounterID.unique()))&(data['support'].isin(['invasive','noninvasive','hfnc']))&(data['ards']==0),'ards_scale_1']=4
        data.loc[(data['EncounterID'].isin(ards[ards['pt_ards']==1].EncounterID.unique()))&(~data['support'].isin(['invasive','noninvasive','hfnc']))&(data['ards']==1),'ards_scale_1']=4

        data=data.drop(['support'],1)
        #normalize the ards_scale_1 (min-max scaling)
        data['ards_scale_1']=(data['ards_scale_1']-min(data['ards_scale_1']))/(max(data['ards_scale_1'])-min(data['ards_scale_1']))   
        
        #train test split
        # train = 2016 encounters, test = 2017 encounters; both restricted to
        # not_reviewed==0, not_cohort==0 and a non-missing pt_ards
        trainset=ards[(ards['year']==2016)&(ards['not_reviewed']==0)&(ards['not_cohort']==0)&(pd.notnull(ards['pt_ards']))].EncounterID.unique().tolist()
        testset=ards[(ards['year']==2017)&(ards['not_reviewed']==0)&(ards['not_cohort']==0)&(pd.notnull(ards['pt_ards']))].EncounterID.unique().tolist()

        train_str,test_str=process(data,trainset,testset)
      
        # make both frames share one column layout, taken from the frame with fewer columns
        # NOTE(review): presumably the smaller column set is always contained in the larger one -- confirm
        if len(train_str.columns)>=len(test_str.columns):
            train_str=train_str[test_str.columns]
        else:
            test_str=test_str[train_str.columns]

        test_str=test_str[train_str.columns] 
        
        # output name; with several flags set, the last matching one wins
        if clinicalnotes_only:
            filename='clinicalnotes_only1000'
        if ctakes_only:
            filename='ctakes_only_1000'
        if unigrambigram_only:
            filename='unigrambigram_only'
        if unstructured_only:
            filename='unstructured_only'
        if medication_only:
            filename='medication_only'

        train_str.to_csv(PATH5+filename+'_'+'train.csv',index=False)
        test_str.to_csv(PATH5+filename+'_'+'test.csv',index=False)
        print(filename+'_'+'train.csv')
        print(filename+'_'+'test.csv')
        
        return 'finished'
    

    # ---- structured mode: optionally extend the structured data with other sources ----
    if include_unstructured:
        #merge with the unstructured data
        unstructured=pd.read_csv(PATH2+'unstructured_reports_no_unigram_updated.csv')
        unstructured.time=pd.to_datetime(unstructured.time)
        unstructured=unstructured[['EncounterID','time', 'azzam_neg',
               'herasevich_neg','ards_neg', 'edema_neg', 'infiltrate_neg',
               'pneumonia_neg', 'airspace_neg', 'aspiration_neg', 'opacity_neg']]

        #add PatientID
        unstructured=pd.merge(unstructured,pid,how='left',on='EncounterID')

        structured=pd.merge(structured,unstructured,how='outer',on=['EncounterID','PatientID','time'])
        #sort
        structured=sortdf(structured,structured[['EncounterID','time']],['EncounterID','time'])

        #carry forward unstructured data within each encounter
        unstructuredcols=unstructured.drop(['EncounterID','PatientID','time'],1).columns.tolist()
        for col in unstructuredcols:
            structured[col]=structured.groupby(['EncounterID'])[col].ffill()

        #drop rows that only have unstructured data
        structured = structured.dropna(subset=structured_cols, how='all')

        #fill in 0 when the unstructured data is missing
        structured[unstructuredcols] = structured[unstructuredcols].fillna(value=0)
        
        
    if include_radiology50:
        #merge with the chest x-ray ctakes data
        unstructured=pd.read_csv(PATH2+'chest_xray_ctakes_50_split1.csv')
        unstructured.time=pd.to_datetime(unstructured.time)

        #add PatientID
        unstructured=pd.merge(unstructured,pid,how='left',on='EncounterID')

        structured=pd.merge(structured,unstructured,how='outer',on=['EncounterID','PatientID','time'])
        #sort
        structured=sortdf(structured,structured[['EncounterID','time']],['EncounterID','time'])

        #carry forward unstructured data within each encounter
        # NOTE(review): this overwrites the unstructuredcols list built by the include_unstructured block
        unstructuredcols=unstructured.drop(['EncounterID','PatientID','time'],1).columns.tolist()
        #rename: suffix the radiology columns with 'xray'
        for col in unstructuredcols:
            tempcol=col+'xray'
            structured.rename(columns={col: tempcol},inplace=True)
        
        for col in unstructuredcols:
            tempcol=col+'xray'
            structured[tempcol]=structured.groupby(['EncounterID'])[tempcol].ffill()

        #drop rows that only have unstructured data
        structured = structured.dropna(subset=structured_cols, how='all')

        unstructuredcols=[i+'xray' for i in unstructuredcols]
        #fill in 0 when the unstructured data is missing
        structured[unstructuredcols] = structured[unstructuredcols].fillna(value=0)

    if include_orders:
        #merge with medication orders
        orders=pd.read_csv(PATH2+'treatments_orders_split1.csv')
        orders.time=pd.to_datetime(orders.time)

        #add patient ID
        orders=pd.merge(orders,pid,how='left',on='EncounterID')
        structured=pd.merge(structured,orders,how='outer',on=['EncounterID','PatientID','time'])

        structured=structured.sort_values(['EncounterID','time'], ascending=[True,True])

        #carry forward 72 h, then treat anything still missing as 0
        orderscols=orders.drop(['EncounterID','PatientID','time'],1).columns.tolist()
        for col in orderscols:
            structured=carryhours(structured,col,72)
            structured[col]=structured[col].fillna(0)

        #drop rows that only have orders data
        structured = structured.dropna(subset=structured_cols, how='all')
            

    #merge with clinical notes' ctakes features 
    if include_othernotes_merge:

        othernotes=pd.read_csv(PATH2+'clinical_notes_ctakes_250_split1.csv')
        othernotes.time=pd.to_datetime(othernotes.time)
        othernotes=pd.merge(othernotes,pid,how='left',on='EncounterID')
        othercols=othernotes.drop(['EncounterID','PatientID','time'],1).columns.tolist()

        # merge 100 encounters at a time and stage the result in a CSV file
        encounterids=structured.EncounterID.unique()
        size=100
        list_of_encounter = [encounterids[i:i+size] for i in range(0, len(encounterids),size)]

        #print(len(list_of_encounter))
        k=0
        for e in list_of_encounter:
            #print(k)
            k+=1
            temp=pd.merge(structured[structured.EncounterID.isin(e)],othernotes[othernotes.EncounterID.isin(e)],how='outer',on=['EncounterID','PatientID','time'])
            temp=sortdf(temp,temp[['EncounterID','time']],['EncounterID','time'])
            for col in othercols:
                temp[col]=temp.groupby(['EncounterID'])[col].ffill()
           
            # drop rows that only have clinical-notes data, then treat missing note features as 0
            temp = temp.dropna(subset=structured_cols, how='all')
            temp[othercols]=temp[othercols].fillna(0)
            # first chunk writes the header, later chunks are appended
            if k==1:
                temp.to_csv(PATH5+'merge_clinical.csv',index=False,header='column_names')
            else:
                temp.to_csv(PATH5+'merge_clinical.csv', mode='a', header=None,index=False)
            del temp

        del othernotes
        del structured

        structured=pd.read_csv(PATH5+'merge_clinical.csv')
        structured.time=pd.to_datetime(structured.time)       
        
    if include_othernotes_merge and include_radiology50:
        #merge two columns if the cui code is the same
        allcols=structured.columns.tolist()
        
        # columns whose name is one character followed by digits only
        # presumably these are the cui-coded clinical-notes columns -- TODO confirm
        notecols=[col for col in allcols if col[1:].isdecimal()]
        
        for col in notecols:
            tempcol=col+'xray'
            if tempcol in allcols:
                structured.loc[structured[tempcol]==1,col]=1
                structured=structured.drop(tempcol,1)
                
        
    #add ards_scale_1
    structured=pd.merge(structured,ards[['EncounterID','ards_scale_1']],how='left',on='EncounterID')
    

    #add labels
    #replace missing value with nan
    ards.loc[ards['ards_time']=='.','ards_time']=np.nan
    ards.ards_time=pd.to_datetime(ards.ards_time)

    #merge with ards_time
    structured=pd.merge(structured,ards[['EncounterID','ards_time']],how='left',on='EncounterID')

    #if there is no ards_time, the patient had never had ards so the label will be 0
    structured.loc[pd.isnull(structured['ards_time']),'ards']=0
    #if the patient had ards,records at or after ards_time labeled as 1
    structured.loc[(structured['ards_time']-timedelta(hours=0))<=structured['time'] ,'ards']=1
    #if the patient had ards,records before ards_time labeled as 0
    structured.loc[pd.isnull(structured['ards']),'ards']=0

    structured=structured.drop(['ards_time'],1)


    structured=sortdf(structured,structured[['EncounterID','time']],['EncounterID','time'])
    
    #deal with missing height and weight
    #each encounter was assigned with 1 height and 1 weight (the encounter mean);
    #age and gender take the first value of the encounter
    height=structured.groupby(['EncounterID','PatientID'])['height'].mean().reset_index(drop=False)

    weight=structured.groupby(['EncounterID','PatientID'])['weight'].mean().reset_index(drop=False)
    
    age=structured.groupby(['EncounterID','PatientID'])['AgeInYears'].first().reset_index(drop=False)

    gender=structured.groupby(['EncounterID','PatientID'])['GenderCode'].first().reset_index(drop=False)


    structured=structured.drop(['height','weight','AgeInYears','GenderCode'],1)

    structured=pd.merge(structured,height,how='left',on=['EncounterID','PatientID'])
    structured=pd.merge(structured,weight,how='left',on=['EncounterID','PatientID'])
    structured=pd.merge(structured,age,how='left',on=['EncounterID','PatientID'])
    structured=pd.merge(structured,gender,how='left',on=['EncounterID','PatientID'])
    
    
    ##calculate compliance and VR
    # vent modes for which compliance is computed from Vtset
    usevtset=['VC+/AC'
    ,'VC+/IMV'
    ,'VC/AC'
    ,'VC/IMV'
    ,'VC/MMV']
    # vent modes for which compliance is computed from Vte
    usevtobs=['PC-IMV+/BiLevel'
    ,'PC/PSV'
    ,'PC/AC'
    ,'PC/IMV'
    ]

    # Compliance = tidal volume / (plat - peep); a missing vent_mode uses Vtset
    structured.loc[structured['vent_mode'].isin(usevtset),'Compliance']=structured['Vtset']/(structured['plat']-structured['peep'])
    structured.loc[structured['vent_mode'].isin(usevtobs),'Compliance']=structured['Vte']/(structured['plat']-structured['peep'])
    structured.loc[pd.isnull(structured['vent_mode']),'Compliance']=structured['Vtset']/(structured['plat']-structured['peep'])
    # values outside [0, 200] are discarded
    structured.loc[(structured['Compliance']<0)|(structured['Compliance']>200),'Compliance']=np.nan

    structured=structured.drop('vent_mode',1)
    
    #VR - Minute ventilation (VE) * PaCO2 * 1000 / (predicted body weight * 100 *37.5)
    # NOTE(review): the code uses the 'weight' column (encounter mean weight) as the body weight -- confirm
    structured['VR']=structured['ve']*structured['paco2']*1000/(structured['weight']*100*37.5)
    # values outside [0.4, 10] are discarded
    structured.loc[(structured['VR']<0.4)|(structured['VR']>10),'VR']=np.nan
    
    ####deal with missing data
    structured.time=pd.to_datetime(structured.time)
    #carry forward
    # per-column carry rule: '<N>H' = at most N hours, 'encounter' = for the whole
    # encounter, 'support' = while the support type is unchanged within the encounter
    carrydic={'temp':'8H','hr':'8H','rr':'8H','sbp':'8H','dbp':'8H','gcs':'24H','rass':'24H','shock_indx':'8H',

             'spo2':'8H','fio2':'8H','pf':'48H','sf':'48H','support':'encounter',

             'peep':'support','plat':'support','mairp':'support','ve':'support',

            'o2flow_rate':'support', 'Vte':'support', 'Vtset':'support',

            'Compliance':'support', 'VR':'support', 'oi':'support',

            'lactate':'48H','ph':'48H','paco2':'48H','pao2': '48H',

            'na': 'encounter','k': 'encounter','hco2': 'encounter','bun': 'encounter','cr':'encounter',

             'alb':'encounter','tp':'encounter','tbili':'encounter','ast':'encounter','hgb': 'encounter','wbc':'encounter',

             'plt': 'encounter','inr': 'encounter','ptt': 'encounter','bnp':'encounter','trop':'encounter','procalcitonin':'encounter',

             'd-dimer':'encounter'}

    for col in carrydic.keys():
        #carry forward if the encounterid didn't change
        if carrydic[col]=='encounter':
            structured[col]=structured.groupby(['EncounterID'])[col].ffill()
        #carry forward if the support type didn't change
        elif carrydic[col]=='support':
            structured[col]=structured.groupby(['EncounterID','support'])[col].ffill()
        #carry forward N hours
        else:
            # the rule is '<N>H'; strip the trailing 'H' to get the number of hours
            structured=carryhours(structured,col,int(carrydic[col][:-1]))
            #structured[col]=structured.groupby(['EncounterID',pd.Grouper(key='time', freq=carrydic[col])])[col].ffill()


    #set ards_scale_1
    # rows not matched by any rule below keep the ards_scale_1 value merged from the review file
    #if pt_ards==0
    structured.loc[(structured['EncounterID'].isin(ards[ards['pt_ards']==0].EncounterID.unique()))&(~structured['support'].isin(['invasive','noninvasive','hfnc'])),'ards_scale_1']=1

    #if pt_ards==1
    structured.loc[(structured['EncounterID'].isin(ards[ards['pt_ards']==1].EncounterID.unique()))&(~structured['support'].isin(['invasive','noninvasive','hfnc']))&(structured['ards']==0),'ards_scale_1']=1
    structured.loc[(structured['EncounterID'].isin(ards[ards['pt_ards']==1].EncounterID.unique()))&(structured['support'].isin(['invasive','noninvasive','hfnc']))&(structured['ards']==0),'ards_scale_1']=4
    structured.loc[(structured['EncounterID'].isin(ards[ards['pt_ards']==1].EncounterID.unique()))&(~structured['support'].isin(['invasive','noninvasive','hfnc']))&(structured['ards']==1),'ards_scale_1']=4

    #normalize the ards_scale_1 (min-max scaling)
    structured['ards_scale_1']=(structured['ards_scale_1']-min(structured['ards_scale_1']))/(max(structured['ards_scale_1'])-min(structured['ards_scale_1']))


    ###binning the data 

    # maps output column -> aggregation applied inside each time bin
    aggregation_functions = {}
    floatcols=structured.loc[:, structured.dtypes == np.float64].columns.tolist()

    
    # minmaxcols are split into '<col>_min' and '<col>_max' outputs
    minmaxcols=['plat','mairp','pf','sf','oi','peep']
    # NOTE(review): 'paco2' is listed twice and 'ddimer' does not match the 'd-dimer' key used in carrydic
    maxcols=['temp' ,'hr', 'rr', 'shock_indx' ,'fio2','o2flow_rate', 'bnp','procalcitonin','inr','fluid_bal','ve','paco2','VR','ptt','lactate','paco2','wbc', 'ddimer','trop']
    mincols=['dbp','sbp','gcs','rass', 'spo2','alb','plt', 'tp','hgb','Vtset', 'pao2','Compliance','ph', 'na','k','hco2', 'ast']
    meancols=['ards_scale_1','ra','null_percent','height','weight']
    #set the aggregation function for binning the data
    # the first matching rule wins; remaining float columns use 'min', everything else 'last'
    for col in structured.columns:
        if col=='ards' :
            aggregation_functions[col]='max'
        elif include_orders and col in orderscols :
            aggregation_functions[col]='max'
        elif include_othernotes_merge and col in othercols:
            aggregation_functions[col]='max'
        elif (include_unstructured or include_radiology50) and col in unstructuredcols:
            aggregation_functions[col]='max'
        elif col in meancols:
            aggregation_functions[col]='mean'
        elif col in ['EncounterID','PatientID','time']:
            # grouping keys are not aggregated
            continue
        elif col in minmaxcols:
            name=col+'_min'
            aggregation_functions[name]='min'
            name=col+'_max'
            aggregation_functions[name]='max'
        elif col in maxcols:
            aggregation_functions[col]='max'
        elif col in mincols:
            aggregation_functions[col]='min'
        elif col in floatcols:   
            aggregation_functions[col]='min'
        else:
            aggregation_functions[col]='last'
            
    
    # duplicate each min/max column into '<col>_min' and '<col>_max' so both aggregations can be applied
    for col in minmaxcols:
        name=col+'_min'
        structured[name]=structured[col].copy()
        name=col+'_max'
        structured[name]=structured[col].copy()
        structured=structured.drop(col,1)
    
    print(aggregation_functions)

    
    # structured_bin: bintrain-hour bins; structured_bin2: bintest-hour bins
    structured_bin = structured.groupby(['EncounterID','PatientID',pd.Grouper(key='time', freq=(str(bintrain)+'H'))]).aggregate(aggregation_functions).reset_index(drop=False)
    structured_bin2 = structured.groupby(['EncounterID','PatientID',pd.Grouper(key='time', freq=(str(bintest)+'H'))]).aggregate(aggregation_functions).reset_index(drop=False)

    #fill in missing values with predefined values
    fillin={'temp':98.3,'hr':80,'rr':20,'sbp':110,'dbp':60,'gcs':15,'rass':0,'shock_indx':0.7,
            'spo2':98, 'fio2':21,'pf':400,'sf':400,'peep':0,'plat':5,'mairp':5,'ve':5,
            'o2flow_rate':0,'Vte':400,'Vtset':400,'oi':1, 'Compliance':50,'VR':1,
            'lactate':0,'ph':7.4,'paco2':40,'pao2':90,'na':140,
            'k':4,'hco2':24,'bun':25,'hgb':12,'wbc':10,'plt':150,'inr':1,'ptt':1}
    
    for col in fillin.keys():
        if col in minmaxcols:
            # min/max columns were split above, so fill both halves
            name=col+'_min'
            structured_bin[name]=structured_bin[name].fillna(fillin[col])
            structured_bin2[name]=structured_bin2[name].fillna(fillin[col])
            name=col+'_max'
            structured_bin[name]=structured_bin[name].fillna(fillin[col])
            structured_bin2[name]=structured_bin2[name].fillna(fillin[col])
        else:
            structured_bin[col]=structured_bin[col].fillna(fillin[col])
            structured_bin2[col]=structured_bin2[col].fillna(fillin[col])

    
    
    #train test split
    # train = 2016 encounters, test = 2017 encounters; both restricted to
    # not_reviewed==0, not_cohort==0 and a non-missing pt_ards
    trainset=ards[(ards['year']==2016)&(ards['not_reviewed']==0)&(ards['not_cohort']==0)&(pd.notnull(ards['pt_ards']))].EncounterID.unique().tolist()
    testset=ards[(ards['year']==2017)&(ards['not_reviewed']==0)&(ards['not_cohort']==0)&(pd.notnull(ards['pt_ards']))].EncounterID.unique().tolist()


    # the train set is produced at both bin sizes, the test set only at the bintest size
    train_str,_=process(structured_bin,trainset,testset)
    train_str2,test_str=process(structured_bin2,trainset,testset)

    # make all frames share one column layout, taken from the train frame with fewer columns
    # NOTE(review): presumably the smaller column set is always contained in the larger one -- confirm
    if len(train_str.columns)>=len(train_str2.columns):
        train_str=train_str[train_str2.columns]
    else:
        train_str2=train_str2[train_str.columns]

    test_str=test_str[train_str.columns] 
    
    # output name lists the feature sources that were included
    filename='structured'
    if include_unstructured:
        filename+='+unstructured'
    if include_radiology50:
        filename+='+radiology50'
    if include_othernotes_merge:
        filename+='+clinical_notes250'
    if include_orders:
        filename+='+medication'
    
   

    train_str.to_csv(PATH5+filename+'_'+str(bintrain)+'Htrain.csv',index=False)
    train_str2.to_csv(PATH5+filename+'_'+str(bintest)+'Htrain.csv',index=False)
    test_str.to_csv(PATH5+filename+'_'+str(bintest)+'Htest.csv',index=False)
    print(filename+'_'+str(bintrain)+'Htrain.csv')
    print(filename+'_'+str(bintest)+'Htrain.csv')
    print(filename+'_'+str(bintest)+'Htest.csv')
    
    return 'finished'
    

# Set Parameters

In [None]:
# 'include_unstructured': True to add the unstructured data
# 'include_othernotes_merge': True to add the cTAKES features from clinical notes
# 'include_orders': True to add the orders/medications
# 'include_radiology50': True to add the chest x-ray cTAKES features (chest_xray_ctakes_50_split1.csv)
# 'bintrain': train the model on data binned every N hours
# bintrain=6
# 'bintest': evaluate and test the model on data binned every N hours
# bintest=2
# 'clinicalnotes_only': True to generate a dataset that has only the clinical notes cTAKES features
# 'ctakes_only': True to generate a dataset that has only the chest x-ray cTAKES features
# 'unigrambigram_only': True to generate a dataset that has only the chest x-ray unigram+bigram features
# 'unstructured_only': True to generate a dataset that has only the chest x-ray keywords+sniffer features
# 'medication_only': True to generate a dataset that has only the orders/medications (treatments_orders_split1.csv)

In [None]:
# Data locations on the shared Z: drive. Every path ends with a backslash because
# file names are appended by plain string concatenation.
# All backslashes are escaped explicitly: sequences such as '\p', '\A' and '\m' are
# invalid escapes that newer Python versions warn about, and a raw string cannot
# end with a backslash. The resulting values are unchanged.
PATH1='Z:\\patient-adjudication-results\\'                          # ARDS review results
PATH2='Z:\\project-datasets\\ARDS\\ml_algorithms\\final_datasets\\'    # input feature files
PATH3='Z:\\project-datasets\\ARDS\\ml_algorithms\\'
PATH4='Z:\\project-datasets\\ARDS\\ml_algorithms\\model_outputs\\'
PATH5='Z:\\project-datasets\\ARDS\\ml_algorithms\\final_datasets\\'    # output datasets and scratch files (same folder as PATH2)

In [None]:
# One dict of preprocessing() keyword arguments per dataset to build.
# Every run bins the training data every 6 hours and the test data every 2 hours
# and leaves all '*_only' modes switched off; the runs differ only in which
# include_* switches are turned on.
parameters=[
    dict({'include_unstructured':False,'include_othernotes_merge':False,'include_orders':False,
          'bintrain':6,'bintest':2,
          'clinicalnotes_only':False, 'ctakes_only':False, 'unigrambigram_only':False,
          'unstructured_only':False,'include_radiology50':False,'medication_only':False},
         **switched_on)
    for switched_on in (
        #structured
        {},
        #structured+medication
        {'include_orders':True},
        #structured+unstructured
        {'include_unstructured':True},
        #Structured+radiology50
        {'include_radiology50':True},
        #structured+radiology50+clinicalnotes250
        {'include_othernotes_merge':True,'include_radiology50':True},
        #structured+radiology50+clinicalnotes250+medication
        {'include_othernotes_merge':True,'include_orders':True,'include_radiology50':True},
        #Structured+unstructured+clinical notes c-takes
        {'include_unstructured':True,'include_othernotes_merge':True},
    )
]

In [None]:
# Build every configured dataset in turn, echoing its settings before the run starts.
for config in parameters:
    print('***********',config)
    preprocessing(**config)