In [1]:
# This notebook reads in the discretised input data and then preprocesses the model features
# Firstly, values deemed excessively high/low are capped
# Relevant binary features and normally/log-normally features are standardised accordingly
# Training and test sets are split - 70% train, 10% validation, 20% test
# Resulting datasets are saved to file.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame

In [2]:
disc_inp_data = pd.read_csv("../data/discretised_input_data.csv")
disc_inp_data.head()

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,mortality_90d,Weight_kg,...,SIRS,median_dose_vaso,max_dose_vaso,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,vaso_input,iv_input
0,1,3,7245052800,0,17639.826435,0.0,0,0,1,77.5,...,4,0.0,0.0,3500.0,2100.0,230.0,230.0,3270.0,0.0,4.0
1,2,3,7245067200,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5499.0,1999.0,697.0,467.0,4802.0,0.0,4.0
2,3,3,7245081600,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5604.0,105.0,2302.0,1605.0,3302.0,0.0,2.0
3,4,3,7245096000,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5664.0,60.0,2922.0,620.0,2742.0,0.0,2.0
4,5,3,7245110400,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5724.0,60.0,3352.0,430.0,2372.0,0.0,2.0


In [3]:
disc_inp_data.columns

Index(['bloc', 'icustayid', 'charttime', 'gender', 'age', 'elixhauser',
       're_admission', 'died_in_hosp', 'mortality_90d', 'Weight_kg', 'GCS',
       'HR', 'SysBP', 'MeanBP', 'DiaBP', 'Shock_Index', 'RR', 'SpO2', 'Temp_C',
       'FiO2_1', 'Potassium', 'Sodium', 'Chloride', 'Glucose', 'BUN',
       'Creatinine', 'Magnesium', 'Calcium', 'Ionised_Ca', 'CO2_mEqL', 'SGOT',
       'SGPT', 'Total_bili', 'Albumin', 'Hb', 'WBC_count', 'Platelets_count',
       'PTT', 'PT', 'INR', 'Arterial_pH', 'paO2', 'paCO2', 'Arterial_BE',
       'Arterial_lactate', 'HCO3', 'PaO2_FiO2', 'mechvent', 'SOFA', 'SIRS',
       'median_dose_vaso', 'max_dose_vaso', 'input_total_tev',
       'input_4hourly_tev', 'output_total', 'output_4hourly',
       'cumulated_balance_tev', 'vaso_input', 'iv_input'],
      dtype='object')

In [4]:
disc_inp_data.SIRS.value_counts()

1    78594
2    74063
3    41299
0    39953
4     8541
Name: SIRS, dtype: int64

In [37]:
# add reward based on whether died in hospital or not at the terminal timestep
disc_inp_data['reward'] = 0
for i in disc_inp_data.index:
    if i == 0:
        continue
    else:
        if disc_inp_data.loc[i, 'icustayid'] != disc_inp_data.loc[i-1, 'icustayid']:
            if disc_inp_data.loc[i-1, 'died_in_hosp'] == 1:
                disc_inp_data.loc[i-1,'reward'] = -100
            elif disc_inp_data.loc[i-1, 'died_in_hosp'] == 0:
                disc_inp_data.loc[i-1,'reward'] = 100
            else:
                print ("error in row", i-1)
if disc_inp_data.loc[len(disc_inp_data)-1, 'died_in_hosp'] == 1:
    disc_inp_data.loc[len(disc_inp_data)-1, 'reward'] = -100
elif disc_inp_data.loc[len(disc_inp_data)-1, 'died_in_hosp'] == 0:
     disc_inp_data.loc[len(disc_inp_data)-1, 'reward'] = 100

disc_inp_data['reward'].value_counts()

In [35]:
# add reward based on whether patient is on or off the mechinical ventilation in each timestep
mech_counter = 1
for i in disc_inp_data.index:
    if i == 0:
        continue
    else:
        if disc_inp_data.loc[i, 'icustayid'] != disc_inp_data.loc[i-1, 'icustayid']:
            mech_counter = 1
            
    if disc_inp_data.loc[i, 'mechvent'] == 1: 
        disc_inp_data.loc[i,'reward'] -= mech_counter
        mech_counter +=1
    else:
        mech_counter = 1

if (disc_inp_data.loc[len(disc_inp_data)-1, 'mechvent'] == 1) and (disc_inp_data.loc[len(disc_inp_data)-1, 'icustayid'] == disc_inp_data.loc[len(disc_inp_data)-2, 'icustayid']):
    disc_inp_data.loc[len(disc_inp_data)-1, 'reward'] -= mech_counter
elif (disc_inp_data.loc[len(disc_inp_data)-1, 'mechvent'] == 1):
    disc_inp_data.loc[len(disc_inp_data)-1, 'reward'] -= 1
        
        
disc_inp_data['reward'].value_counts()


 0      119173
 100     12078
-1       10450
-2       10041
-3        9597
         ...  
 93         27
 96         25
 95         21
 94         19
-101         9
Name: reward, Length: 62, dtype: int64


In [None]:
# add reward based on whether patient has normal or abnormal creatinin level indicating kidney health

for i in disc_inp.data.index:
    if i == 0:
        pass
    else:
        current_creatinine = disc_inp.data.loc[i, 'creatinine']
        reward = 0
        if current_creatinine > 1.25:
            reward = 1.2 - current_creatinine
        disc_input_data.loc[i, 'reward'] += reward

In [37]:
# now split into train/validation/test sets
import random
unique_ids = disc_inp_data['icustayid'].unique()
random.shuffle(unique_ids)
train_sample = 0.7
val_sample = 0.1
test_sample = 0.2
train_num = int(len(unique_ids) * 0.7)
val_num = int(len(unique_ids)*0.1) + train_num
train_ids = unique_ids[:train_num]
val_ids = unique_ids[train_num:val_num]
test_ids = unique_ids[val_num:]

In [38]:
train_set = DataFrame()
train_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(train_ids)]

val_set = DataFrame()
val_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(val_ids)]

test_set = DataFrame()
test_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(test_ids)]

In [39]:
# cap values in train and test
caps = pd.read_csv("../data/capping_values.csv")
for i in caps.index:
    param = caps.loc[i,'parameter'][1:-1]
    maxval = caps.loc[i,'limsup']
    minval = caps.loc[i,'liminf']
    train_set[param][train_set[param] >= maxval] = maxval
    train_set[param][train_set[param] <= minval] = minval
    val_set[param][val_set[param] >= maxval] = maxval
    val_set[param][val_set[param] <= minval] = minval
    test_set[param][test_set[param] >= maxval] = maxval
    test_set[param][test_set[param] <= minval] = minval
    


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the cave

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is t

In [40]:
binary_fields = ['gender','mechvent','re_admission']

norm_fields = ['age','Weight_kg','GCS','HR','SysBP','MeanBP','DiaBP','RR','Temp_C','FiO2_1',
                'Potassium','Sodium','Chloride','Glucose','Magnesium','Calcium',
                'Hb','WBC_count','Platelets_count','PTT','PT','Arterial_pH','paO2','paCO2',
                'Arterial_BE','HCO3','Arterial_lactate','SOFA','SIRS','Shock_Index',
                'PaO2_FiO2','cumulated_balance_tev', 'elixhauser', 'Albumin', u'CO2_mEqL', 'Ionised_Ca']

log_fields = ['max_dose_vaso','SpO2','BUN','Creatinine','SGOT','SGPT','Total_bili','INR',
              'input_total_tev','input_4hourly_tev','output_total','output_4hourly', 'bloc']

In [41]:
# normalise binary fields
train_set[binary_fields] = train_set[binary_fields] - 0.5 
val_set[binary_fields] = val_set[binary_fields] - 0.5 
test_set[binary_fields] = test_set[binary_fields] - 0.5 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [42]:
# normal distn fields
for item in norm_fields:
    av = train_set[item].mean()
    std = train_set[item].std()
    train_set[item] = (train_set[item] - av) / std
    val_set[item] = (val_set[item] - av) / std
    test_set[item] = (test_set[item] - av) / std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [43]:
# log normal fields
train_set[log_fields] = np.log(0.1 + train_set[log_fields])
val_set[log_fields] = np.log(0.1 + val_set[log_fields])
test_set[log_fields] = np.log(0.1 + test_set[log_fields])
for item in log_fields:
    av = train_set[item].mean()
    std = train_set[item].std()
    train_set[item] = (train_set[item] - av) / std
    val_set[item] = (val_set[item] - av) / std
    test_set[item] = (test_set[item] - av) / std

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [44]:
train_set.head()

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,mortality_90d,Weight_kg,...,median_dose_vaso,max_dose_vaso,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,vaso_input,iv_input,reward
24,-2.314327,30,5707619880,-0.5,-0.619268,0.988997,-0.5,0,0,1.268519,...,0.0,-0.38123,0.354419,-1.677606,-2.752261,-2.017396,0.492517,0.0,0.0,0
25,-1.47552,30,5707634280,-0.5,-0.619268,0.988997,-0.5,0,0,1.268519,...,0.08,0.624229,0.419682,1.04648,-0.244664,0.561343,0.591989,2.0,4.0,0
26,-0.970304,30,5707648680,-0.5,-0.619268,0.988997,-0.5,0,0,1.268519,...,0.08,0.624229,0.44461,0.794836,0.196333,0.929743,0.485104,2.0,3.0,0
27,-0.607626,30,5707663080,-0.5,-0.619268,0.988997,-0.5,0,0,1.268519,...,0.07,0.624229,0.464192,0.739624,0.344996,0.866858,0.398233,2.0,3.0,0
28,-0.324506,30,5707677480,-0.5,-0.619268,0.988997,-0.5,0,0,1.268519,...,0.06,0.422751,0.516561,1.055489,0.414561,0.737531,0.462423,1.0,4.0,0


In [28]:
train_set.to_csv('../data/rl_train_set_unscaled.csv',index = False)
val_set.to_csv('../data/rl_val_set_unscaled.csv', index = False)
test_set.to_csv('../data/rl_test_set_unscaled.csv', index = False)

In [45]:
# scale features to [0,1] in train set, similar in val and test
import copy
scalable_fields = copy.deepcopy(binary_fields)
scalable_fields.extend(norm_fields)
scalable_fields.extend(log_fields)
for col in scalable_fields:
    minimum = min(train_set[col])
    maximum = max(train_set[col])
    train_set[col] = (train_set[col] - minimum)/(maximum-minimum)
    val_set[col] = (val_set[col] - minimum)/(maximum-minimum)
    test_set[col] = (test_set[col] - minimum)/(maximum-minimum)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [46]:
train_set.head()

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,mortality_90d,Weight_kg,...,median_dose_vaso,max_dose_vaso,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,vaso_input,iv_input,reward
24,0.0,30,5707619880,0.0,0.492928,0.428571,0.0,0,0,0.385085,...,0.0,0.0,0.838425,0.0,0.0,0.0,0.6,0.0,0.0,0
25,0.22256,30,5707634280,0.0,0.492928,0.428571,0.0,0,0,0.385085,...,0.08,0.171167,0.850509,0.872064,0.624047,0.794352,0.611183,2.0,4.0,0
26,0.356608,30,5707648680,0.0,0.492928,0.428571,0.0,0,0,0.385085,...,0.08,0.171167,0.855124,0.791505,0.733794,0.907834,0.599167,2.0,3.0,0
27,0.452837,30,5707663080,0.0,0.492928,0.428571,0.0,0,0,0.385085,...,0.07,0.171167,0.85875,0.77383,0.770791,0.888463,0.5894,2.0,3.0,0
28,0.527957,30,5707677480,0.0,0.492928,0.428571,0.0,0,0,0.385085,...,0.06,0.136868,0.868446,0.874948,0.788103,0.848625,0.596617,1.0,4.0,0


In [47]:
train_set.to_csv('../data/lungrl_train_set_scaled.csv',index = False)
val_set.to_csv('../data/lungrl_val_set_scaled.csv', index = False)
test_set.to_csv('../data/lungrl_test_set_scaled.csv', index = False)