In [296]:
# This notebook reads in the discretised input data and then preprocesses the model features
# Firstly, values deemed excessively high/low are capped
# Relevant binary features and normally/log-normally distributed features are standardised accordingly
# Training and test sets are split - 70% train, 10% validation, 20% test
# Resulting datasets are saved to file.

In [297]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame

In [298]:
# Read the discretised input data into a DataFrame.
input_csv_path = "../data/full_discretised_input_data.csv"
disc_inp_data = pd.read_csv(input_csv_path)

In [299]:
# Show the frame as loaded from the CSV.
disc_inp_data

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,mortality_90d,Weight_kg,...,SIRS,median_dose_vaso,max_dose_vaso,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,vaso_input,iv_input
0,1,3,7245052800,0,17639.826435,0.0,0,0,1,77.5,...,4,0.0,0.0,3500.000,2100.0,230.0,230.0,3270.000,0.0,4.0
1,2,3,7245067200,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5499.000,1999.0,697.0,467.0,4802.000,0.0,4.0
2,3,3,7245081600,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5604.000,105.0,2302.0,1605.0,3302.000,0.0,2.0
3,4,3,7245096000,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5664.000,60.0,2922.0,620.0,2742.000,0.0,2.0
4,5,3,7245110400,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5724.000,60.0,3352.0,430.0,2372.000,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258296,9,99995,4612888440,0,8538.739340,0.0,0,0,0,79.6,...,2,0.0,0.0,2113.583,0.0,4660.0,600.0,-2546.417,0.0,0.0
258297,10,99995,4612902840,0,8538.739340,0.0,0,0,0,79.6,...,2,0.0,0.0,2113.583,0.0,5360.0,700.0,-3246.417,0.0,0.0
258298,11,99995,4612917240,0,8538.739340,0.0,0,0,0,79.6,...,1,0.0,0.0,2113.583,0.0,5360.0,0.0,-3246.417,0.0,0.0
258299,12,99995,4612931640,0,8538.739340,0.0,0,0,0,79.6,...,1,0.0,0.0,2113.583,0.0,5780.0,420.0,-3666.417,0.0,0.0


In [300]:
# Collect the ICU stay ids listed in crrt.csv so the matching stays can be
# removed from disc_inp_data by the filtering cell that follows.
crrt_df = pd.read_csv("../data/crrt.csv")
crrt_ids = crrt_df['icustay_id'].unique()

# Shift the ids down by 200000.
# NOTE(review): presumably disc_inp_data['icustayid'] stores ids with this
# offset already removed (values such as 3 and 99995 appear there) - confirm.
ccrt_real_ids = [i - 200000 for i in crrt_ids]

# The filtering cell reads `ccrt_ids`, a name this notebook never defined
# (the original comprehension also read it, raising NameError on a fresh
# kernel). Bind it to the offset ids so Restart & Run All works.
# NOTE(review): assumes the offset ids are what the filter is meant to use.
ccrt_ids = ccrt_real_ids

In [301]:
# Keep only the rows whose stay id is not in ccrt_ids, then report the row count.
in_excluded_stay = disc_inp_data['icustayid'].isin(ccrt_ids)
disc_inp_data = disc_inp_data[~in_excluded_stay]
print(len(disc_inp_data))

248280


In [302]:
# Show the frame after the stays in ccrt_ids have been filtered out.
disc_inp_data

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,mortality_90d,Weight_kg,...,SIRS,median_dose_vaso,max_dose_vaso,input_total_tev,input_4hourly_tev,output_total,output_4hourly,cumulated_balance_tev,vaso_input,iv_input
0,1,3,7245052800,0,17639.826435,0.0,0,0,1,77.5,...,4,0.0,0.0,3500.000,2100.0,230.0,230.0,3270.000,0.0,4.0
1,2,3,7245067200,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5499.000,1999.0,697.0,467.0,4802.000,0.0,4.0
2,3,3,7245081600,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5604.000,105.0,2302.0,1605.0,3302.000,0.0,2.0
3,4,3,7245096000,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5664.000,60.0,2922.0,620.0,2742.000,0.0,2.0
4,5,3,7245110400,0,17639.826435,0.0,0,0,1,77.5,...,3,0.0,0.0,5724.000,60.0,3352.0,430.0,2372.000,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258296,9,99995,4612888440,0,8538.739340,0.0,0,0,0,79.6,...,2,0.0,0.0,2113.583,0.0,4660.0,600.0,-2546.417,0.0,0.0
258297,10,99995,4612902840,0,8538.739340,0.0,0,0,0,79.6,...,2,0.0,0.0,2113.583,0.0,5360.0,700.0,-3246.417,0.0,0.0
258298,11,99995,4612917240,0,8538.739340,0.0,0,0,0,79.6,...,1,0.0,0.0,2113.583,0.0,5360.0,0.0,-3246.417,0.0,0.0
258299,12,99995,4612931640,0,8538.739340,0.0,0,0,0,79.6,...,1,0.0,0.0,2113.583,0.0,5780.0,420.0,-3666.417,0.0,0.0


In [303]:
# Renumber rows 0..n-1 after filtering; the reward loops below address
# neighbouring rows by label (i-1), which needs a gap-free index.
disc_inp_data = disc_inp_data.reset_index(drop=True)

In [304]:
# add rewards - sparsely for now; reward function shaping comes in a separate script
# The last row of every ICU stay receives the terminal reward:
#   died_in_hosp == 1 -> -100, died_in_hosp == 0 -> +100; all other rows get 0.
# Rows of one stay are assumed to be contiguous (the comparison is between
# neighbouring rows only).
disc_inp_data['reward'] = 0

stay_ids = disc_inp_data['icustayid']
died = disc_inp_data['died_in_hosp']

# A row is terminal when the following row belongs to a different stay.
# shift(-1) leaves NaN on the final row, which compares unequal, so the final
# row of the frame is marked terminal without any special-casing.
is_terminal = stay_ids != stay_ids.shift(-1)

disc_inp_data.loc[is_terminal & (died == 1), 'reward'] = -100
disc_inp_data.loc[is_terminal & (died == 0), 'reward'] = 100

# Terminal rows whose outcome flag is neither 0 nor 1 keep reward 0 and are reported.
for bad_row in disc_inp_data.index[is_terminal & ~died.isin([0, 1])]:
    print ("error in row", bad_row)

print (disc_inp_data['reward'].value_counts())

 0      230982
 100     15278
-100      2020
Name: reward, dtype: int64


In [305]:
# Penalise consecutive time on mechanical ventilation: the k-th row in an
# unbroken run of mechvent == 1 within one stay loses k/6 reward. The run
# restarts at the first row of each stay and after any row with mechvent != 1.
#
# The penalty is applied exactly once per row. The earlier per-row loop already
# covered the final row and was followed by a tail block that subtracted a
# second penalty from that same row; that double count is removed here.
#
# NOTE: this subtracts from the existing 'reward' column, so re-running the
# cell applies the penalty again.
stay_ids = disc_inp_data['icustayid']
on_vent = disc_inp_data['mechvent'] == 1

# A new run starts on a stay change or when the ventilation flag flips.
# The first row compares against NaN in both tests and therefore starts a run.
new_stay = stay_ids != stay_ids.shift()
vent_flipped = on_vent.astype(int).diff() != 0
run_id = (new_stay | vent_flipped).cumsum()

# 1-based position of each row inside its run; only ventilated rows are penalised.
position_in_run = on_vent.groupby(run_id).cumcount() + 1
mech_penalty = (position_in_run / 6.0).where(on_vent, 0.0)

disc_inp_data['reward'] = disc_inp_data['reward'] - mech_penalty

print (disc_inp_data['reward'].value_counts())

 0.000000      130583
 100.000000     11986
-0.166667        9939
-0.333333        9545
-0.500000        9119
-0.666667        8651
-0.833333        7734
-1.000000        7100
-1.166667        6597
-1.333333        6105
-1.500000        5603
-1.666667        5152
-1.833333        4730
-2.000000        4352
-2.166667        3942
-2.333333        2932
-2.500000        2356
-2.666667        1959
-2.833333        1723
-3.000000        1512
-3.166667        1348
 96.666667       1021
-100.000000       767
 97.666667        657
 97.500000        355
-103.333333       290
 97.333333        249
-102.333333       202
 97.833333        151
 97.166667        116
-102.500000       108
 98.166667         98
 97.000000         92
 98.000000         84
 96.833333         84
 98.333333         82
-102.666667        79
 98.500000         63
 99.666667         57
-102.000000        48
-102.166667        47
-102.833333        45
-100.500000        41
-101.000000        40
-101.500000        39
-101.83333

In [None]:
# Penalise elevated creatinine: rows with Creatinine > 1.25 gain
# 2 * (1.2 - Creatinine), which is negative; all other rows (including
# missing values) are unchanged.
#
# Every row is processed, including row 0. The earlier loop skipped row 0 via
# an `i == 0` guard even though nothing here looks at the previous row.
#
# NOTE(review): the threshold (1.25) and the offset (1.2) differ, so the
# penalty jumps to about -0.1 just above the threshold - confirm this is intended.
# NOTE: this adds to the existing 'reward' column, so re-running the cell
# applies the penalty again.
creatinine_threshold = 1.25
creatinine_offset = 1.2

creatinine = disc_inp_data['Creatinine']
creatinine_penalty = (2 * (creatinine_offset - creatinine)).where(
    creatinine > creatinine_threshold, 0.0
)
disc_inp_data['reward'] = disc_inp_data['reward'] + creatinine_penalty

print (disc_inp_data['reward'].value_counts())

In [None]:
# now split into train/validation/test sets
# The split is done on stay ids, so all rows of one ICU stay end up in the
# same subset.
import random

# Fixed seed so the same stays land in the same subset on every run.
SPLIT_SEED = 42

train_sample = 0.7
val_sample = 0.1
test_sample = 0.2
assert abs(train_sample + val_sample + test_sample - 1.0) < 1e-9, \
    "split fractions must sum to 1"

unique_ids = disc_inp_data['icustayid'].unique()
random.seed(SPLIT_SEED)
random.shuffle(unique_ids)

# Cumulative cut points into the shuffled ids; the test set takes the remainder.
train_num = int(len(unique_ids) * train_sample)
val_num = int(len(unique_ids) * val_sample) + train_num
train_ids = unique_ids[:train_num]
val_ids = unique_ids[train_num:val_num]
test_ids = unique_ids[val_num:]

In [None]:
# Select the rows of each subset by stay id. Each subset is an explicit copy:
# the following cells overwrite columns in place, and writing to a slice of
# disc_inp_data raises SettingWithCopyWarning (and may not write at all).
train_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(train_ids)].copy()

val_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(val_ids)].copy()

test_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(test_ids)].copy()

In [None]:
# Number of rows that ended up in the test subset.
len(test_set)

In [None]:
# cap values in train, validation and test
# Each row of capping_values.csv names a column ('parameter') with its upper
# ('limsup') and lower ('liminf') limits; values beyond a limit are set to it.
caps = pd.read_csv("../data/capping_values.csv")
for i in caps.index:
    # The stored name carries one extra leading and trailing character, dropped here.
    param = caps.loc[i,'parameter'][1:-1]
    maxval = caps.loc[i,'limsup']
    minval = caps.loc[i,'liminf']
    for subset in (train_set, val_set, test_set):
        # Single .loc assignment instead of chained indexing (df[col][mask] = v),
        # which writes to a temporary and is not guaranteed to reach the frame.
        subset.loc[subset[param] >= maxval, param] = maxval
        subset.loc[subset[param] <= minval, param] = minval

In [None]:
# Column groups used by the preprocessing cells:
#   binary_fields - shifted by 0.5
#   norm_fields   - standardised with the training mean / std
#   log_fields    - log(0.1 + x), then standardised the same way
binary_fields = [
    'gender',
    'mechvent',
    're_admission',
]

norm_fields = [
    'age', 'Weight_kg', 'GCS',
    'HR', 'SysBP', 'MeanBP', 'DiaBP', 'RR', 'Temp_C', 'FiO2_1',
    'Potassium', 'Sodium', 'Chloride', 'Glucose', 'Magnesium', 'Calcium',
    'Hb', 'WBC_count', 'Platelets_count', 'PTT', 'PT',
    'Arterial_pH', 'paO2', 'paCO2', 'Arterial_BE', 'HCO3', 'Arterial_lactate',
    'SOFA', 'SIRS', 'Shock_Index', 'PaO2_FiO2',
    'cumulated_balance_tev', 'elixhauser', 'Albumin', 'CO2_mEqL', 'Ionised_Ca',
]

log_fields = [
    'max_dose_vaso', 'SpO2', 'BUN', 'Creatinine',
    'SGOT', 'SGPT', 'Total_bili', 'INR',
    'input_total_tev', 'input_4hourly_tev',
    'output_total', 'output_4hourly',
    'bloc',
]

In [None]:
# normalise binary fields: subtract 0.5 from every binary column in each subset
for subset in (train_set, val_set, test_set):
    subset[binary_fields] = subset[binary_fields] - 0.5

In [None]:
# normal distn fields: z-score each column using the mean and standard
# deviation of the training set, applied identically to val and test
for field in norm_fields:
    train_mean = train_set[field].mean()
    train_std = train_set[field].std()
    for subset in (train_set, val_set, test_set):
        subset[field] = (subset[field] - train_mean) / train_std

In [None]:
# log normal fields: take log(0.1 + x) in every subset first ...
for subset in (train_set, val_set, test_set):
    subset[log_fields] = np.log(0.1 + subset[log_fields])

# ... then z-score the logged columns with training-set statistics
for field in log_fields:
    train_mean = train_set[field].mean()
    train_std = train_set[field].std()
    for subset in (train_set, val_set, test_set):
        subset[field] = (subset[field] - train_mean) / train_std

In [None]:
# Preview the training set after the standardisation cells above.
train_set.head()

In [None]:
# scale features to [0,1] in train set, similar in val and test
# The minimum and maximum come from the training set only, so val/test values
# can fall outside [0,1].
import copy
scalable_fields = copy.deepcopy(binary_fields)
scalable_fields.extend(norm_fields)
scalable_fields.extend(log_fields)
for col in scalable_fields:
    # Series.min()/max() skip missing values; the builtin min()/max() iterate
    # in Python and give order-dependent results when NaN is present.
    minimum = train_set[col].min()
    maximum = train_set[col].max()
    # A column that is constant in the training set gives span == 0 (division by zero).
    span = maximum - minimum
    for subset in (train_set, val_set, test_set):
        subset[col] = (subset[col] - minimum) / span

In [None]:
# Preview the training set after min-max scaling.
train_set.head()

In [None]:
# Write the three preprocessed subsets to CSV without the index column.
outputs = [
    (train_set, '../data/both_train_set_scaled.csv'),
    (val_set, '../data/both_val_set_scaled.csv'),
    (test_set, '../data/both_test_set_scaled.csv'),
]
for subset, csv_path in outputs:
    subset.to_csv(csv_path, index=False)