In [2]:
import os, sys
import numpy as np
import pandas as pd 

## Read patient files and split into train/val/test sets

In [26]:
train_df = []
val_df = []
test_df = []
features = None

for file in os.listdir('../../sepsis_data/trainingA/'):
    
    # Read file 
    patient_df = []
    with open('../../sepsis_data/trainingA/%s' % (file)) as f:
        
        if not features:
            features = f.readline().rstrip('\n').split('|')
        else:
            # This skips the headers
            f.readline()
        
        for idx, line in enumerate(f):
            # Append patient ID beginning of data vector
            pdata = [file.split('.')[0]]
            
            line = line.rstrip('\n')
            pdata.extend(line.split('|'))
            patient_df.append(pdata)
    
    train_df.extend(patient_df)

i = 0
for file in os.listdir('../../sepsis_data/trainingB/'):
    
    # Read file 
    patient_df = []
    with open('../../sepsis_data/trainingB/%s' % (file)) as f:
        
        if not features:
            features = f.readline().rstrip('\n').split('|')
        else:
            # This skips the headers
            f.readline()
        
        for idx, line in enumerate(f):
            # Append patient ID beginning of data vector
            pdata = [file.split('.')[0]]
            
            line = line.rstrip('\n')
            pdata.extend(line.split('|'))
            patient_df.append(pdata)
            
    i += 1 
    
    if i <=10000:
        train_df.extend(patient_df)
    elif i <= 15000:
        val_df.extend(patient_df)
    else:
        test_df.extend(patient_df)

In [27]:
train_df = pd.DataFrame(train_df, columns=['pid'] + features)
val_df = pd.DataFrame(val_df, columns=['pid'] + features)
test_df = pd.DataFrame(test_df, columns=['pid'] + features)

In [28]:
# train_df.to_csv('train_unimputed', index=False)
# val_df.to_csv('val_unimputed', index=False)
# test_df.to_csv('test_unimputed', index=False)

## Impute train and test sets using different methods

In [15]:
# train_df = pd.read_csv('dataframes/train_unimputed').drop(columns='Unnamed: 0')
# test_df = pd.read_csv('dataframes/test_unimputed').drop(columns='Unnamed: 0')

In [30]:
train_df.shape, val_df.shape, test_df.shape

((1170391, 42), (191612, 42), (190207, 42))

In [32]:
train_df.SepsisLabel = train_df.SepsisLabel.astype(np.int)

train_agg = train_df.groupby('pid').SepsisLabel.agg(['max'])

train_agg.sum()/30336

In [38]:
val_df.SepsisLabel = val_df.SepsisLabel.astype(np.int)

val_agg = val_df.groupby('pid').SepsisLabel.agg(['max'])

val_agg.sum()/val_agg.shape[0]

max    0.057
dtype: float64

In [39]:
test_df.SepsisLabel = test_df.SepsisLabel.astype(np.int)

test_agg = test_df.groupby('pid').SepsisLabel.agg(['max'])

test_agg.sum()/test_agg.shape[0]

max    0.0578
dtype: float64

In [16]:
# Normalize train and test sets

from sklearn.preprocessing import StandardScaler
stdscaler = StandardScaler().fit(train_df.loc[:,"HR":"Platelets"])
train_df.loc[:,"HR":"Platelets"] = stdscaler.transform(train_df.loc[:,"HR":"Platelets"])
test_df.loc[:,"HR":"Platelets"] = stdscaler.transform(test_df.loc[:,"HR":"Platelets"])

In [17]:
# Impute missing values using forward fill
train_df.fillna(method='ffill', inplace=True)

# Impute rest of missing values using mean of training set
mean_vals = train_df.loc[:,'HR':'Platelets'].astype(np.float64).mean()
train_df.fillna(mean_vals, inplace=True)

assert train_df.loc[:,'HR':'Platelets'].isna().sum().sum() == 0

In [18]:
# Impute missing values using forward fill
test_df.fillna(method='ffill', inplace=True)

# Impute missing values using mean of training set
test_df.fillna(mean_vals, inplace=True)

assert test_df.loc[:,'HR':'Platelets'].isna().sum().sum() == 0

In [165]:
# change dtypes to correct if needed.
# train_df[['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
#        'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
#        'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
#        'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
#        'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
#        'Fibrinogen', 'Platelets', 'Age', 'HospAdmTime']] = train_df[['HR', 'O2Sat', 
#         'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
#        'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
#        'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
#        'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
#        'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
#        'Fibrinogen', 'Platelets', 'Age', 'HospAdmTime']].astype(np.float)

# train_df[['Gender', 'ICULOS', 'SepsisLabel']] = train_df[['Gender', 'ICULOS', 
#                                                           'SepsisLabel']].astype(np.int)

# test_df[['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
#        'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
#        'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
#        'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
#        'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
#        'Fibrinogen', 'Platelets', 'Age', 'HospAdmTime']] = test_df[['HR', 'O2Sat', 
#         'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
#        'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
#        'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
#        'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
#        'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
#        'Fibrinogen', 'Platelets', 'Age', 'HospAdmTime']].astype(np.float)

# test_df[['Gender', 'ICULOS', 'SepsisLabel']] = test_df[['Gender', 'ICULOS', 
#                                                           'SepsisLabel']].astype(np.int)

In [19]:
for pid, df in train_df.groupby('pid'):
    df.drop(columns='pid').to_csv('train_forw_imputed/%s'%pid, sep='|', index=False)
    
for pid, df in test_df.groupby('pid'):
    df.drop(columns='pid').to_csv('test_forw_imputed/%s'%pid, sep='|', index=False)
    

In [20]:
X = train_df.drop(columns='pid')

In [21]:
X.shape

(150976, 41)

In [22]:
X.columns

Index(['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel'],
      dtype='object')