In [2]:
import os, sys
import numpy as np
import pandas as pd 

## Read patient files and split into train and test sets

In [3]:
train_df = []
test_df = []
features = None

i = 0
for file in os.listdir('sepsis_data/'):
    
    # Read file 
    patient_df = []
    with open('sepsis_data/%s' % (file)) as f:
        
        if not features:
            features = f.readline().rstrip('\n').split('|')
        else:
            # This skips the headers
            f.readline()
        
        for idx, line in enumerate(f):
            # Append patient ID beginning of data vector
            pdata = [file.split('.')[0]]
            
            line = line.rstrip('\n')
            pdata.extend(line.split('|'))
            patient_df.append(pdata)
            
    i += 1 
    if i%500 == 0:
        print("Count: %d"%(i))
    
    if i <= 4000:
        train_df.extend(patient_df)
    else:
        test_df.extend(patient_df)

Count: 500
Count: 1000
Count: 1500
Count: 2000
Count: 2500
Count: 3000
Count: 3500
Count: 4000
Count: 4500
Count: 5000


In [142]:
train_df.to_csv('train_unimputed')
test_df.to_csv('test_unimputed')

## Impute train and test sets using different methods

In [15]:
train_df = pd.read_csv('dataframes/train_unimputed').drop(columns='Unnamed: 0')
test_df = pd.read_csv('dataframes/test_unimputed').drop(columns='Unnamed: 0')

In [5]:
train_df.head()

Unnamed: 0,pid,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,p01065,,,,,,,,,,...,,,,54,1,1.0,0.0,-5.76,1,0
1,p01065,83.0,100.0,,103.0,68.0,58.0,,,,...,,,,54,1,1.0,0.0,-5.76,2,0
2,p01065,80.0,99.0,36.7,103.0,69.0,58.0,10.0,,,...,,,,54,1,1.0,0.0,-5.76,3,0
3,p01065,87.0,99.0,,107.0,76.0,67.0,,,,...,,,,54,1,1.0,0.0,-5.76,4,0
4,p01065,91.0,99.0,,106.0,76.0,68.0,,,,...,,,,54,1,1.0,0.0,-5.76,5,0


In [7]:
train_df.columns

Index(['pid', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel'],
      dtype='object')

In [6]:
train_df.shape, test_df.shape

((150976, 42), (37477, 42))

In [8]:
train_df.dtypes

pid                  object
HR                  float64
O2Sat               float64
Temp                float64
SBP                 float64
MAP                 float64
DBP                 float64
Resp                float64
EtCO2               float64
BaseExcess          float64
HCO3                float64
FiO2                float64
pH                  float64
PaCO2               float64
SaO2                float64
AST                 float64
BUN                 float64
Alkalinephos        float64
Calcium             float64
Chloride            float64
Creatinine          float64
Bilirubin_direct    float64
Glucose             float64
Lactate             float64
Magnesium           float64
Phosphate           float64
Potassium           float64
Bilirubin_total     float64
TroponinI           float64
Hct                 float64
Hgb                 float64
PTT                 float64
WBC                 float64
Fibrinogen          float64
Platelets           float64
Age                 

In [16]:
# Normalize train and test sets
# from sklearn.preprocessing import MinMaxScaler

# mmscaler = MinMaxScaler().fit(train_df.loc[:,"HR":"HospAdmTime"])
# train_df.loc[:,"HR":"HospAdmTime"] = mmscaler.transform(train_df.loc[:,"HR":"HospAdmTime"])
# test_df.loc[:,"HR":"HospAdmTime"] = mmscaler.transform(test_df.loc[:,"HR":"HospAdmTime"])

from sklearn.preprocessing import StandardScaler
stdscaler = StandardScaler().fit(train_df.loc[:,"HR":"Platelets"])
train_df.loc[:,"HR":"Platelets"] = stdscaler.transform(train_df.loc[:,"HR":"Platelets"])
test_df.loc[:,"HR":"Platelets"] = stdscaler.transform(test_df.loc[:,"HR":"Platelets"])

In [17]:
# Impute missing values using forward fill
train_df.fillna(method='ffill', inplace=True)

# Impute rest of missing values using mean of training set
mean_vals = train_df.loc[:,'HR':'Platelets'].astype(np.float64).mean()
train_df.fillna(mean_vals, inplace=True)

assert train_df.loc[:,'HR':'Platelets'].isna().sum().sum() == 0

In [18]:
# Impute missing values using forward fill
test_df.fillna(method='ffill', inplace=True)

# Impute missing values using mean of training set
test_df.fillna(mean_vals, inplace=True)

assert test_df.loc[:,'HR':'Platelets'].isna().sum().sum() == 0

In [165]:
# change dtypes to correct if needed.
# train_df[['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
#        'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
#        'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
#        'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
#        'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
#        'Fibrinogen', 'Platelets', 'Age', 'HospAdmTime']] = train_df[['HR', 'O2Sat', 
#         'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
#        'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
#        'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
#        'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
#        'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
#        'Fibrinogen', 'Platelets', 'Age', 'HospAdmTime']].astype(np.float)

# train_df[['Gender', 'ICULOS', 'SepsisLabel']] = train_df[['Gender', 'ICULOS', 
#                                                           'SepsisLabel']].astype(np.int)

# test_df[['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
#        'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
#        'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
#        'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
#        'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
#        'Fibrinogen', 'Platelets', 'Age', 'HospAdmTime']] = test_df[['HR', 'O2Sat', 
#         'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
#        'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
#        'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
#        'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
#        'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
#        'Fibrinogen', 'Platelets', 'Age', 'HospAdmTime']].astype(np.float)

# test_df[['Gender', 'ICULOS', 'SepsisLabel']] = test_df[['Gender', 'ICULOS', 
#                                                           'SepsisLabel']].astype(np.int)

In [19]:
for pid, df in train_df.groupby('pid'):
    df.drop(columns='pid').to_csv('train_forw_imputed/%s'%pid, sep='|', index=False)
    
for pid, df in test_df.groupby('pid'):
    df.drop(columns='pid').to_csv('test_forw_imputed/%s'%pid, sep='|', index=False)
    

In [20]:
X = train_df.drop(columns='pid')

In [21]:
X.shape

(150976, 41)

In [22]:
X.columns

Index(['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel'],
      dtype='object')