# Sepsis Competition - BMEG400D
## Preprocessing
## By Sergei Issaev

Sepsis is a life-threatening condition that occurs when the body's response to infection exceeds typical inflammatory immune response, resulting in tissue and organ damage. As early prediction of sepsis is potentially life-saving, our goal is to predict sepsis four hours before the clinical identification of sepsis. This is the first of two notebooks - this notebook will examine the provided data, preprocess it  and provide complete training and test files as .csv.


### Import Libraries

In [8]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

### Load Training Files

In [9]:
train_directory = "C:/Users/serge/Documents/UBC/BMEG_591A/competition/training_2020-11-11"
test_directory = "C:/Users/serge/Documents/UBC/BMEG_591A/competition/testing_2020-11-11"

In [10]:
def load_challenge_data(file):
    with open(file, 'r') as f:
        header = f.readline().strip()
        column_names = header.split(',')
        data = np.loadtxt(f, delimiter=',')

    # Ignore SepsisLabel column if present.
    if column_names[-1] == 'SepsisLabel':
        column_names = column_names[:-1]
        data = data[:, :-1]

    return data

In [11]:
# Find files.
train_files = []
for f in os.listdir(train_directory):
    if os.path.isfile(os.path.join(train_directory, f)) and not f.lower().startswith('.') and f.lower().endswith('csv'):
        train_files.append(f)
print(f"The number of training files is {len(train_files)}.")

The number of training files is 5000.


In [12]:
for i in tqdm(range(len(train_files))):
    sos = pd.read_csv(train_directory + '/' + train_files[i])
    sos['WBC'] = sos['WBC'].fillna(np.mean(sos['WBC']))
    sos['Hgb'] = sos['Hgb'].fillna(np.mean(sos['Hgb']))
    sos['Hct'] = sos['Hct'].fillna(np.mean(sos['Hct']))
    sos['Potassium'] = sos['Potassium'].fillna(np.mean(sos['Potassium']))
    sos['Magnesium'] = sos['Magnesium'].fillna(np.mean(sos['Magnesium']))
    sos['Glucose'] = sos['Glucose'].fillna(np.mean(sos['Glucose']))
    sos['Creatinine'] = sos['Creatinine'].fillna(np.mean(sos['Creatinine']))
    sos['BUN'] = sos['BUN'].fillna(np.mean(sos['BUN']))
    sos['pH'] = sos['pH'].fillna(np.mean(sos['pH']))
    sos['FiO2'] = sos['FiO2'].fillna(np.mean(sos['FiO2']))
    sos['Resp'] = sos['Resp'].bfill()
    sos['DBP'] = sos['DBP'].bfill()
    sos['MAP'] = sos['MAP'].bfill()
    sos['SBP'] = sos['SBP'].bfill()
    sos['Temp'] = sos['Temp'].fillna(np.mean(sos['Temp']))
    sos['O2Sat'] = sos['O2Sat'].bfill()
    sos['HR'] = sos['HR'].bfill()
    sos['O2Sat'] = sos['O2Sat'].ffill()
    sos['HR'] = sos['HR'].ffill()
    sos['Resp'] = sos['Resp'].ffill()
    sos['DBP'] = sos['DBP'].ffill()
    sos['MAP'] = sos['MAP'].ffill()
    sos['SBP'] = sos['SBP'].ffill()
    if i == 0:
        het = sos
    else:
        try:
            het = pd.concat([het, sos]).reset_index()
        except:
            het.drop(columns='level_0', inplace=True)
            het = pd.concat([het, sos]).reset_index()
        

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [31:16<00:00,  2.66it/s]


In [13]:
het.head()

Unnamed: 0,level_0,Age,BUN,Creatinine,DBP,FiO2,Glucose,HR,Hct,Hgb,...,O2Sat,Potassium,Resp,SBP,SepsisLabel,Sex,Temp,WBC,index,pH
0,0,65.71,16.5,0.8,52.0,,135.333333,103.5,25.8,8.3,...,97.0,4.433333,18.0,107.5,0,0,36.455,7.6,0.0,7.39
1,1,65.71,14.0,0.8,52.0,,135.333333,103.5,27.6,8.3,...,97.0,4.433333,18.0,107.5,0,0,36.455,7.6,1.0,7.39
2,2,65.71,16.5,0.8,61.5,,253.0,108.0,25.8,8.3,...,98.5,5.0,19.5,124.5,0,0,36.78,7.6,2.0,7.36
3,3,65.71,16.5,0.8,58.5,,135.333333,107.5,25.8,8.3,...,96.5,4.433333,17.0,117.5,0,0,36.455,7.6,3.0,7.39
4,4,65.71,16.5,0.8,61.0,,135.333333,113.0,25.8,8.3,...,100.0,4.433333,26.0,125.0,0,0,36.455,7.6,4.0,7.39


### Load Testing Files

In [14]:
# Find files.
test_files = []
for f in os.listdir(test_directory):
    if os.path.isfile(os.path.join(test_directory, f)) and not f.lower().startswith('.') and f.lower().endswith('csv'):
        test_files.append(f)
print(f"The number of test files is {len(test_files)}.")

The number of test files is 1000.


In [15]:
for i in tqdm(range(len(test_files))):
    sos = pd.read_csv(test_directory + '/' + test_files[i])
    sos['WBC'] = sos['WBC'].fillna(np.mean(sos['WBC']))
    sos['Hgb'] = sos['Hgb'].fillna(np.mean(sos['Hgb']))
    sos['Hct'] = sos['Hct'].fillna(np.mean(sos['Hct']))
    sos['Potassium'] = sos['Potassium'].fillna(np.mean(sos['Potassium']))
    sos['Magnesium'] = sos['Magnesium'].fillna(np.mean(sos['Magnesium']))
    sos['Glucose'] = sos['Glucose'].fillna(np.mean(sos['Glucose']))
    sos['Creatinine'] = sos['Creatinine'].fillna(np.mean(sos['Creatinine']))
    sos['BUN'] = sos['BUN'].fillna(np.mean(sos['BUN']))
    sos['pH'] = sos['pH'].fillna(np.mean(sos['pH']))
    sos['FiO2'] = sos['FiO2'].fillna(np.mean(sos['FiO2']))
    sos['Resp'] = sos['Resp'].bfill()
    sos['DBP'] = sos['DBP'].bfill()
    sos['MAP'] = sos['MAP'].bfill()
    sos['SBP'] = sos['SBP'].bfill()
    sos['Temp'] = sos['Temp'].fillna(np.mean(sos['Temp']))
    sos['O2Sat'] = sos['O2Sat'].bfill()
    sos['HR'] = sos['HR'].bfill()
    sos['O2Sat'] = sos['O2Sat'].ffill()
    sos['HR'] = sos['HR'].ffill()
    sos['Resp'] = sos['Resp'].ffill()
    sos['DBP'] = sos['DBP'].ffill()
    sos['MAP'] = sos['MAP'].ffill()
    sos['SBP'] = sos['SBP'].ffill()
    if i == 0:
        test = sos
    else:
        try:
            test = pd.concat([test, sos]).reset_index()
        except:
            test.drop(columns='level_0', inplace=True)
            test = pd.concat([test, sos]).reset_index()
        

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:55<00:00,  8.68it/s]


In [16]:
test.drop(columns='level_0', inplace=True)
het.drop(columns='level_0', inplace=True)
combined = pd.concat([test, het]).reset_index()

### Get Global Means

In [17]:
wbc_mean = np.mean(combined['WBC'])
hgb_mean = np.mean(combined['Hgb'])
hct_mean = np.mean(combined['Hct'])
pot_mean = np.mean(combined['Potassium'])
mag_mean = np.mean(combined['Magnesium'])
glu_mean = np.mean(combined['Glucose'])
crea_mean = np.mean(combined['Creatinine'])
bun_mean = np.mean(combined['BUN'])
ph_mean = np.mean(combined['pH'])
fio_mean = np.mean(combined['FiO2'])
dbp_mean = np.mean(combined['DBP'])
map_mean = np.mean(combined['MAP'])
sbp_mean = np.mean(combined['SBP'])
temp_mean = np.mean(combined['Temp'])
resp_mean = np.mean(combined['Resp'])
hr_mean = np.mean(combined['HR'])
ic_mean = np.mean(combined['ICULOS'])
o2_mean = np.mean(combined['O2Sat'])

In [18]:
print(ic_mean)
print(hr_mean)
print(o2_mean)
print(temp_mean)
print(sbp_mean)
print(map_mean)
print(dbp_mean)
print(resp_mean)
print(fio_mean)
print(ph_mean)
print(bun_mean)
print(crea_mean)
print(glu_mean)
print(mag_mean)
print(pot_mean)
print(hct_mean)
print(hgb_mean)
print(wbc_mean)

30.154070036034373
86.29731254147312
97.04501356538684
36.87898051414656
121.7602888574503
81.55512870970712
62.83403054491221
19.145484125916457
0.519280928328674
7.383057200679564
24.701493225883524
1.5614007834364017
132.2681938675735
2.047994791406419
4.129589329827328
30.893636449077842
10.273107324364078
11.407252670058481


In [19]:
print('Part of missing values for every column')
print(combined.isnull().sum() / len(combined))

Part of missing values for every column
level_0        0.000000
Age            0.000000
BUN            0.032326
Creatinine     0.032355
DBP            0.143948
FiO2           0.434664
Glucose        0.027118
HR             0.000000
Hct            0.037807
Hgb            0.038575
ICULOS         0.000000
MAP            0.001193
Magnesium      0.087222
O2Sat          0.000000
Potassium      0.032162
Resp           0.000983
SBP            0.005506
SepsisLabel    0.000000
Sex            0.000000
Temp           0.007001
WBC            0.042355
index          0.999446
pH             0.422031
dtype: float64


### Load the Training Data Again

In [20]:
for i in tqdm(range(len(train_files))):
    sos = pd.read_csv(train_directory + '/' + train_files[i])
    if i == 0:
        het = sos
    else:
        try:
            het = pd.concat([het, sos]).reset_index()
        except:
            het.drop(columns='level_0', inplace=True)
            het = pd.concat([het, sos]).reset_index()     

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.
100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [22:49<00:00,  3.65it/s]


In [21]:
het.sample(4)

Unnamed: 0,level_0,Age,BUN,Creatinine,DBP,FiO2,Glucose,HR,Hct,Hgb,...,O2Sat,Potassium,Resp,SBP,SepsisLabel,Sex,Temp,WBC,index,pH
147786,147786,74.0,,,58.0,,,73.0,,,...,98.0,,19.0,104.0,0,1,37.1,,,
76358,76358,53.03,,,44.0,,,83.0,,,...,93.0,,15.0,95.0,0,1,,,,
106286,106286,29.0,,,,,,83.0,,,...,99.0,,34.0,,0,0,,,,
18980,18980,56.89,19.0,0.7,,,71.0,79.0,29.3,10.1,...,99.0,3.7,,123.0,0,0,,5.4,,


### Load the Testing Data Again

In [22]:
for i in tqdm(range(len(test_files))):
    sos = pd.read_csv(test_directory + '/' + test_files[i])

    if i == 0:
        test = sos
    else:
        try:
            test = pd.concat([test, sos]).reset_index()
        except:
            test.drop(columns='level_0', inplace=True)
            test = pd.concat([test, sos]).reset_index()
        

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # This is added back by InteractiveShellApp.init_path()
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:20<00:00, 12.36it/s]


In [23]:
test.sample(4)

Unnamed: 0,level_0,Age,BUN,Creatinine,DBP,FiO2,Glucose,HR,Hct,Hgb,...,O2Sat,Potassium,Resp,SBP,SepsisLabel,Sex,Temp,WBC,index,pH
32123,32123,57.0,,,59.0,,,108.0,,,...,99.0,,26.0,86.0,0,1,35.1,,,
33468,33468,44.0,,,72.0,,,79.0,,,...,,4.0,18.0,116.0,0,1,,,,
38687,25,76.0,,,52.5,0.4,160.0,71.5,,,...,100.0,4.9,22.0,100.0,0,0,38.8,,,7.38
9660,9660,54.39,13.0,0.9,52.0,,238.0,67.0,26.5,8.6,...,,4.1,18.0,134.0,0,0,,8.5,,


### Fill NaNs with Global Means

In [24]:
het['WBC'] = het['WBC'].fillna(wbc_mean)
het['Hgb'] = het['Hgb'].fillna(hgb_mean)
het['Hct'] = het['Hct'].fillna(hct_mean)
het['Potassium'] = het['Potassium'].fillna(pot_mean)
het['Magnesium'] = het['Magnesium'].fillna(mag_mean)
het['Glucose'] = het['Glucose'].fillna(glu_mean)
het['Creatinine'] = het['Creatinine'].fillna(crea_mean)
het['BUN'] = het['BUN'].fillna(bun_mean)
het['pH'] = het['pH'].fillna(ph_mean)
het['FiO2'] = het['FiO2'].fillna(fio_mean)
het['DBP'] = het['DBP'].fillna(dbp_mean)
het['MAP'] = het['MAP'].fillna(map_mean)
het['SBP'] = het['SBP'].fillna(sbp_mean)
het['Temp'] = het['Temp'].fillna(temp_mean)
het['Resp'] = het['Resp'].fillna(resp_mean)
het['HR'] = het['Resp'].fillna(hr_mean)
het['O2Sat'] = het['O2Sat'].fillna(o2_mean)

In [25]:
het.sample(4)

Unnamed: 0,level_0,Age,BUN,Creatinine,DBP,FiO2,Glucose,HR,Hct,Hgb,...,O2Sat,Potassium,Resp,SBP,SepsisLabel,Sex,Temp,WBC,index,pH
76142,76142,57.61,24.701493,1.561401,68.0,0.5,132.268194,31.0,30.893636,10.273107,...,99.0,4.129589,31.0,150.0,0,1,36.878981,11.407253,,7.383057
9065,9065,70.77,24.701493,1.561401,52.0,0.519281,132.268194,9.0,30.893636,10.273107,...,95.0,4.129589,9.0,95.0,0,0,36.878981,11.407253,,7.383057
49512,49512,44.77,24.701493,1.561401,59.5,0.519281,122.0,16.0,30.893636,10.273107,...,95.0,3.4,16.0,119.0,0,0,35.22,11.407253,,7.26
9661,9661,85.44,24.701493,1.561401,42.0,0.519281,132.268194,26.0,30.893636,10.273107,...,100.0,4.129589,26.0,131.0,0,0,36.878981,11.407253,,7.383057


In [26]:
test['WBC'] = test['WBC'].fillna(wbc_mean)
test['Hgb'] = test['Hgb'].fillna(hgb_mean)
test['Hct'] = test['Hct'].fillna(hct_mean)
test['Potassium'] = test['Potassium'].fillna(pot_mean)
test['Magnesium'] = test['Magnesium'].fillna(mag_mean)
test['Glucose'] = test['Glucose'].fillna(glu_mean)
test['Creatinine'] = test['Creatinine'].fillna(crea_mean)
test['BUN'] = test['BUN'].fillna(bun_mean)
test['pH'] = test['pH'].fillna(ph_mean)
test['FiO2'] = test['FiO2'].fillna(fio_mean)
test['DBP'] = test['DBP'].fillna(dbp_mean)
test['MAP'] = test['MAP'].fillna(map_mean)
test['SBP'] = test['SBP'].fillna(sbp_mean)
test['Temp'] = test['Temp'].fillna(temp_mean)
test['Resp'] = test['Resp'].fillna(resp_mean)
test['HR'] = test['Resp'].fillna(hr_mean)
test['O2Sat'] = test['O2Sat'].fillna(o2_mean)

In [27]:
test.sample(4)

Unnamed: 0,level_0,Age,BUN,Creatinine,DBP,FiO2,Glucose,HR,Hct,Hgb,...,O2Sat,Potassium,Resp,SBP,SepsisLabel,Sex,Temp,WBC,index,pH
27798,27798,56.0,24.701493,1.561401,74.5,0.519281,162.0,15.5,30.893636,10.273107,...,97.0,4.129589,15.5,93.5,0,1,38.25,11.407253,,7.383057
15957,15957,51.95,24.701493,1.561401,72.0,0.519281,132.268194,20.0,24.0,8.0,...,100.0,4.129589,20.0,159.0,0,0,37.4,11.407253,,7.33
20612,20612,73.0,24.701493,1.561401,50.0,0.519281,89.0,17.5,30.893636,10.273107,...,99.0,4.129589,17.5,100.0,0,1,36.1,11.407253,,7.383057
30828,30828,59.0,24.701493,1.561401,75.0,0.519281,132.268194,15.0,30.893636,10.273107,...,97.5,4.129589,15.0,136.5,0,1,36.35,11.407253,,7.383057


### Save training.csv & testing.csv

In [28]:
het.to_csv('training.csv', index=False)
test.to_csv('testing.csv', index=False)