Feature Engineering

# Import packages

In [5]:
# load data
from submodules.load_data import load_data
# data manipulation
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from submodules.plots import plotGender
from submodules.plots import plotUnit


# data splitting
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# data preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

# model
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# performance
from sklearn.metrics import f1_score

# Load the data

Load semi-colon seperated data from disk

In [6]:
data = load_data()

# Create a Test Dataset
> uses scikit-learn

Performing this early minimizes generalization and bias you may inadvertently apply to your system.
Simply put, a test set of data involves: picking ~20% of the instances randomly and setting them aside.

Some considerations for sampling methods that generate the test set:
1. you don't want your model to see the entire dataset
1. you want to be able to fetch new data for training
1. you want to maintain the same percentage of training data against the entire dataset
1. you want a representative training dataset (~7% septic positive)

https://realpython.com/train-test-split-python-data/

In [7]:
# sets 20% of the data aside for testing, sets the random number generate to it always generates the same shuffled indicies
# x = 2 dimensional array with inputs
# X_train is the training part of the first sequence (x)
# X_test is the test part of the first sequence (x)
# y = 1 dimensional array with outputs
# y_train is the labeled training part of the second sequence
# y_test is the labeled test part of the second sequence
# test_size is the amount of the total dataset to set aside for testing
# random state fixes the randomization so you get the same results each time
# Shuffle before the data is split, it is shuffled
# stratified splitting keeps the proportion of y values trhough the train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.drop("isSepsis", axis=1), data["isSepsis"], test_size=0.2, random_state=42, stratify=data["isSepsis"])

Age
- The max is 100 years old
- This could be considered an outlier and capping the age is 90 is a parameter to tune

In [8]:
# X_train is all the instance with attributes
X_train = X_train.loc[X_train["Age"] <= 100]
# y_train is the label of each instance (isSepsis = 1 or 0)
y_train = y_train.loc[X_train["Age"] <= 100]

# X_test is all the instance with attributes
X_test = X_test.loc[X_test["Age"] <= 100]
# y_test is the label of each instance (isSepsis = 1 or 0)
y_test = y_test.loc[X_test["Age"] <= 100]

In [9]:
# discretization is the process of transferring continuous
# functions, models, variables, and equations into discrete
# counterparts. This process is usually carried out as a first step toward making them suitable for numerical evaluation and implementation on digital computers.
# this splits age into 4 categories
def discretization_age(data):
    bins = [13, 18, 30, 60, np.inf]
    data = np.digitize(data, bins=bins)
    data = data.reshape(len(data), 1)
    return data

In [10]:
transform_age = FunctionTransformer(discretization_age)
transform_age.fit_transform(X_train["Age"]).shape

(29041, 1)

In [11]:
# create a class to take care of missing values SimpleImputer
# create a SimpleImputer instance with values to replace missing values with the median value
# fir the imputer instance to the training data using fit() method
age_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("discretization", transform_age)
])

# use the "trained" imputer to transform the training set by replacing missing values with the learned medians
age_pipeline.fit_transform(X_train[["Age"]]).shape

(29041, 1)

Unit 1 and Unit 2
Unit1 - Administrative identifier for ICU unit (MICU)
Unit2 - Administrative identifier for ICU unit (SICU)

In [12]:
combineUnit1and2 = FunctionTransformer(combineUnits)

units = ["Unit1", "Unit2"]

unit_pipeline = Pipeline([
    ("combine", combineUnit1and2),
    ("encoder", OneHotEncoder(sparse=False))
])

unit_pipeline.fit_transform(X_train[units]).shape

NameError: name 'combineUnits' is not defined

In [13]:
acidbase_features = ["BaseExcess", "PaCO2"]

def isAcidBaseDisturb(cols):
    cols = np.c_[cols, np.zeros(len(cols))]
    cols[:,2][(cols[:,0] < -2) & (cols[:,1] < 40)] = 1
    col = cols[:,2].reshape(len(cols), 1)
    return col

FindAcidosis = FunctionTransformer(isAcidBaseDisturb)
FindAcidosis.fit_transform(X_train[acidbase_features])

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [14]:
acidbase_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("acidosis", FindAcidosis)
])

acidbase_pipeline.fit_transform(X_train[acidbase_features]).shape

(29041, 1)

In [None]:
num_features = ["HR",
                "O2Sat",
                "Temp",
                "MAP",
                "Resp",
                "AST",
                "BUN",
                "Alkalinephos",
                "Calcium",
                "Creatinine",
                "Glucose",
                "Bilirubin_total",
                "Hgb",
                "PTT",
                "WBC",
                "Fibrinogen",
                "Platelets",
                "ICULOS"
                ]

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

num_pipeline.fit_transform(X_train[num_features]).shape

In [None]:
gender_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder())
])

gender_pipeline.fit_transform(X_train[["Gender"]])

In [None]:
preprocessing_pipeline = ColumnTransformer([
    ("numbers", num_pipeline, num_features),
    ("acidbase", acidbase_pipeline, acidbase_features),
    ("age", age_pipeline, ["Age"]),
    ("units", units_pipeline, units),
    ("gender", gender_pipeline, ["Gender"])
], verbose=True)

preprocessing_pipeline.fit_transform(X_train).shape

In [None]:
vitals = ["HR", # Heart Rate normal adult 60 - 100 beats per minute (bpm)
          "O2Sat", # Oxygen saturation normal adult 97% - 100% (%)
          "Temp", # Temperature normal 97.8°F/36.5°C - 99°F/37.2°C (°C)
          #"SBP", # Systolic Blood Pressure normal < 120 mmHg (mm Hg) - REMOVED for MAP
          "MAP", # Mean Arterial Pressure (mm Hg)
          #"DBP", # beclomethasone dipropionate normal < 80 mmHg (mm Hg) - REMOVED for MAP
          "Resp" # Respiration rate  12<normal<20 breaths / minute (bpm)
          #"EtCO2" # End-tidal CO2 maximum concentration of CO2 at exhalation normal 35 - 45 mmHg (mm Hg) = REMOVED missing too much data
         ]
labs = ['BaseExcess', # strong acid to restore pH (mmoI/L)
        #'Magnesium', # symptoms such as weakness, irritability, cardiac arrhythmia, nausea, and/or diarrhea (mmoI/L)
        #'Glucose', # blood sugar test (indicates diagnose diabetes, pre-diabetes and gestational diabetes) (mg/dL)
        #'Alkalinephos', # Alkaline phosphatase (indicates enzyme activity) (IU/L)
        'pH', # 0:14 = acidic = lower pH; alkaline = higher
        'Chloride', # an electrolyte to balance fluid in cells normal 96-103 ml (indicates blood pressure/pH) (mmoI/L)
        #'Lactate', # high levels indicate lack of oxygen (hypoxia) or other conditions (indicates sepsis) (mg/dL)
        # Respiratory
        'HCO3', # Bicarbonate, carbon dioxide in blood (indicates metabolism / resp) (mmoI/L)
        #'FiO2', # % of concentration of oxygen inhaled (indicates resp) (%) - REMOVED missing too much data
        'PaCO2', # partial pressure of carbon dioxide measured in blood (indicates resp) (mm Hg)
        #'SaO2', # normal 95-100% oxygen saturation bound to hemoglobin (indicates resp) (%) - REMOVED lack of correlation
        # liver
        'AST', # aspartate aminotransferase (indicates liver) (IU/L)
        #'Bilirubin_direct', # conjugated water soluble (indicates liver) (mg/dL) - REMOVED for _total
        'Bilirubin_total', # normal 0.1:1.2 mg/dL (indicates liver) (mg/dL)
        # kidneys
        'BUN', # blood urea nitrogen, nitrogen in the blood (indicates kidneys) (mg/dL)
        'Creatinine', # metabolic panel (indicates kidneys) (mg/dL)
        'Calcium', # indicates range of conditions bones, heart, nerves, kidneys, and teeth (mg/dL)
        'Phosphate', # related to calcium (indicates kidney or diabetes)  (mg/dL)
        'Potassium', # electrolyte or metabolism (affected by blood pressure, kidneys, etc.)  (mmoI/L)
        # Heart
        #'TroponinI', # cardiac specific Trenonin I and T (indicates injury to heart muscle) - REMOVED missing too much data
        # Blood
        'Hct', # Hematocrit (indicates portion of blood from Red Blood Cell count) (%)
        'Hgb', # Hemoglobin apart of Complete Blood Count (CBC) (indicates blood cell count) (g/dL)
        'WBC', # White Blood Cell Count (indicates infection, inflammation or disease) (count10^3/µL)
        'PTT', # Partial Thromboplastin Time (indicates bleeding or blood clot) (seconds)
        #'Fibrinogen', # coagulation factor I (indicates bleeding or blood clot or cardiovascular disease) (mg/dL) - REMOVED missing too much data
        #'Platelets' # indicates bleeding disorder, a bone marrow disease, or other underlying condition ((count10^3/µL))
        ]
demo = ["Age", # Years (100 for patients 90 or above)
        #"Gender", # Female = 0, Male = 1 - REMOVED no correlation
        #"HospAdmTime", # Hours between hospital admit and ICU admit - REMOVED no correlation
        "ICULOS" # ICU length of stay in hours (hours since ICU admit)
       ]

