# Build LSTM model

## Read data

In [1]:
import os
from pathlib import Path

import pandas as pd


dataDirName = Path(os.environ['DATA_DIR'], 'sepsis_prediction', 'lstm_initial_trials', '02_data_matrix', 'balanced_data')
dataDirName.mkdir(exist_ok=True, parents=True)

dataDf = pd.read_csv(Path(dataDirName, 'data_all_variables.csv'))
dataDf = dataDf.sort_values(by=['PATIENT_ID', 'EPISODE_ID', 'measurement_datetime'], ascending=True)
dataDf

Unnamed: 0,PATIENT_ID,EPISODE_ID,measurement_datetime,Diastolic blood pressure_mean,Diastolic blood pressure_min,Diastolic blood pressure_max,Diastolic blood pressure_first,Diastolic blood pressure_last,Diastolic blood pressure_std,Heart rate_mean,...,Systolic blood pressure_std,Temperature_mean,Temperature_min,Temperature_max,Temperature_first,Temperature_last,Temperature_std,Organism_FIRST_NOTED,sepsis,target
0,18849,12790706,2019-09-01,63.750000,50.0,74.0,74.0,69.0,7.497619,96.500,...,12.199971,37.477778,35.5,40.6,35.5,36.9,1.605286,2019-09-01,0.0,0.0
1,18849,12790706,2019-09-02,55.750000,48.0,61.0,54.0,48.0,6.020797,88.000,...,7.788881,36.875000,36.2,38.1,36.8,36.4,0.853913,2019-09-01,0.0,0.0
2,18849,12790706,2019-09-03,55.000000,47.0,69.0,48.0,47.0,10.165300,90.000,...,8.341663,36.750000,36.1,37.7,37.7,36.1,0.695222,2019-09-01,0.0,0.0
3,53162,15749998,2021-11-29,80.076923,70.0,91.0,75.0,86.0,6.264470,85.250,...,13.171940,36.225000,35.9,36.7,36.2,36.7,0.340343,2022-01-06,0.0,0.0
4,53162,15749998,2021-11-29,80.076923,70.0,91.0,75.0,86.0,6.264470,85.250,...,13.171940,36.225000,35.9,36.7,36.2,36.7,0.340343,2022-01-06,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103753,2676371,17444357,2023-04-14,76.000000,64.0,105.0,65.0,68.0,14.302847,72.625,...,9.739463,36.725000,36.1,37.2,36.9,36.8,0.353553,2023-04-21,0.0,0.0
103754,2676371,17444357,2023-04-15,69.800000,61.0,77.0,75.0,61.0,6.379655,89.000,...,12.502000,36.875000,36.6,37.1,37.1,37.1,0.262996,2023-04-21,0.0,0.0
103755,2676371,17444357,2023-04-15,69.800000,61.0,77.0,75.0,61.0,6.379655,89.000,...,12.502000,36.875000,36.6,37.1,37.1,37.1,0.262996,2023-04-21,0.0,0.0
103756,2676371,17444357,2023-04-15,69.800000,61.0,77.0,75.0,61.0,6.379655,89.000,...,12.502000,36.875000,36.6,37.1,37.1,37.1,0.262996,2023-04-21,0.0,0.0


In [4]:
rowsX = []
rowsY = []
for name, group in dataDf.groupby(by=['PATIENT_ID', 'EPISODE_ID']):
    low = 0
    high = 3
    while high <= group.shape[0]:
        valuesList = []
        for col in dataDf.columns[~dataDf.columns.isin(['PATIENT_ID', 'EPISODE_ID', 'measurement_datetime', 'Organism_FIRST_NOTED', 'sepsis'])]:
            valuesList.append(list(group[col][low: high]))
        rowsX.append(
            [[*x] for x in zip(valuesList)]
            )
        rowsY.append(group.target[(high - 1): high].values[0])
        high += 1
        low += 1

In [5]:
rowsX

[[[[63.75, 55.75, 55.0]],
  [[50.0, 48.0, 47.0]],
  [[74.0, 61.0, 69.0]],
  [[74.0, 54.0, 48.0]],
  [[69.0, 48.0, 47.0]],
  [[7.497618669570073, 6.020797289396148, 10.16530045465127]],
  [[96.5, 88.0, 90.0]],
  [[87.0, 82.0, 80.0]],
  [[104.0, 100.0, 101.0]],
  [[87.0, 100.0, 91.0]],
  [[104.0, 82.0, 80.0]],
  [[7.32575365861197, 10.392304845413264, 8.679477710861022]],
  [[75.33333333333333, 61.0, 69.0]],
  [[69.0, 61.0, 61.0]],
  [[83.0, 61.0, 80.0]],
  [[69.0, 61.0, 65.0]],
  [[83.0, 61.0, 61.0]],
  [[7.094598884597589, 7.650143983995853, 8.205689083394116]],
  [[97.875, 98.5, 97.5]],
  [[96.0, 98.0, 97.0]],
  [[99.0, 100.0, 98.0]],
  [[96.0, 98.0, 97.0]],
  [[98.0, 98.0, 98.0]],
  [[1.3562026818605364, 1.000000000000001, 0.5773502691896278]],
  [[21.375, 19.0, 18.0]],
  [[18.0, 18.0, 17.0]],
  [[28.0, 20.0, 19.0]],
  [[22.0, 18.0, 17.0]],
  [[21.0, 18.0, 18.0]],
  [[3.020761493398642, 1.1547005383792517, 0.816496580927726]],
  [[130.625, 114.0, 121.25]],
  [[115.0, 105.0, 111.0]],


In [6]:
import numpy as np


trainX = np.array(rowsX)
testX = np.array(rowsX)
trainY = np.array(rowsY)
testY = np.array(rowsY)


In [7]:
trainX.shape, trainY.shape

((100984, 43, 1, 3), (100984,))

### LSTM

In [11]:
# LSTM for international airline passengers problem with time step regression framing
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


look_back = 3

model = Sequential()
model.add(LSTM(128, input_shape=(look_back, 43)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(trainX, trainY, epochs=10, batch_size=1, verbose=2)
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

Epoch 1/10


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(1, 43, 1, 3), dtype=float32). Expected shape (None, 3, 43), but input has incompatible shape (1, 43, 1, 3)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(1, 43, 1, 3), dtype=float32)
  • training=True
  • mask=None

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


print('accuracy_score', accuracy_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('balanced_accuracy_score', balanced_accuracy_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('precision_score', precision_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('recall_score', recall_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('precision_recall_fscore_support', precision_recall_fscore_support(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('f1_score', f1_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('roc_auc_score', roc_auc_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))

accuracy_score 0.582662839121821
balanced_accuracy_score 0.5
precision_score 0.0
recall_score 0.0
precision_recall_fscore_support (array([0.58266284, 0.        ]), array([1., 0.]), array([0.73630697, 0.        ]), array([14411, 10322]))
f1_score 0.0
roc_auc_score 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print('roc_auc_score', roc_auc_score(trainY, trainPredict))

roc_auc_score 0.4999653042814517


## DNN

In [None]:
from tensorflow import keras


model = keras.Sequential(
    [
        keras.Input(shape=(3, )),
        keras.layers.Dense(4, activation="relu"),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()


In [None]:
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.Accuracy(name="accuracy"),
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
)

model.fit(trainX.reshape((trainX.shape[0], 3)), trainY, epochs=10, batch_size=1, verbose=2)

Epoch 1/10
24733/24733 - 22s - 874us/step - accuracy: 3.2345e-04 - fn: 10261.0000 - fp: 85.0000 - loss: 0.7015 - precision: 0.4178 - recall: 0.0059 - tn: 14326.0000 - tp: 61.0000
Epoch 2/10
24733/24733 - 21s - 859us/step - accuracy: 0.0000e+00 - fn: 10322.0000 - fp: 0.0000e+00 - loss: 0.6808 - precision: 0.0000e+00 - recall: 0.0000e+00 - tn: 14411.0000 - tp: 0.0000e+00
Epoch 3/10
24733/24733 - 21s - 850us/step - accuracy: 0.0000e+00 - fn: 10322.0000 - fp: 0.0000e+00 - loss: 0.6808 - precision: 0.0000e+00 - recall: 0.0000e+00 - tn: 14411.0000 - tp: 0.0000e+00
Epoch 4/10
24733/24733 - 20s - 823us/step - accuracy: 0.0000e+00 - fn: 10322.0000 - fp: 0.0000e+00 - loss: 0.6809 - precision: 0.0000e+00 - recall: 0.0000e+00 - tn: 14411.0000 - tp: 0.0000e+00
Epoch 5/10
24733/24733 - 20s - 828us/step - accuracy: 0.0000e+00 - fn: 10322.0000 - fp: 0.0000e+00 - loss: 0.6804 - precision: 0.0000e+00 - recall: 0.0000e+00 - tn: 14411.0000 - tp: 0.0000e+00
Epoch 6/10
24733/24733 - 20s - 828us/step - accur

<keras.src.callbacks.history.History at 0x7495b5f29a80>

In [None]:
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

[1m  1/773[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m23s[0m 30ms/step

[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 466us/step
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442us/step


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


print('accuracy_score', accuracy_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('balanced_accuracy_score', balanced_accuracy_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('precision_score', precision_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('recall_score', recall_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('precision_recall_fscore_support', precision_recall_fscore_support(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('f1_score', f1_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('roc_auc_score', roc_auc_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))

accuracy_score 0.582662839121821
balanced_accuracy_score 0.5


precision_score 0.0
recall_score 0.0
precision_recall_fscore_support (array([0.58266284, 0.        ]), array([1., 0.]), array([0.73630697, 0.        ]), array([14411, 10322]))
f1_score 0.0
roc_auc_score 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print('roc_auc_score', roc_auc_score(trainY, trainPredict))

roc_auc_score 0.5


## Traditional ML

### NB Classifier model

In [None]:
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
y_score = gnb.fit(trainX.reshape((trainX.shape[0], 3)), trainY).predict_proba(trainX.reshape((trainX.shape[0], 3)))
y_pred = gnb.fit(trainX.reshape((trainX.shape[0], 3)), trainY).predict(trainX.reshape((trainX.shape[0], 3)))


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy', accuracy_score(trainY, y_pred))
print('Balanced Accuracy', balanced_accuracy_score(trainY, y_pred))
print('F1 Score', f1_score(trainY, y_pred))
print('Precision Score', precision_score(trainY, y_pred))
print('Recall Score', recall_score(trainY, y_pred))
print('roc_auc_score', roc_auc_score(trainY, [y[1] for y in y_score]))

Accuracy 0.582662839121821
Balanced Accuracy 0.5
F1 Score 0.0
Precision Score 0.0
Recall Score 0.0
roc_auc_score 0.5301128349674651


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### LR Classifier

In [None]:
from sklearn.linear_model import LogisticRegression


lrc = LogisticRegression(random_state=0)
y_score = lrc.fit(trainX.reshape((trainX.shape[0], 3)), trainY).predict_proba(trainX.reshape((trainX.shape[0], 3)))
y_pred = lrc.fit(trainX.reshape((trainX.shape[0], 3)), trainY).predict(trainX.reshape((trainX.shape[0], 3)))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy', accuracy_score(trainY, y_pred))
print('Balanced Accuracy', balanced_accuracy_score(trainY, y_pred))
print('F1 Score', f1_score(trainY, y_pred))
print('Precision Score', precision_score(trainY, y_pred))
print('Recall Score', recall_score(trainY, y_pred))
print('roc_auc_score', roc_auc_score(trainY, [y[1] for y in y_score]))

Accuracy 0.5820967937573283
Balanced Accuracy 0.49983038358325255
F1 Score 0.0044307455210942015
Precision Score 0.38333333333333336
Recall Score 0.0022282503390815733
roc_auc_score 0.5323054013549764


### XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
# fit model
bst.fit(trainX.reshape((trainX.shape[0], 3)), trainY)
# make predictions
y_score = bst.predict_proba(trainX.reshape((trainX.shape[0], 3)))
y_pred = bst.predict(trainX.reshape((trainX.shape[0], 3)))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy', accuracy_score(trainY, y_pred))
print('Balanced Accuracy', balanced_accuracy_score(trainY, y_pred))
print('F1 Score', f1_score(trainY, y_pred))
print('Precision Score', precision_score(trainY, y_pred))
print('Recall Score', recall_score(trainY, y_pred))
print('roc_auc_score', roc_auc_score(trainY, [y[1] for y in y_score]))

Accuracy 0.5847652933327943
Balanced Accuracy 0.5055151940423774
F1 Score 0.04995374653098982
Precision Score 0.5532786885245902
Recall Score 0.026157721371827165
roc_auc_score 0.5458594172509532
