# Build LSTM model

## Read data

In [1]:
import os
from pathlib import Path

import pandas as pd


dataDirName = Path(os.environ['DATA_DIR'], 'sepsis_prediction', 'lstm_initial_trials', '02_data_matrix', 'balanced_data')
dataDirName.mkdir(exist_ok=True, parents=True)

dataDf = pd.read_csv(Path(dataDirName, 'data_temp_variables.csv'))
dataDf = dataDf.sort_values(by=['PATIENT_ID', 'EPISODE_ID', 'measurement_datetime'], ascending=True)
dataDf

Unnamed: 0,PATIENT_ID,EPISODE_ID,measurement_datetime,concept_name,value_mean,value_std,value_open,value_high,value_low,value_close,Organism_FIRST_NOTED,sepsis,target
0,56749,14126754,2020-11-23,Temperature,36.619017,0.000000,0.0,0.0,0.0,0.0,2021-07-30,0.0,0.0
1,56749,14126754,2020-11-24,Temperature,37.150000,0.288675,37.4,37.4,36.9,36.9,2021-07-30,0.0,0.0
2,56749,14126754,2020-11-25,Temperature,36.800000,0.000000,36.8,36.8,36.8,36.8,2021-07-30,0.0,0.0
3,62567,12648702,2019-07-25,Temperature,37.771429,0.596418,38.6,38.6,36.9,37.7,2019-07-26,1.0,0.0
4,62567,12648702,2019-07-26,Temperature,37.000000,0.427618,37.2,37.8,36.3,36.8,2019-07-26,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27370,2669870,17327467,2023-02-03,Temperature,36.300000,0.327872,36.1,37.0,36.0,37.0,2023-01-23,0.0,0.0
27371,2669870,17327467,2023-02-04,Temperature,36.750000,0.361939,37.2,37.2,36.1,36.8,2023-01-23,0.0,0.0
27372,2669870,17327467,2023-02-05,Temperature,36.483333,0.292689,36.2,36.8,36.1,36.8,2023-01-23,0.0,0.0
27373,2669870,17327467,2023-02-06,Temperature,36.583333,0.392003,36.8,37.0,36.0,36.7,2023-01-23,0.0,0.0


In [2]:
rowsX = []
rowsY = []
for name, group in dataDf.groupby(by=['PATIENT_ID', 'EPISODE_ID']):
    low = 0
    high = 3
    while high <= group.shape[0]:
        rowsX.append(
            [[*x] for x in zip(list(group.value_mean[low: high]), list(group.value_std[low: high]), list(group.value_open[low: high]), list(group.value_high[low: high]), list(group.value_low[low: high]), list(group.value_close[low: high]))]
            )
        rowsY.append(group.target[(high - 1): high].values[0])
        high += 1
        low += 1

In [3]:
rowsX

[[[36.61901735331418, 0.0, 0.0, 0.0, 0.0, 0.0],
  [37.15, 0.2886751345948128, 37.4, 37.4, 36.9, 36.9],
  [36.8, 0.0, 36.8, 36.8, 36.8, 36.8]],
 [[37.771428571428565, 0.5964178784328027, 38.6, 38.6, 36.9, 37.7],
  [37.0, 0.4276179870598799, 37.2, 37.8, 36.3, 36.8],
  [36.66666666666666, 0.3983298465677236, 36.7, 37.0, 36.0, 36.4]],
 [[37.0, 0.4276179870598799, 37.2, 37.8, 36.3, 36.8],
  [36.66666666666666, 0.3983298465677236, 36.7, 37.0, 36.0, 36.4],
  [37.016666666666666, 0.3311595788538575, 36.8, 37.4, 36.7, 37.4]],
 [[36.66666666666666, 0.3983298465677236, 36.7, 37.0, 36.0, 36.4],
  [37.016666666666666, 0.3311595788538575, 36.8, 37.4, 36.7, 37.4],
  [36.81666666666667, 0.4445971959725647, 36.4, 37.4, 36.2, 36.2]],
 [[37.016666666666666, 0.3311595788538575, 36.8, 37.4, 36.7, 37.4],
  [36.81666666666667, 0.4445971959725647, 36.4, 37.4, 36.2, 36.2],
  [36.75, 0.2880972058177575, 36.9, 37.1, 36.4, 36.4]],
 [[36.81666666666667, 0.4445971959725647, 36.4, 37.4, 36.2, 36.2],
  [36.75, 0.2880

In [4]:
import numpy as np


trainX = np.array(rowsX)
testX = np.array(rowsX)
trainY = np.array(rowsY)
testY = np.array(rowsY)


In [5]:
trainX.shape, trainY.shape

((24733, 3, 6), (24733,))

### LSTM

In [6]:
# LSTM for international airline passengers problem with time step regression framing
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


look_back = 3

model = Sequential()
model.add(LSTM(24, input_shape=(look_back, 6)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(trainX, trainY, epochs=10, batch_size=1, verbose=2)
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

2025-05-13 08:16:02.536957: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-13 08:16:02.537979: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-13 08:16:02.541085: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-13 08:16:02.549554: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747124162.563709  361889 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747124162.56

Epoch 1/10


2025-05-13 08:16:04.241680: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  super().__init__(**kwargs)


24733/24733 - 25s - 998us/step - accuracy: 0.5798 - loss: 0.6799
Epoch 2/10
24733/24733 - 24s - 980us/step - accuracy: 0.5826 - loss: 0.6786
Epoch 3/10
24733/24733 - 24s - 964us/step - accuracy: 0.5827 - loss: 0.6786
Epoch 4/10
24733/24733 - 25s - 995us/step - accuracy: 0.5827 - loss: 0.6784
Epoch 5/10
24733/24733 - 24s - 989us/step - accuracy: 0.5825 - loss: 0.6782
Epoch 6/10
24733/24733 - 24s - 987us/step - accuracy: 0.5828 - loss: 0.6784
Epoch 7/10
24733/24733 - 25s - 1ms/step - accuracy: 0.5828 - loss: 0.6786
Epoch 8/10
24733/24733 - 25s - 991us/step - accuracy: 0.5829 - loss: 0.6785
Epoch 9/10
24733/24733 - 25s - 999us/step - accuracy: 0.5827 - loss: 0.6783
Epoch 10/10
24733/24733 - 25s - 1ms/step - accuracy: 0.5827 - loss: 0.6780
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 714us/step
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step


In [7]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


print('accuracy_score', accuracy_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('balanced_accuracy_score', balanced_accuracy_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('precision_score', precision_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('recall_score', recall_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('precision_recall_fscore_support', precision_recall_fscore_support(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('f1_score', f1_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('roc_auc_score', roc_auc_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))

accuracy_score 0.582824566368819
balanced_accuracy_score 0.5001937608990505
precision_score 1.0
recall_score 0.00038752179810114316
precision_recall_fscore_support (array([0.58275709, 1.        ]), array([1.00000000e+00, 3.87521798e-04]), array([0.73638222, 0.00077474]), array([14411, 10322]))
f1_score 0.0007747433662599264
roc_auc_score 0.5001937608990505


In [8]:
print('roc_auc_score', roc_auc_score(trainY, trainPredict))

roc_auc_score 0.5243639540674132


## DNN

In [9]:
from tensorflow import keras


model = keras.Sequential(
    [
        keras.Input(shape=(18, )),
        keras.layers.Dense(24, activation="relu"),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()


In [11]:
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.Accuracy(name="accuracy"),
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
)

model.fit(trainX.reshape((trainX.shape[0], 18)), trainY, epochs=10, batch_size=1, verbose=2)

Epoch 1/10
24733/24733 - 22s - 882us/step - accuracy: 1.2130e-04 - fn: 10221.0000 - fp: 120.0000 - loss: 0.7015 - precision: 0.4570 - recall: 0.0098 - tn: 14291.0000 - tp: 101.0000
Epoch 2/10
24733/24733 - 21s - 869us/step - accuracy: 0.0000e+00 - fn: 10322.0000 - fp: 0.0000e+00 - loss: 0.6806 - precision: 0.0000e+00 - recall: 0.0000e+00 - tn: 14411.0000 - tp: 0.0000e+00
Epoch 3/10
24733/24733 - 21s - 861us/step - accuracy: 0.0000e+00 - fn: 10322.0000 - fp: 0.0000e+00 - loss: 0.6808 - precision: 0.0000e+00 - recall: 0.0000e+00 - tn: 14411.0000 - tp: 0.0000e+00
Epoch 4/10
24733/24733 - 21s - 856us/step - accuracy: 0.0000e+00 - fn: 10322.0000 - fp: 0.0000e+00 - loss: 0.6808 - precision: 0.0000e+00 - recall: 0.0000e+00 - tn: 14411.0000 - tp: 0.0000e+00
Epoch 5/10
24733/24733 - 21s - 837us/step - accuracy: 0.0000e+00 - fn: 10322.0000 - fp: 0.0000e+00 - loss: 0.6806 - precision: 0.0000e+00 - recall: 0.0000e+00 - tn: 14411.0000 - tp: 0.0000e+00
Epoch 6/10
24733/24733 - 21s - 844us/step - acc

<keras.src.callbacks.history.History at 0x767c86bb56c0>

In [14]:
trainPredict = model.predict(trainX.reshape((trainX.shape[0], 18)))
testPredict = model.predict(testX.reshape((trainX.shape[0], 18)))

[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460us/step
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 437us/step


In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


print('accuracy_score', accuracy_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('balanced_accuracy_score', balanced_accuracy_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('precision_score', precision_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('recall_score', recall_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('precision_recall_fscore_support', precision_recall_fscore_support(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('f1_score', f1_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))
print('roc_auc_score', roc_auc_score(trainY, [1 if (x[0] > 0.5) else 0 for x in trainPredict]))

accuracy_score 0.582662839121821
balanced_accuracy_score 0.5
precision_score 0.0
recall_score 0.0
precision_recall_fscore_support (array([0.58266284, 0.        ]), array([1., 0.]), array([0.73630697, 0.        ]), array([14411, 10322]))
f1_score 0.0
roc_auc_score 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
print('roc_auc_score', roc_auc_score(trainY, trainPredict))

roc_auc_score 0.5


## Traditional ML

### NB Classifier model

In [18]:
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
y_score = gnb.fit(trainX.reshape((trainX.shape[0], 18)), trainY).predict_proba(trainX.reshape((trainX.shape[0], 18)))
y_pred = gnb.fit(trainX.reshape((trainX.shape[0], 18)), trainY).predict(trainX.reshape((trainX.shape[0], 18)))


In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy', accuracy_score(trainY, y_pred))
print('Balanced Accuracy', balanced_accuracy_score(trainY, y_pred))
print('F1 Score', f1_score(trainY, y_pred))
print('Precision Score', precision_score(trainY, y_pred))
print('Recall Score', recall_score(trainY, y_pred))
print('roc_auc_score', roc_auc_score(trainY, [y[1] for y in y_score]))

Accuracy 0.41713500181943153
Balanced Accuracy 0.4942325073780335
F1 Score 0.5790457279682298
Precision Score 0.41443738505266675
Recall Score 0.9605696570432087
roc_auc_score 0.5077259318166811


### LR Classifier

In [20]:
from sklearn.linear_model import LogisticRegression


lrc = LogisticRegression(random_state=0)
y_score = lrc.fit(trainX.reshape((trainX.shape[0], 18)), trainY).predict_proba(trainX.reshape((trainX.shape[0], 18)))
y_pred = lrc.fit(trainX.reshape((trainX.shape[0], 18)), trainY).predict(trainX.reshape((trainX.shape[0], 18)))

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy', accuracy_score(trainY, y_pred))
print('Balanced Accuracy', balanced_accuracy_score(trainY, y_pred))
print('F1 Score', f1_score(trainY, y_pred))
print('Precision Score', precision_score(trainY, y_pred))
print('Recall Score', recall_score(trainY, y_pred))
print('roc_auc_score', roc_auc_score(trainY, [y[1] for y in y_score]))

Accuracy 0.5831480208628148
Balanced Accuracy 0.5019007552937257
F1 Score 0.020520615618468553
Precision Score 0.5294117647058824
Recall Score 0.010463088548730867
roc_auc_score 0.5328281665396104


### XGBoost Classifier

In [22]:
from xgboost import XGBClassifier

bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
# fit model
bst.fit(trainX.reshape((trainX.shape[0], 18)), trainY)
# make predictions
y_score = bst.predict_proba(trainX.reshape((trainX.shape[0], 18)))
y_pred = bst.predict(trainX.reshape((trainX.shape[0], 18)))

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy', accuracy_score(trainY, y_pred))
print('Balanced Accuracy', balanced_accuracy_score(trainY, y_pred))
print('F1 Score', f1_score(trainY, y_pred))
print('Precision Score', precision_score(trainY, y_pred))
print('Recall Score', recall_score(trainY, y_pred))
print('roc_auc_score', roc_auc_score(trainY, [y[1] for y in y_score]))

Accuracy 0.5827032709335705
Balanced Accuracy 0.525943089932526
F1 Score 0.2675466609892839
Precision Score 0.5001326611833378
Recall Score 0.18261964735516373
roc_auc_score 0.5527094888965027
