## Perform sepsis prediction using traditional machine learning models

## Obtain data dict

In [1]:
import os

import sys

from pathlib import Path

sys.path.insert(0, Path(os.environ['EHR_ML_BASE']).as_posix())

from ehrml.utils import DataUtils


datamatrixDirName = Path(os.environ['DATA_DIR'], 'sepsis_prediction', 'blood_pos_cohort_20250416', '04_data_matrix')


dataDict = DataUtils.getDataDict(
    dirPath=Path(datamatrixDirName, 'data_matrix_traditional_ml.csv'),
    idColumns=['person_id', 'visit_occurrence_id'],
    targetColumn='sepsis',
    measurementDateColumn='measurement_date',
    anchorDateColumn='start_date',
    windowStart=30,
    windowEnd=2,
)

In [2]:
dataDict.keys()

dict_keys(['Full', 'VitalsMax', 'VitalsMin', 'VitalsAvg', 'VitalsFirst', 'VitalsLast', 'LabsMax', 'LabsMin', 'LabsAvg', 'LabsFirst', 'LabsLast'])

## NB Classifier model

In [3]:
from sklearn.naive_bayes import GaussianNB


(XTrain, yTrain, XTest, yTest) = dataDict['Full']

gnb = GaussianNB()
y_pred = gnb.fit(XTrain, yTrain).predict(XTest)


  y = column_or_1d(y, warn=True)


In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy', accuracy_score(yTest, y_pred))
print('Balanced Accuracy', balanced_accuracy_score(yTest, y_pred))
print('F1 Score', f1_score(yTest, y_pred))
print('Precision Score', precision_score(yTest, y_pred))
print('Recall Score', recall_score(yTest, y_pred))

Accuracy 0.8370056909382753
Balanced Accuracy 0.6157854186845118
F1 Score 0.12254516889238021
Precision Score 0.07303370786516854
Recall Score 0.3804878048780488


## LR Classifier

In [5]:
from sklearn.linear_model import LogisticRegression


lrc = LogisticRegression(random_state=0)
y_pred = lrc.fit(XTrain, yTrain).predict(XTest)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy', accuracy_score(yTest, y_pred))
print('Balanced Accuracy', balanced_accuracy_score(yTest, y_pred))
print('F1 Score', f1_score(yTest, y_pred))
print('Precision Score', precision_score(yTest, y_pred))
print('Recall Score', recall_score(yTest, y_pred))

Accuracy 0.9692105647161827
Balanced Accuracy 0.5137316192656511
F1 Score 0.053811659192825115
Precision Score 0.3333333333333333
Recall Score 0.02926829268292683


## XGBoost Classifier

In [8]:
from xgboost import XGBClassifier

bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
# fit model
bst.fit(XTrain, yTrain)
# make predictions
y_pred = bst.predict(XTest)


In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy', accuracy_score(yTest, y_pred))
print('Balanced Accuracy', balanced_accuracy_score(yTest, y_pred))
print('F1 Score', f1_score(yTest, y_pred))
print('Precision Score', precision_score(yTest, y_pred))
print('Recall Score', recall_score(yTest, y_pred))

Accuracy 0.9697942506931271
Balanced Accuracy 0.5022133926212908
F1 Score 0.009569377990430622
Precision Score 0.25
Recall Score 0.004878048780487805
