In [1]:
import os
import pickle

import lgbm_pipeline.feature_load as loader
import lgbm_pipeline.feature_extraction as extractor

from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, make_scorer, RocCurveDisplay, ConfusionMatrixDisplay, classification_report
import lightgbm as lgbm
import xgboost as xgb

In [2]:
patients: list[pd.DataFrame] = loader.load_training_data(f"../training_setA/*.psv")
# f = open("patients", "w")
# pickle.dump(patients, f)

Loading PSV Files: 100%|██████████| 20336/20336 [00:26<00:00, 765.61it/s]


In [3]:
sepsis_patients: list[pd.DataFrame] = []
non_sepsis_patients: list[pd.DataFrame] = []

for patient in tqdm(patients, "Converting indices to timedeltas"):
    patient.index = pd.to_timedelta(patient.index, 'h')
    if patient["SepsisLabel"].any():
        sepsis_patients.append(patient)
    else:
        non_sepsis_patients.append(patient)

train_sepsis_patients, test_sepsis_patients = train_test_split(sepsis_patients)
train_non_sepsis_patients, test_non_sepsis_patients = train_test_split(non_sepsis_patients)

weight: float = len(train_non_sepsis_patients)/len(train_sepsis_patients)

train_patients: list[pd.DataFrame] = train_sepsis_patients + train_non_sepsis_patients
test_patients: list[pd.DataFrame] = test_sepsis_patients + test_non_sepsis_patients

print(f"Number of sepsis patients in training set: {len(train_sepsis_patients)}")
print(f"Number of non-sepsis patients in training set: {len(train_non_sepsis_patients)}")
print(f"Number of patients in training set: {len(train_patients)}\n")
print(f"Number of sepsis patients in testing set: {len(test_sepsis_patients)}")
print(f"Number of non-sepsis patients in testing set: {len(test_non_sepsis_patients)}")
print(f"Number of patients in testing set: {len(test_patients)}")

Converting indices to timedeltas: 100%|██████████| 20336/20336 [00:03<00:00, 6659.13it/s]

Number of sepsis patients in training set: 1342
Number of non-sepsis patients in training set: 13909
Number of patients in training set: 15251

Number of sepsis patients in testing set: 448
Number of non-sepsis patients in testing set: 4637
Number of patients in testing set: 5085





In [4]:
train_patients_forward: list[pd.DataFrame] = extractor.fill(train_patients, extractor.FillMethod.FORWARD)
train_patients_backward: list[pd.DataFrame] = extractor.fill(train_patients, extractor.FillMethod.BACKWARD)
train_patients_linear: list[pd.DataFrame] = extractor.fill(train_patients, extractor.FillMethod.LINEAR)

Filling gaps in patient data: 100%|██████████| 15251/15251 [00:01<00:00, 8073.56it/s]
Filling gaps in patient data: 100%|██████████| 15251/15251 [00:02<00:00, 7390.36it/s]
Filling gaps in patient data: 100%|██████████| 15251/15251 [00:20<00:00, 757.62it/s]


In [5]:
fill_method_to_train_patients: dict[extractor.FillMethod, list[pd.DataFrame]] = {extractor.FillMethod.FORWARD: train_patients_forward,
                              extractor.FillMethod.BACKWARD: train_patients_backward,
							  extractor.FillMethod.LINEAR: train_patients_linear}
fill_methods_to_use: dict[str, extractor.FillMethod] = extractor.select_best_fill_methods(fill_method_to_train_patients)

Computing correlation matrices: 100%|██████████| 3/3 [00:31<00:00, 10.51s/it]
100%|██████████| 3/3 [00:00<00:00, 7979.02it/s]0/37 [00:00<?, ?it/s]
100%|██████████| 3/3 [00:00<00:00, 19358.33it/s]
100%|██████████| 3/3 [00:00<00:00, 15907.60it/s]
100%|██████████| 3/3 [00:00<00:00, 10364.84it/s]
100%|██████████| 3/3 [00:00<00:00, 10520.83it/s]
100%|██████████| 3/3 [00:00<00:00, 13827.38it/s]
100%|██████████| 3/3 [00:00<00:00, 10313.86it/s]
100%|██████████| 3/3 [00:00<00:00, 10556.13it/s]
100%|██████████| 3/3 [00:00<00:00, 9383.23it/s]
100%|██████████| 3/3 [00:00<00:00, 11234.74it/s]
100%|██████████| 3/3 [00:00<00:00, 9144.56it/s]
100%|██████████| 3/3 [00:00<00:00, 6743.25it/s]
100%|██████████| 3/3 [00:00<00:00, 10131.17it/s]
100%|██████████| 3/3 [00:00<00:00, 4730.42it/s]
100%|██████████| 3/3 [00:00<00:00, 11904.36it/s]
100%|██████████| 3/3 [00:00<00:00, 9876.70it/s]
100%|██████████| 3/3 [00:00<00:00, 9058.97it/s]
100%|██████████| 3/3 [00:00<00:00, 9218.25it/s]
100%|██████████| 3/3 [00:00

In [6]:
test_patients_forward: list[pd.DataFrame] = extractor.fill(test_patients, extractor.FillMethod.FORWARD)
test_patients_backward: list[pd.DataFrame] = extractor.fill(test_patients, extractor.FillMethod.BACKWARD)
test_patients_linear: list[pd.DataFrame] = extractor.fill(test_patients, extractor.FillMethod.LINEAR)

Filling gaps in patient data: 100%|██████████| 5085/5085 [00:00<00:00, 8199.85it/s]
Filling gaps in patient data: 100%|██████████| 5085/5085 [00:00<00:00, 9471.91it/s] 
Filling gaps in patient data: 100%|██████████| 5085/5085 [00:06<00:00, 831.47it/s]


In [7]:
train_patients_mixed = extractor.mixed_fill(train_patients, train_patients_forward, train_patients_backward, train_patients_linear, fill_methods_to_use)
test_patients_mixed = extractor.mixed_fill(test_patients, test_patients_forward, test_patients_backward, test_patients_linear, fill_methods_to_use)

Doing mixed fill: 100%|██████████| 15251/15251 [01:13<00:00, 207.20it/s]
Doing mixed fill: 100%|██████████| 5085/5085 [00:23<00:00, 213.67it/s]


In [8]:
# Find the maximum length of the DataFrames in train_patients_mixed
max_length = max(len(df) for df in train_patients_mixed)

# Adjust the length of each DataFrame in X_train to match the maximum length and forward-fill missing values
X_train: list[pd.DataFrame] = []
y_train: list[pd.Series] = []

for j in tqdm(range(len(train_patients_mixed)), "Extending indices and splitting into (X_train, y_train)"):
    df = train_patients_mixed[j]
    # Generate a new index that extends to the maximum length
    new_index = pd.timedelta_range(start=df.index[0], periods=max_length, freq='h')
    df = df.reindex(new_index).ffill()  # Reindex to the new index and forward-fill
    X_train.append(df.drop(columns="SepsisLabel", inplace=False))
    y_train.append(df["SepsisLabel"])

# Adjust the length of each DataFrame in X_test similarly
X_test: list[pd.DataFrame] = []
y_test: list[pd.Series] = []

for k in tqdm(range(len(test_patients_mixed)), "Extending indices and splitting into (X_test, y_test)"):
    df = test_patients_mixed[k]
    new_index = pd.timedelta_range(start=df.index[0], periods=max_length, freq='h')
    df = df.reindex(new_index).ffill()
    X_test.append(df.drop(columns="SepsisLabel", inplace=False))
    y_test.append(df["SepsisLabel"])

Extending indices and splitting into (X_train, y_train): 100%|██████████| 15251/15251 [00:43<00:00, 354.28it/s]
Extending indices and splitting into (X_test, y_test): 100%|██████████| 5085/5085 [00:17<00:00, 292.72it/s]


In [9]:
f = make_scorer(fbeta_score, beta=1)

clf = xgb.XGBClassifier(objective="binary:logistic", eval_metric=f, scale_pos_weight=weight)
bst = clf.fit(X_train, y_train)

ValueError: Please reshape the input data into 2-dimensional matrix.

In [None]:
y_pred = bst.predict(X_test)

In [None]:
RocCurveDisplay.from_predictions(y_test, y_pred)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
print(classification_report(y_test, y_pred))