In [None]:
import lgbm_pipeline.feature_load as loader
import lgbm_pipeline.feature_extraction as extractor
from lgbm_pipeline.feature_extraction import create_windows

from tqdm import tqdm
import fireducks.pandas as pd
%load_ext fireducks.ipyext
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import fbeta_score, make_scorer, RocCurveDisplay, ConfusionMatrixDisplay, classification_report
import xgboost as xgb

In [None]:
# Load every .psv file matching the glob (max_files=None is passed to request no file cap).
# The result is iterated patient-by-patient in the following cell, so it is annotated as a
# collection of per-patient frames rather than a single DataFrame.
# NOTE(review): assumed to be a list; confirm against loader.load_data's return type.
patients: list[pd.DataFrame] = loader.load_data("../training_set?/*.psv", max_files=None)

In [None]:
# Partition the loaded patients into two cohorts, depending on whether any row of the
# patient's "SepsisLabel" column is truthy.
sepsis_patients: list[pd.DataFrame] = []
non_sepsis_patients: list[pd.DataFrame] = []

for patient in tqdm(patients, desc="Converting indices to time series and splitting sepsis/non-sepsis"):
	# Reinterpret the existing index values as hour offsets (mutates the frame in place).
	patient.index = pd.to_timedelta(patient.index, 'h')
	cohort = sepsis_patients if patient["SepsisLabel"].any() else non_sepsis_patients
	cohort.append(patient)

In [None]:
# Split each cohort separately so both the train and test sets contain sepsis and
# non-sepsis patients. The split is at patient level, so all rows of a patient stay together.
# Both calls share the same seed and rely on train_test_split's default split size.
split_options = {"random_state": 42}
train_sepsis_patients, test_sepsis_patients = train_test_split(sepsis_patients, **split_options)
train_non_sepsis_patients, test_non_sepsis_patients = train_test_split(non_sepsis_patients, **split_options)

In [None]:
from sklearn.utils import shuffle

# Undersample the non-sepsis training cohort down to the size of the sepsis cohort.
# shuffle() samples without replacement and raises ValueError when n_samples exceeds the
# number of available items, so cap the request at the size of the non-sepsis cohort
# (relevant when only a small subset of files is loaded).
n_non_sepsis_to_keep = min(len(train_sepsis_patients), len(train_non_sepsis_patients))
train_non_sepsis_patients = shuffle(train_non_sepsis_patients, random_state=42, n_samples=n_non_sepsis_to_keep)

In [None]:
# Patient-level ratio of non-sepsis to sepsis patients in the training cohort.
# Because the non-sepsis cohort was just undersampled to the sepsis cohort's size,
# this is 1.0 here; it counts patients, not rows.
ratio: float = len(train_non_sepsis_patients) / len(train_sepsis_patients)
print(f"Ratio: {ratio}")

# Combined cohorts: sepsis patients first, followed by non-sepsis patients.
train_patients: list[pd.DataFrame] = train_sepsis_patients + train_non_sepsis_patients
test_patients: list[pd.DataFrame] = test_sepsis_patients + test_non_sepsis_patients

# One message per line; the trailing "\n" on the third message leaves a blank line
# between the training and testing summaries.
print(
	f"Number of sepsis patients in training set: {len(train_sepsis_patients)}",
	f"Number of non-sepsis patients in training set: {len(train_non_sepsis_patients)}",
	f"Number of patients in training set: {len(train_patients)}\n",
	f"Number of sepsis patients in testing set: {len(test_sepsis_patients)}",
	f"Number of non-sepsis patients in testing set: {len(test_non_sepsis_patients)}",
	f"Number of patients in testing set: {len(test_patients)}",
	sep="\n",
)

In [None]:
# Run extractor.fill over the training patients once per fill method, in the order
# FORWARD, BACKWARD, LINEAR. Each result is kept under its own name for later cells.
train_patients_forward: list[pd.DataFrame]
train_patients_backward: list[pd.DataFrame]
train_patients_linear: list[pd.DataFrame]
train_patients_forward, train_patients_backward, train_patients_linear = (
	extractor.fill(train_patients, fill_method)
	for fill_method in (
		extractor.FillMethod.FORWARD,
		extractor.FillMethod.BACKWARD,
		extractor.FillMethod.LINEAR,
	)
)

In [None]:
# Map each fill method to the training patients filled with that method.
fill_method_to_train_patients: dict[extractor.FillMethod, list[pd.DataFrame]] = dict(zip(
	(extractor.FillMethod.FORWARD, extractor.FillMethod.BACKWARD, extractor.FillMethod.LINEAR),
	(train_patients_forward, train_patients_backward, train_patients_linear),
))

# Choose a fill method per feature using the training data only; the same mapping is
# reused for the test patients in a later cell.
# NOTE(review): the selection criterion lives in extractor.best_fill_method_for_feature.
fill_methods_to_use: dict[str, extractor.FillMethod] = extractor.best_fill_method_for_feature(
	fill_method_to_train_patients
)

# Combine the three filled variants according to the per-feature choice.
train_patients_mixed: list[pd.DataFrame] = extractor.mixed_fill(
	train_patients,
	train_patients_forward,
	train_patients_backward,
	train_patients_linear,
	fill_methods_to_use,
)

In [None]:
# Fill the test patients with each method (FORWARD, BACKWARD, LINEAR, in that order).
test_fills = {
	fill_method: extractor.fill(test_patients, fill_method)
	for fill_method in (
		extractor.FillMethod.FORWARD,
		extractor.FillMethod.BACKWARD,
		extractor.FillMethod.LINEAR,
	)
}
test_patients_forward: list[pd.DataFrame] = test_fills[extractor.FillMethod.FORWARD]
test_patients_backward: list[pd.DataFrame] = test_fills[extractor.FillMethod.BACKWARD]
test_patients_linear: list[pd.DataFrame] = test_fills[extractor.FillMethod.LINEAR]

# Reuse the per-feature fill choice made on the training data, so the test set does not
# influence which fill method is selected.
test_patients_mixed: list[pd.DataFrame] = extractor.mixed_fill(
	test_patients,
	test_patients_forward,
	test_patients_backward,
	test_patients_linear,
	fill_methods_to_use,
)

In [None]:
# Apply create_windows to every patient frame, training cohort first, then test cohort.
# NOTE(review): the return value is discarded, so this only has an effect if
# create_windows modifies the frame in place — confirm in lgbm_pipeline.feature_extraction.
for patient_group in (train_patients_mixed, test_patients_mixed):
	for patient in patient_group:
		create_windows(patient)

In [None]:
# Stack the per-patient frames into one row-level table per split.
train = pd.concat(train_patients_mixed)
test = pd.concat(test_patients_mixed)

# Separate the target column from the features; drop() returns a new frame, so
# `train` and `test` keep their "SepsisLabel" column.
X_train, y_train = train.drop(columns="SepsisLabel"), train["SepsisLabel"]
X_test, y_test = test.drop(columns="SepsisLabel"), test["SepsisLabel"]

In [None]:
# F-beta scorer with beta=5.5 (weights recall far more heavily than precision).
f = make_scorer(fbeta_score, beta=5.5)

# scale_pos_weight should reflect the imbalance of the rows the model is trained on.
# The patient-level `ratio` is 1.0 after undersampling, which would leave the weight at
# its default and ignore the imbalance between positive and negative rows. Derive the
# weight from the training labels instead: negative rows / positive rows.
# NOTE(review): assumes SepsisLabel is encoded as 0/1 — confirm.
n_positive_rows = int((y_train == 1).sum())
n_negative_rows = int((y_train == 0).sum())
# Fall back to the neutral weight when there are no positive rows (avoids division by zero).
row_scale_pos_weight: float = n_negative_rows / n_positive_rows if n_positive_rows else 1.0

clf = xgb.XGBClassifier(objective="binary:logistic", eval_metric="auc", scale_pos_weight=row_scale_pos_weight)
bst = clf.fit(X_train, y_train)

In [None]:
# Class predictions for the held-out test rows, produced by the fitted classifier's predict().
y_pred = bst.predict(X_test)

In [None]:
# A ROC curve is traced by sweeping a threshold over continuous scores. Passing the hard
# class predictions would give only a single operating point, so use the predicted
# probability of the positive class (column 1 of predict_proba) instead.
y_score = bst.predict_proba(X_test)[:, 1]
RocCurveDisplay.from_predictions(y_test, y_score)

# The confusion matrix and classification report are threshold-dependent, so they keep
# using the hard class predictions.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
print(classification_report(y_test, y_pred))