In [1]:
# Cell 1: Imports
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb

from data.load_data import loadTrainingData
from plots.feature_plots import plot_roc_auc, plot_confusion_matrix
from features.create_feature_vectors import extract_features_with_expanding_window

# Lift pandas' display truncation limits for both columns and rows, so
# DataFrames shown in this notebook are rendered without elision.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Cell 2: Load data
# Each directory is globbed for *.psv files and loaded separately; the
# per-directory results are merged into a single patient_dict.
directories = ['../../training_setA/', '../../training_setB/']
max_files = 10000  # Adjust as needed

# Columns handed to the loader via ignore_columns.
ignored_columns = ['Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime']

patient_dict = {}

for directory in directories:
    psv_glob = os.path.join(directory, "*.psv")
    print(f"\nLoading data from: {psv_glob} with max_files={max_files}")
    loaded_patients = loadTrainingData(
        psv_glob,
        max_files,
        ignore_columns=ignored_columns,
    )
    # NOTE(review): update() overwrites entries whose keys repeat across
    # directories — presumably keys are unique per patient; verify against
    # loadTrainingData.
    patient_dict.update(loaded_patients)



Loading data from: ../../training_setA/*.psv with max_files=10000


Loading PSV Files: 100%|████████████████| 10000/10000 [00:08<00:00, 1142.90it/s]



Loading data from: ../../training_setB/*.psv with max_files=10000


Loading PSV Files: 100%|████████████████| 10000/10000 [00:08<00:00, 1236.74it/s]


In [None]:
# Cell 3: Create features
# Build the feature table from the loaded patients.
feature_df = extract_features_with_expanding_window(patient_dict)

# Bookkeeping columns that must not be fed to the model, and the label column.
non_feature_cols = ["patient_id", "window_size"]
target_col = "SepsisLabel"

# Separate features and target in one non-mutating step. feature_df itself is
# left exactly as the extractor returned it, so the identifier columns remain
# available afterwards (e.g. for inspection or a patient-level split).
# errors="ignore" keeps the cell working when a bookkeeping column is absent.
X = feature_df.drop(columns=[*non_feature_cols, target_col], errors="ignore")
y = feature_df[target_col]


(769652, 36)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.it/s]
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.7s00:00<13:31, 24.63i
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.9s00:01<08:13, 40.38i
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    4.4s[00:04<08:17, 39.91
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    7.8s[00:07<08:32, 38.47
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   12.2s[00:12<09:18, 34.98
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   17.6s[00:17<07:40, 42.04
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:   23.5s[00:23<08:52, 35.92
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:   31.3s[00:31<09:18, 33.7
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   40.9s[00:40<10:09, 30.4
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   49.5s[00:49<06:47, 44.6
[Parallel(n_jobs=-1)]: Done 2162 tasks      | elapsed:   59.2s[00:59<07:17, 40.7
[Parallel(n_jobs=-1)]: Done 

In [None]:
# Testing Rolling Box
# Grab the first patient's frame for a quick look. next(iter(...)) fails
# loudly on an empty patient_dict instead of leaving `test` undefined (or
# stale from a previous run), and leaves no stray loop variable behind.
test = next(iter(patient_dict.values()))

test.head(10)

In [None]:
# Preview the first ten rows of the feature table.
feature_df.head(n=10)

In [None]:
# Cell 4: Train/test split
# Count each class by its label value (0 = negative, 1 = positive) rather than
# unpacking y.value_counts(): that result is ordered by frequency, so the two
# names would silently swap if positives ever outnumbered negatives, and the
# unpacking would fail obscurely if a class were missing.
neg_samples = int((y == 0).sum())
pos_samples = int((y == 1).sum())
if neg_samples == 0 or pos_samples == 0:
    raise ValueError(
        "Both classes are required for training: "
        f"negative={neg_samples}, positive={pos_samples}"
    )
print(f"Negative samples: {neg_samples}, Positive samples: {pos_samples}")

# NOTE(review): this is a row-level split. If the feature table holds several
# expanding-window rows per patient (looks likely — TODO confirm), rows from
# the same patient can land in both train and test and inflate the metrics;
# consider a patient-grouped split instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # keep the class ratio the same in both partitions
)

In [None]:
# Cell 5: Train model
# Hyperparameters are collected in one place; scale_pos_weight is set to the
# negative/positive ratio computed from the class counts.
xgb_params = {
    "random_state": 42,
    "objective": 'binary:logistic',
    "eval_metric": "auc",
    "scale_pos_weight": neg_samples / pos_samples,
}
model = xgb.XGBClassifier(**xgb_params)

# The held-out split is supplied as eval_set so the eval metric is evaluated
# on it while fitting.
held_out = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=held_out, verbose=1)

In [None]:
# Cell 6: Evaluate model
# Positive-class probabilities (column 1 of predict_proba) and hard labels
# for the held-out split.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# ROC curve, drawn by the project helper from the fitted model.
plot_roc_auc(model, X_test, y_test)

# Confusion matrix of true vs. predicted labels.
class_names = ("No Sepsis", "Sepsis")
plot_confusion_matrix(y_test, y_pred, labels=class_names)

# Text summary of the classification metrics.
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)