# Import Modules

In [1]:
import sys
import os
import pandas as pd
import tqdm as tqdm
import xgboost as xgb
import numpy as np
import optuna

# Import your custom modules. Adjust the module paths as needed.
from src.data.load_data import loadTrainingData
from sklearn.model_selection import train_test_split
from src.features.create_feature_vectors import extract_features_with_expanding_window
from src.plots.feature_plots import plot_roc_auc, plot_confusion_matrix
from sklearn.metrics import classification_report,fbeta_score,confusion_matrix
from sklearn import set_config

set_config(display="text")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_rows', None)


  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [2]:
# Where the training files live and how many to read from each directory.
directories = ['../../training_setA/', '../../training_setB/']
max_files = None  # Adjust as needed

# Columns handed to the loader via `ignore_columns`.
ignored_columns = ('Unit2', 'Unit1', 'ICULOS', 'HospAdmTime')

# Accumulates the loader's results for every directory into one mapping;
# dict.update means a later directory overwrites any key it shares with an earlier one.
patient_dict = {}

for directory in directories:
    pattern = os.path.join(directory, "*.psv")
    print(f"\nLoading data from: {pattern} with max_files={max_files}")
    # A fresh list is built for every call, just like an inline list literal would be.
    patient_dict.update(
        loadTrainingData(pattern, max_files, ignore_columns=list(ignored_columns))
    )


Loading data from: ../../training_setA/*.psv with max_files=None


Loading PSV Files: 100%|██████████| 20336/20336 [00:35<00:00, 569.99it/s]



Loading data from: ../../training_setB/*.psv with max_files=None


Loading PSV Files: 100%|██████████| 20000/20000 [00:35<00:00, 563.04it/s]


# Create Feature Vectors

In [None]:
# Run the expanding-window feature extraction over everything in patient_dict.
# NOTE(review): the output stored with this cell shows the run aborting at 16 of
# 40336 items with "LinAlgError: SVD did not converge in Linear Least Squares"
# (preceded by Intel MKL DGELSD errors), so the recorded run never produced
# feature_df. Investigate the failing input before relying on the cells below.
feature_df = extract_features_with_expanding_window(patient_dict)

Extracting features with expanding window:   0%|          | 0/40336 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
Extracting features with expanding window:   0%|          | 16/40336 [00:00<21:45, 30.89it/s]


Intel MKL ERROR: Parameter 6 was incorrect on entry to DGELSD.

Intel MKL ERROR: Parameter 6 was incorrect on entry to DGELSD.

Intel MKL ERROR: Parameter 6 was incorrect on entry to DGELSD.

Intel MKL ERROR: Parameter 6 was incorrect on entry to DGELSD.

Intel MKL ERROR: Parameter 6 was incorrect on entry to DGELSD.


LinAlgError: SVD did not converge in Linear Least Squares

Extracting features with expanding window:   0%|          | 16/40336 [00:14<21:45, 30.89it/s]

# Save or Load Features

In [None]:
def save_feature_data(feature_df, file_path="feature_data.pkl"):
    """Pickle ``feature_df`` to ``file_path`` and print a confirmation.

    Args:
        feature_df: The DataFrame to persist.
        file_path: Destination of the pickle file (defaults to
            ``feature_data.pkl`` in the current working directory).

    Returns:
        None.
    """
    pd.to_pickle(feature_df, file_path)
    print(f"Feature data saved to {file_path}")

def load_feature_data(file_path="feature_data.pkl"):
    """Read a pickled feature table back from ``file_path``.

    Args:
        file_path: Location of the pickle file (defaults to
            ``feature_data.pkl`` in the current working directory).

    Returns:
        The object stored in the pickle file.
    """
    # Unpickling can execute arbitrary code: only point this at files you created yourself.
    restored = pd.read_pickle(file_path)
    print(f"Feature data loaded from {file_path}")
    return restored

# Toggle between the two lines below: uncomment the load to reuse a previously
# saved pickle, or keep the save to persist the feature_df currently in memory.
# feature_df = load_feature_data()
save_feature_data(feature_df)

# Add/Remove features

In [None]:
# Display the column labels of the feature table.
feature_df.columns

In [None]:
# Display the (rows, columns) dimensions of the feature table.
feature_df.shape

# Split into Train and Test Sets by Patient ID

In [None]:
def _with_sepsis_suffix(patient_id, rows):
    """Return ``patient_id`` suffixed with ``_1`` if any SepsisLabel in ``rows`` is truthy, else ``_0``."""
    sepsis_label = "1" if rows["SepsisLabel"].any() else "0"
    return f"{patient_id}_{sepsis_label}"


# One DataFrame of rows per patient, keyed by patient_id.
patient_groups = {pid: rows for pid, rows in feature_df.groupby("patient_id")}

# Same frames, re-keyed so the key itself records whether the patient ever had sepsis.
labeled_patients = {
    _with_sepsis_suffix(pid, rows): rows for pid, rows in patient_groups.items()
}

# Report the patient-level class balance.
sepsis_count = sum(key.endswith('_1') for key in labeled_patients)
nonsepsis_count = sum(key.endswith('_0') for key in labeled_patients)
print(f"Number of SEPSIS patients: {sepsis_count}")
print(f"Number of NON-SEPSIS patients: {nonsepsis_count}")

# Patient-level stratification labels, one per key and in the same order as `keys`.
# The keys are built as f"{patient_id}_{sepsis_label}" with sepsis_label "1" or "0",
# so a septic patient is identified by the "_1" suffix. Testing for any other suffix
# would make every label 0 and silently turn the stratified split into a plain
# random split.
keys = list(labeled_patients.keys())
labels = [1 if key.endswith('_1') else 0 for key in keys]  # 1 = sepsis, 0 = no sepsis

# Split patients (not individual rows) 80/20 so that no patient appears on both
# sides, keeping the proportion of septic patients the same in train and test.
train_keys, test_keys, _, _ = train_test_split(
    keys, labels, test_size=0.2, random_state=42, stratify=labels
)

# Collect the per-patient frames that belong to each side of the split.
train_data_dict = dict((key, labeled_patients[key]) for key in train_keys)
test_data_dict = dict((key, labeled_patients[key]) for key in test_keys)

# Sanity check: how many septic ("_1") patients landed on each side.
train_sepsis = sum(key.endswith('_1') for key in train_data_dict)
test_sepsis = sum(key.endswith('_1') for key in test_data_dict)
print(f"Train SEPSIS: {train_sepsis}, NON-SEPSIS: {len(train_data_dict) - train_sepsis}")
print(f"Test SEPSIS: {test_sepsis}, NON-SEPSIS: {len(test_data_dict) - test_sepsis}")

# Stack the per-patient frames into row-level tables and drop the identifier
# column so it cannot be used as a model feature.
train_df = pd.concat(train_data_dict.values(), ignore_index=True).drop(columns=['patient_id'])
test_df = pd.concat(test_data_dict.values(), ignore_index=True).drop(columns=['patient_id'])

# Train Model

In [None]:
# Separate features and target
X_train = train_df.drop(columns=["SepsisLabel"], errors="ignore")
y_train = train_df["SepsisLabel"]

X_test = test_df.drop(columns=["SepsisLabel"], errors="ignore")
y_test = test_df["SepsisLabel"]

# Count each class by its label value. Unpacking value_counts() would depend on
# its frequency-sorted order (the first entry is whichever class is more common,
# not necessarily label 0) and would raise if one class were missing altogether.
neg_samples, pos_samples = (y_train == 0).sum(), (y_train == 1).sum()
neg_samples_test, pos_samples_test = (y_test == 0).sum(), (y_test == 1).sum()
print(f"Negative samples of Train: {neg_samples}, Positive samples of Train: {pos_samples}")
print(f"Negative samples of Test: {neg_samples_test}, Positive samples of Test: {pos_samples_test}")

# Optuna Hyperparameter Tuning

In [None]:
# from sklearn.metrics import make_scorer
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from optuna.terminator import report_cross_validation_scores
# from optuna.visualization import plot_terminator_improvement
# from plotly.io import show

# def objective(trial):
#     # Define a search space for hyperparameters.
#     params = {
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
#         'gamma': trial.suggest_loguniform('gamma', 1e-8, 1e1),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#         'objective': 'binary:logistic',
#         'eval_metric': 'auc',
#         'scale_pos_weight': neg_samples / pos_samples,
#         'random_state': 42,
#         'use_label_encoder': False
#     }
    
#     # Use stratified 3-fold cross-validation.
#     skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#     clf = xgb.XGBClassifier(**params)
    
#     # Define a scoring function based on F_beta score with beta=4.5.
#     scorer = make_scorer(fbeta_score, beta=4.5)
#     scores = cross_val_score(clf, X_train, y_train, scoring=scorer, cv=skf)
    
#     # Return the mean F_beta score.
#     return np.mean(scores)

# # Create an Optuna study object to maximize the F_beta score.
# study = optuna.create_study(direction='maximize')
# print("Starting hyperparameter tuning with Optuna...")
# study.optimize(objective, n_trials=15)

# fig = plot_terminator_improvement(study, plot_error=True)
# show(fig)

# # Output the best trial.
# print("Best trial:")
# print("  F_beta Score: ", study.best_trial.value)
# print("  Params: ", study.best_trial.params)

# # Extract best parameters and add fixed parameters.
# best_params = study.best_trial.params
# best_params['objective'] = 'binary:logistic'
# best_params['eval_metric'] = 'auc'
# best_params['scale_pos_weight'] = neg_samples / pos_samples
# best_params['random_state'] = 42
# best_params['use_label_encoder'] = False

In [None]:
# Configure the XGBoost classifier. scale_pos_weight is set to the
# negative-to-positive ratio of the training labels to counter class imbalance.
xgb_params = {
    "random_state": 42,
    "objective": 'binary:logistic',
    "eval_metric": "auc",
    "scale_pos_weight": neg_samples / pos_samples,
}
model = xgb.XGBClassifier(**xgb_params)

# The test split is passed as eval_set so the eval metric on it is reported
# during training; no early stopping is configured here.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=1)

# Evaluate Model

In [None]:
# These names are already imported in the notebook's first cell; they are
# repeated here so this cell also runs on its own.
from src.plots.feature_plots import plot_roc_auc, plot_confusion_matrix
from sklearn.metrics import classification_report

# Positive-class probabilities (reused by the threshold search) and the
# classifier's default hard predictions.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# ROC curve first, then the confusion matrix of the default predictions.
plot_roc_auc(model, X_test, y_test)
plot_confusion_matrix(y_test, y_pred, labels=("No Sepsis", "Sepsis"))

# Per-class precision / recall / F1 for the default predictions.
default_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(default_report)

# Choose the Threshold that Maximises the F-beta Score

In [None]:
# Candidate cut-offs 0.00, 0.01, ..., 0.99 and the beta used for the F-beta score.
beta_val = 4.5
thresholds = np.arange(0.0, 1.0, 0.01)

# F-beta of the thresholded probabilities at every candidate cut-off.
# NOTE(review): the cut-off is selected on y_test, the same split the later
# cells report metrics on, so those metrics are optimistic for the chosen threshold.
f_beta_scores = [
    fbeta_score(y_test, (y_proba >= cutoff).astype(int), beta=beta_val)
    for cutoff in thresholds
]

# np.argmax returns the first maximum, i.e. the lowest threshold among ties.
optimal_threshold = thresholds[np.argmax(f_beta_scores)]
print(f"Optimal threshold: {optimal_threshold}, F Beta {beta_val} Score: {max(f_beta_scores)}")

# Re-evaluate Model

In [None]:
# Score the test set again and keep the positive-class column.
proba_matrix = model.predict_proba(X_test)
y_proba = proba_matrix[:, 1]

# Binarise at the tuned cut-off instead of the classifier's default.
y_pred_custom = (y_proba >= optimal_threshold).astype(int)

plot_roc_auc(model, X_test, y_test, optimal_threshold)

# Text summaries for the thresholded predictions: report first, then raw counts.
custom_report = classification_report(y_test, y_pred_custom)
custom_matrix = confusion_matrix(y_test, y_pred_custom)
print(custom_report)
print(custom_matrix)

# Feature Importance

In [None]:
# Pair every feature column with the importance the fitted model assigned to it.
features = X_test.columns
feature_importances = model.feature_importances_

# Rank from most to least important (a sorted copy is built rather than sorting in place).
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the top 100 features.
importance_df.head(100)

# Shap Importance

# Utility Score

# Most Important Features