# Import Modules

In [1]:
import sys
import os
import pandas as pd

# Import your custom modules. Adjust the module paths as needed.
from src.data.load_data import loadTrainingData
from sklearn.model_selection import train_test_split
from src.features.create_feature_vectors import extract_features_with_expanding_window



# Load Data

In [2]:
# Load settings: each directory is globbed for "*.psv" and `max_files` is handed
# to loadTrainingData once per directory.
max_files = 1000  # Adjust as needed
directories = ['../../training_setA/', '../../training_setB/']
ignored_columns = ('Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime')

# Accumulate every directory's result in one dictionary; dict.update means a key
# seen again in a later directory replaces the earlier entry.
patient_dict = {}
for directory in directories:
    pattern = os.path.join(directory, "*.psv")
    print(f"\nLoading data from: {pattern} with max_files={max_files}")
    # A fresh list is passed on every call, exactly as the literal did before.
    patient_data = loadTrainingData(pattern, max_files, ignore_columns=list(ignored_columns))
    patient_dict.update(patient_data)


Loading data from: ../../training_setA/*.psv with max_files=1000


Loading PSV Files: 100%|██████████████████| 1000/1000 [00:00<00:00, 1158.27it/s]



Loading data from: ../../training_setB/*.psv with max_files=1000


Loading PSV Files: 100%|██████████████████| 1000/1000 [00:00<00:00, 1122.93it/s]


# Create Feature Vectors

In [None]:
# Build the feature table from the loaded patient dictionary. The extractor is the
# project helper imported from src.features.create_feature_vectors; its windowing
# logic is not visible in this notebook.
feature_df = extract_features_with_expanding_window(patient_dict)

(77415, 36)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.it/s]
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.6s0:00<01:16, 25.96it
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.8s0:01<00:48, 40.47it
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    3.9s00:03<00:44, 41.73i
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:    6.9s00:06<00:40, 42.25i
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   11.2s00:11<00:47, 33.00i
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   16.7s00:16<00:35, 38.39i
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:   21.9s00:21<00:26, 42.58i
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:   28.3s00:28<00:21, 40.51
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   37.9s00:37<00:12, 44.24
extracting features with expanding window:  76%|▊| 1528/2000 [00:39<00:11, 40.88

# Save or Load Features

In [None]:
def save_feature_data(feature_df, file_path="feature_data.pkl"):
    """Pickle ``feature_df`` to ``file_path`` and print a confirmation line.

    Args:
        feature_df: Object exposing ``to_pickle`` (a pandas DataFrame in this notebook).
        file_path: Destination of the pickle file; defaults to ``"feature_data.pkl"``.

    Returns:
        None.
    """
    feature_df.to_pickle(file_path)
    confirmation = f"Feature data saved to {file_path}"
    print(confirmation)

def load_feature_data(file_path="feature_data.pkl"):
    """Read a pickled feature table from ``file_path`` and return it.

    A confirmation line is printed after the file has been read.

    Args:
        file_path: Location of the pickle file; defaults to ``"feature_data.pkl"``.

    Returns:
        Whatever object ``pd.read_pickle`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only load files you trust.
    """
    loaded = pd.read_pickle(file_path)
    print(f"Feature data loaded from {file_path}")
    return loaded

# Persist the freshly built features; with no path given this writes to the
# function's default, "feature_data.pkl", relative to the working directory.
save_feature_data(feature_df)

# Add/Remove Features

# Split into Train and Test Sets by Patient ID

In [None]:
# Group rows by patient_id so the split is made per patient: all rows of a patient
# end up together in either the train set or the test set.
patient_groups = {patient_id: group
                  for patient_id, group in feature_df.groupby("patient_id")}

# Re-key every patient as "<patient_id>_<flag>", where flag is "1" when any row of
# that patient has a truthy SepsisLabel and "0" otherwise.
labeled_patients = {}
for patient_id, df in patient_groups.items():
    # Check if the patient ever had sepsis
    sepsis_label = "1" if df["SepsisLabel"].any() else "0"
    new_key = f"{patient_id}_{sepsis_label}"
    labeled_patients[new_key] = df

# Optional: Print counts to verify split counts
sepsis_count = sum(1 for key in labeled_patients if key.endswith('_1'))
nonsepsis_count = sum(1 for key in labeled_patients if key.endswith('_0'))
print(f"Number of SEPSIS patients: {sepsis_count}")
print(f"Number of NON-SEPSIS patients: {nonsepsis_count}")

# Now, create a list of keys and a matching list of binary labels for stratification.
# The suffix tested here must be the "_1" suffix written above; testing for any other
# suffix (e.g. "_sepsis") labels every patient 0 and silently disables stratification.
keys = list(labeled_patients.keys())
labels = [1 if key.endswith('_1') else 0 for key in keys]  # 1 = sepsis, 0 = no sepsis

# Split the keys into train and test sets while maintaining the sepsis proportion.
# random_state is fixed so the same patients land in each split on every run.
train_keys, test_keys, _, _ = train_test_split(
    keys, labels, test_size=0.2, random_state=42, stratify=labels
)

# Build train and test dictionaries from the split keys.
train_data_dict = {key: labeled_patients[key] for key in train_keys}
test_data_dict = {key: labeled_patients[key] for key in test_keys}

# Optional: Verify the stratification in your splits.
train_sepsis = sum(1 for key in train_data_dict if key.endswith('_1'))
test_sepsis = sum(1 for key in test_data_dict if key.endswith('_1'))
print(f"Train SEPSIS: {train_sepsis}, NON-SEPSIS: {len(train_data_dict) - train_sepsis}")
print(f"Test SEPSIS: {test_sepsis}, NON-SEPSIS: {len(test_data_dict) - test_sepsis}")

# Concatenate the per-patient frames back into one DataFrame per split.
train_df = pd.concat(train_data_dict.values(), ignore_index=True)
test_df = pd.concat(test_data_dict.values(), ignore_index=True)

# Train Model

# Evaluate Model

# Maximise Threshold

# Re-evaluate Model

# Utility Score