## 1. Load and Prepare Patient Data

In [33]:

import os
import pandas as pd

# Load 100 patient files from each dataset
def load_patients(path, limit=100):
    files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.psv')]
    patient_data = {}
    for file_path in files[:limit]:
        pid = os.path.basename(file_path).split(".")[0]
        df = pd.read_csv(file_path, sep='|')
        patient_data[pid] = df
    return patient_data

patients_A = load_patients('training_setA', 100)
patients_B = load_patients('training_setB', 100)

patient_dict = {**patients_A, **patients_B}
print(f"Loaded {len(patient_dict)} patients")


Loaded 200 patients


## 2. Apply MICE Imputation using `encode_dict_deltas`

In [34]:

import sys
from importlib.machinery import SourceFileLoader

mice_path = 'pipelines_le/mgbm_pipeline/src/models/mice/encode_data.py'
encode_data_module = SourceFileLoader("encode_data", mice_path).load_module()
encode_dict_deltas = encode_data_module.encode_dict_deltas

encoded_dict = encode_dict_deltas(patient_dict)
all_encoded_df = pd.concat(encoded_dict.values(), ignore_index=True)
all_encoded_df.head()


Encoding patients: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 26.59it/s]


Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Hgb_delta1,Hgb_delta2,PTT_delta1,PTT_delta2,WBC_delta1,WBC_delta2,Fibrinogen_delta1,Fibrinogen_delta2,Platelets_delta1,Platelets_delta2
0,80.0,100.0,36.5,121.0,58.0,41.0,13.5,,1.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,76.0,100.0,36.25,113.25,61.0,41.5,12.0,,1.0,25.0,...,0.0,0.0,,0.0,0.0,0.0,,0.0,,0.0
2,80.0,100.0,36.25,132.75,71.5,46.25,12.0,,,,...,,,,,,,,,,
3,78.0,100.0,36.1,103.5,58.0,43.0,12.0,,-3.0,,...,,,,,,,,,,
4,74.0,100.0,36.0,128.75,69.5,44.5,12.5,,-3.0,,...,,,,,,,,,,


## 3. Select Features and Impute Remaining Missing Values

In [35]:

from sklearn.impute import SimpleImputer

# Keep columns with less than 20% missing values
missing_ratios = all_encoded_df.isnull().mean()
valid_features = missing_ratios[missing_ratios < 0.2].index.tolist()

# Drop unwanted columns
for col in ['SepsisLabel', 'SepsisLabel_delta', 'patient_id']:
    if col in valid_features:
        valid_features.remove(col)

X = all_encoded_df[valid_features]
y = all_encoded_df['SepsisLabel']

# Impute remaining missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)


## 4. Train Random Forest Classifier

In [36]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Determine if each patient had sepsis (1) or not (0)
patient_labels = all_encoded_df.groupby("patient_id")["SepsisLabel"].max()

# Stratified split on patient sepsis occurrence
train_pids, test_pids = train_test_split(
    patient_labels.index, test_size=0.2, random_state=42, stratify=patient_labels
)

# Filter entire DataFrame based on patient IDs
train_mask = all_encoded_df['patient_id'].isin(train_pids)
test_mask = all_encoded_df['patient_id'].isin(test_pids)

X_train = X_imputed[train_mask]
X_test = X_imputed[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)


## 5. Evaluate Model Performance

In [37]:

from sklearn.metrics import classification_report, roc_auc_score

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

print("Classification Report:\n")
print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC AUC Score: {roc_auc:.4f}")


Classification Report:

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1536
           1       0.00      0.00      0.00        20

    accuracy                           0.99      1556
   macro avg       0.49      0.50      0.50      1556
weighted avg       0.97      0.99      0.98      1556

ROC AUC Score: 0.9019


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
