In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    average_precision_score,
    classification_report
)
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the training CSV into a DataFrame. The bare `df` on the last line renders
# the frame as this cell's output.
data_path = '/content/training_filtered.csv'
df = pd.read_csv(data_path)
df

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.730000,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.420000,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.950000,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.640000,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6432,89615,70642,118,0,,21.171951,1,Caucasian,F,154.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma
6433,46022,2733,118,0,73.0,30.175733,0,Caucasian,M,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
6434,77042,35568,118,0,68.0,,1,Caucasian,F,157.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
6435,828,127649,118,0,30.0,49.001677,0,Hispanic,M,162.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [3]:
# Name of the label column in `df`; the next cell uses it to separate the target `y`
# from the feature matrix `X`.
target_column = "hospital_death"


In [4]:
# Build the feature matrix and the target vector.
# The label and the two record identifiers are removed from the features; adjust the
# identifier names here if the CSV labels them differently.
# NOTE(review): `hospital_id` is not dropped, so it stays in as a feature — confirm that is intended.
id_columns = ["patient_id", 'encounter_id']
X = df.drop(columns=[target_column] + id_columns)
y = df[target_column]


In [5]:
# Preprocessing transformers; they are only constructed here and fitted in later cells.
scaler = StandardScaler()
imputer = SimpleImputer(strategy="mean")               # numeric columns: fill gaps with the column mean
cat_imputer = SimpleImputer(strategy='most_frequent')  # categorical columns: fill gaps with the most frequent value


In [6]:
# Partition the feature columns by dtype: numeric columns are imputed with the mean and
# scaled, everything else is treated as categorical (mode-imputed and one-hot encoded).
numeric_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(exclude="number").columns

In [7]:
X_num = pd.DataFrame(imputer.fit_transform(X[numeric_cols]), columns=numeric_cols) ##impute numeric cols
X_cat = pd.DataFrame(cat_imputer.fit_transform(X[categorical_cols]), columns=categorical_cols)##imput categorial cols

In [8]:
X_num_scaled = pd.DataFrame(scaler.fit_transform(X_num), columns=numeric_cols)# Scale numeric columns

In [9]:
X_cat_encoded = pd.get_dummies(X_cat, drop_first=True)# Encode categorical features

In [10]:
X_processed = pd.concat([X_num_scaled, X_cat_encoded], axis=1) ##combine categorial and numeric

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
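# Untuned baseline models, currently disabled; the active cells below use `model_params` with GridSearchCV.
# models = {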
#     "Logistic Regression": LogisticRegression(max_iter=1000),
#     "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
#     "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
# }

In [12]:
# Hyperparameter search spaces, one grid per model family.
# `model_params` maps a display name to {"model": unfitted estimator, "params": grid};
# the grids below expand to 8, 24 and 36 candidate combinations respectively.
logreg_grid = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
    "solver": ["liblinear", "lbfgs"],
}

forest_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

xgb_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1],
}

model_params = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000),
        "params": logreg_grid,
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": forest_grid,
    },
    "XGBoost": {
        # NOTE(review): `use_label_encoder` appears to be deprecated in recent XGBoost
        # releases — confirm against the installed version before removing it.
        "model": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "params": xgb_grid,
    },
}


In [None]:
# Tune each model with a 5-fold grid search (selected by average precision, i.e. PR AUC),
# then score the refit best estimator once on the held-out test set.
# The tuned estimators and their metrics are kept in `best_estimators` / `results`, so the
# expensive search does not have to be repeated just to compare or reuse the models.
results = []
best_estimators = {}

for name, mp in model_params.items():
    print(f"\n Grid search for {name}...")
    grid = GridSearchCV(mp["model"], mp["params"], cv=5, scoring="average_precision", n_jobs=-1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    best_estimators[name] = best_model

    # Hard labels use the estimator's default decision rule; the positive-class
    # probability is what the threshold-free PR AUC is computed from.
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    # zero_division=0 returns the same 0.0 as the default when a model predicts no
    # positives, but without the undefined-metric warning.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    pr_auc = average_precision_score(y_test, y_prob)

    results.append({
        "Model": name,
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "PR AUC": pr_auc,
        "Best params": grid.best_params_,
    })

    print(f"Best params: {grid.best_params_}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall (Sensitivity): {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")
    print(f"PR AUC: {pr_auc:.3f}")

# Side-by-side comparison table, rendered as the cell output.
results_df = pd.DataFrame(
    results, columns=["Model", "Precision", "Recall", "F1", "PR AUC", "Best params"]
).set_index("Model")
results_df


 Grid search for Logistic Regression...
Best params: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Precision: 0.706
Recall (Sensitivity): 0.188
F1 Score: 0.296
PR AUC: 0.451

 Grid search for Random Forest...


In [None]:
# Disabled: evaluate each untuned baseline model from the commented-out `models` dict (no hyperparameter search).
# for name, model in models.items():
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     y_prob = model.predict_proba(X_test)[:, 1]

#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)  # a.k.a. sensitivity
#     f1 = f1_score(y_test, y_pred)
#     pr_auc = average_precision_score(y_test, y_prob)

#     print(f"\n=== {name} ===")
#     print(f"Precision: {precision:.3f}")
#     print(f"Recall (Sensitivity): {recall:.3f}")
#     print(f"F1 Score: {f1:.3f}")
#     print(f"PR AUC: {pr_auc:.3f}")
