In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

## Prepare data for training

In [2]:
data_df = pd.read_csv('preprocessed_data.csv')
data_df

Unnamed: 0,Patient,Age,Sex,ROSC,OHCA,VFib,TTM,Outcome,CPC,0,...,890,891,892,893,894,895,896,897,898,899
0,ICARE_0284,53.0,Male,,True,True,33.0,Good,1,41136.0,...,2323.499419,1761.118277,2520.244034,2284.636104,2401.477446,1956.090654,1956.620355,2111.938459,2146.342639,2042.268531
1,ICARE_0286,85.0,Female,7.0,False,False,,Good,1,21386.0,...,1717.566899,1696.593390,1542.500997,1832.733559,1948.641961,1365.186974,1765.238679,2115.962946,2365.949837,1962.603546
2,ICARE_0296,48.0,Male,,True,True,36.0,Good,1,174588.0,...,641.080210,642.541277,488.454049,528.212869,405.561417,426.595821,391.946972,487.054377,660.326713,424.147856
3,ICARE_0299,45.0,Male,,True,True,33.0,Good,1,37668.0,...,1737.098619,1721.455626,2139.310922,2011.260680,1439.091972,1861.282457,2058.062771,2035.698380,1954.922482,1648.370151
4,ICARE_0303,51.0,Male,24.0,True,True,33.0,Good,1,101177.0,...,65375.045069,55764.893648,104130.086229,130710.873908,209894.798272,48588.876959,72681.417096,53985.366406,62553.147813,62974.861117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,ICARE_1016,87.0,Male,7.0,True,False,33.0,Poor,5,217710.0,...,3194.937052,3426.623506,2187.660292,3008.751281,1249.407402,1195.003110,529.097266,896.516031,636.795797,443.510157
603,ICARE_1017,26.0,Male,52.0,True,False,,Poor,5,20673.0,...,2316.011901,1382.017209,1335.365537,1497.886789,996.647316,1129.379420,1444.935750,750.279314,1316.333410,1752.069081
604,ICARE_1018,63.0,Male,,True,True,36.0,Poor,5,95755.0,...,303.691258,211.723734,499.036674,466.082978,569.939237,43.018710,279.049945,379.680030,706.629473,358.345057
605,ICARE_1019,72.0,Male,,True,True,36.0,Good,1,12644.0,...,551.074899,350.007848,548.862559,585.266452,415.641696,398.751891,569.851070,545.500934,313.270523,222.972065


In [3]:
data_df.isna().sum()

Patient      0
Age          1
Sex          3
ROSC       304
OHCA        41
          ... 
895          0
896          0
897          0
898          0
899          0
Length: 909, dtype: int64

In [4]:
data_df.drop(['Patient', 'ROSC', 'CPC'], axis=1, inplace=True)

In [5]:
from sklearn.impute import KNNImputer

# Fill missing values in categorical columns with mode
categorical_columns = ['Sex', 'OHCA', 'VFib']
for col in categorical_columns:
    data_df[col].fillna(data_df[col].mode().iloc[0], inplace=True)

imputer = KNNImputer(n_neighbors=5)
data_df['TTM'] = imputer.fit_transform(data_df[['TTM']])
data_df['Age'] = imputer.fit_transform(data_df[['Age']])

In [6]:
data_df.shape

(607, 906)

In [7]:
X = data_df.drop(['Outcome'], axis=1)
X = pd.get_dummies(X, drop_first=True, dtype=int)
y = data_df['Outcome']

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_enc = label_encoder.fit_transform(y)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(485, 905) (122, 905) (485,) (122,)


## XGBoost Classifier

In [10]:
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV

clf = XGBClassifier(objective='binary:logistic', random_state=42)

param_dist = {
    'learning_rate': uniform(0.01, 0.1),
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
}

random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

In [11]:
import joblib

best_model = XGBClassifier(**random_search.best_params_, random_state=42)
best_model.fit(X_train, y_train)
model_filename = 'best_xgb_model.joblib'
joblib.dump(best_model, model_filename)

['best_xgb_model.joblib']

In [12]:
y_pred = best_model.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.66      0.55      0.60        42
           1       0.78      0.85      0.81        80

    accuracy                           0.75       122
   macro avg       0.72      0.70      0.71       122
weighted avg       0.74      0.75      0.74       122

