In [5]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Using cached xgboost-2.1.3-py3-none-macosx_12_0_arm64.whl (1.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.3
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score

# Load the dataset
# Replace 'your_dataset.csv' with the actual path to your dataset
data = pd.read_csv('preprocessed.csv')

In [15]:
data = data.drop(columns='mean_embedding')

In [None]:
# Define the target variable and feature variables
target = 'ExpiredHospital'
features = [col for col in data.columns if col != target]

# Split the data into training and testing sets
X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

lr_preds = lr_model.predict(X_test)
lr_probs = lr_model.predict_proba(X_test)[:, 1]

print("Logistic Regression:")
print(f"Accuracy: {accuracy_score(y_test, lr_preds):.4f}")
print(f"F1 Score: {f1_score(y_test, lr_preds):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, lr_probs):.4f}")
print(classification_report(y_test, lr_preds))

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

xgb_preds = xgb_model.predict(X_test)
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]

print("\nXGBoost:")
print(f"Accuracy: {accuracy_score(y_test, xgb_preds):.4f}")
print(f"F1 Score: {f1_score(y_test, xgb_preds):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, xgb_probs):.4f}")
print(classification_report(y_test, xgb_preds))

Logistic Regression:
Accuracy: 0.8944
F1 Score: 0.2212
ROC AUC: 0.8536
              precision    recall  f1-score   support

         0.0       0.90      0.99      0.94      4448
         1.0       0.60      0.14      0.22       552

    accuracy                           0.89      5000
   macro avg       0.75      0.56      0.58      5000
weighted avg       0.87      0.89      0.86      5000



Parameters: { "use_label_encoder" } are not used.




XGBoost:
Accuracy: 0.9252
F1 Score: 0.5908
ROC AUC: 0.9283
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96      4448
         1.0       0.75      0.49      0.59       552

    accuracy                           0.93      5000
   macro avg       0.84      0.73      0.77      5000
weighted avg       0.92      0.93      0.92      5000



In [17]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
}

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',  # Choose the appropriate metric for your problem
    cv=5,                # 5-fold cross-validation
    verbose=1,           # Verbosity level
    n_jobs=-1            # Use all available CPU cores
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Hyperparameters: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.6}
Best Score: 0.9270426606651663
Test Set Accuracy: 0.9338
