# import packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, cross_val_predict, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, precision_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# read data

In [None]:
from google.colab import drive

# Mount Google Drive so the CSV stored under MyDrive becomes readable.
drive.mount('/content/drive')

# Load the whole dataset; low_memory=False makes pandas read the file in a
# single pass instead of chunk-wise.
df = pd.read_csv(
    '/content/drive/MyDrive/sample_data.csv',
    sep=',',
    low_memory=False,
)
df.shape

# preprocess the data

In [None]:
# Encode the features and create the hold-out split.
# `label` is the target column; every other column is treated as a feature.
X = df.drop(columns=['label'])
y = df['label']

# NOTE(review): every feature column is one-hot encoded, numeric ones
# included -- confirm that all features are categorical.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new
# spelling first and fall back for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

X_encoded = one_hot_encoder.fit_transform(X)
# Keep the original row index so the encoded frame stays aligned with `y`.
X = pd.DataFrame(
    X_encoded,
    columns=one_hot_encoder.get_feature_names_out(X.columns),
    index=X.index,
)

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# initialize the models

In [None]:
# Candidate models for the cross-validated comparison, keyed by the name
# that is shown in the ROC plot legend. Insertion order is the plot order.
classifiers = {
    "XGBoost": XGBClassifier(),
    "Decision Tree": DecisionTreeClassifier(
        criterion='entropy',
        random_state=0,
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=10,
        criterion='entropy',
        random_state=0,
    ),
    "K-Nearest Neighbors": KNeighborsClassifier(
        n_neighbors=5,
        metric='minkowski',
        p=2,
    ),
    # Both SVMs enable probability=True so that predict_proba is available.
    "Support Vector Machine": SVC(
        kernel='linear',
        random_state=0,
        probability=True,
    ),
    "Kernel SVM": SVC(
        kernel='rbf',
        random_state=0,
        probability=True,
    ),
    "Naive Bayes": GaussianNB(),
}

# train models

In [None]:
# Cross-validated ROC comparison: for each candidate classifier, fit on
# every training fold, score the held-out fold, and draw the fold-averaged
# ROC curve on one shared figure.
plt.figure(figsize=(8, 6))

# Common FPR grid so that per-fold curves can be averaged point-wise.
mean_fpr = np.linspace(0, 1, 100)
tprs = []  # fold-averaged TPR curve, one entry per classifier
aucs = []  # fold-averaged AUC, one entry per classifier

# NOTE(review): KFold does not preserve class proportions in each fold; if
# the label is imbalanced, StratifiedKFold (already imported) may be safer.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for name, clf in classifiers.items():
    tprs_fold = []
    aucs_fold = []
    for train_idx, test_idx in cv.split(X_train, y_train):
        X_train_cv, X_test_cv = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_train_cv, y_test_cv = y_train.iloc[train_idx], y_train.iloc[test_idx]

        clf.fit(X_train_cv, y_train_cv)

        # Second predict_proba column is used as the score (binary label
        # assumed).
        y_score = clf.predict_proba(X_test_cv)[:, 1]

        fpr, tpr, _ = roc_curve(y_test_cv, y_score)
        roc_auc = auc(fpr, tpr)

        # Resample the fold's curve onto the common grid. Pin the first
        # point to TPR=0 so the averaged curve starts at the origin even
        # when several thresholds share FPR=0.
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0

        tprs_fold.append(interp_tpr)
        aucs_fold.append(roc_auc)

    mean_tpr = np.mean(tprs_fold, axis=0)
    mean_auc = np.mean(aucs_fold)
    std_auc = np.std(aucs_fold)

    # Raw f-string: `\p` is not a valid escape sequence in a normal string
    # literal; the rendered label text is unchanged.
    plt.plot(mean_fpr, mean_tpr, label=rf'{name} (AUC = {mean_auc:.2f} $\pm$ {std_auc:.2f})')

    tprs.append(mean_tpr)
    aucs.append(mean_auc)

# Diagonal reference line of a chance-level classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Different Classifiers (5-fold CV)')
plt.legend(loc='lower right')
plt.show()

# One column per classifier, in the iteration order of `classifiers`.
tprs_df = pd.DataFrame(tprs).transpose()
aucs_df = pd.DataFrame(aucs).transpose()

# tuning the best model (XGBoost) with Optuna

In [None]:
# Install Optuna plus the optuna-integration add-on; presumably the latter
# backs the `optuna.integration` import in the next cell -- confirm.
!pip install optuna
!pip install optuna-integration

In [None]:
import optuna
import optuna.integration
import xgboost as xgb

# Hold-out split used by the tuning and final-evaluation cells. The
# arguments match the earlier train_test_split call, so the same rows land
# in each partition.
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Optuna objective: mean stratified 5-fold CV AUC of an XGBoost model.

    Samples one hyper-parameter configuration from ``trial``, runs
    ``xgb.cv`` on the training split (``X_train_encoded`` / ``y_train``,
    read from the notebook's global scope) and returns the mean test-fold
    AUC after the last boosting round.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters and to report intermediate
        AUC values to the pruning callback.

    Returns
    -------
    float
        Mean test-fold AUC at the final boosting round (to be maximized).
    """
    dtrain = xgb.DMatrix(X_train_encoded, label=y_train)

    # `n_estimators` is the scikit-learn name for the number of boosting
    # rounds. The native `xgb.cv` API ignores it inside `params` and takes
    # the value as `num_boost_round`, so it is sampled under the same name
    # (keeping `study.best_params` usable with XGBClassifier) but passed
    # separately below.
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; the sampled distribution is the same.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        'random_state': 42,
    }

    # Seeded, stratified folds; handed to xgb.cv so every trial is scored
    # on the same 5 splits instead of xgb.cv's default 3 unstratified ones.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Lets Optuna stop unpromising trials based on the running test AUC.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-auc")
    history = xgb.cv(
        params,
        dtrain,
        num_boost_round=n_estimators,
        folds=kf,
        callbacks=[pruning_callback],
    )

    # Last row of the CV history = score after the final boosting round.
    mean_auc = history["test-auc-mean"].values[-1]

    return mean_auc


# Search for the configuration that maximizes the value returned by
# `objective`, using 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Parameters of the best-scoring trial, keyed by the names passed to the
# `trial.suggest_*` calls.
best_params = study.best_params
print("\n \n Best Parameters:", best_params)

# train XGBoost model with best parameters

In [None]:
# Fit the tuned model on the training split only, then evaluate it on the
# held-out test split. Fitting on the full (X, y) would leak the test rows
# into training and inflate every metric reported below.
# `best_params` holds only the sampled parameters, so the seed used during
# tuning is passed explicitly to keep the final fit reproducible.
model = xgb.XGBClassifier(random_state=42, **best_params)
model.fit(X_train_encoded, y_train)

y_pred = model.predict(X_test_encoded)
# Second predict_proba column is used as the score: AUC is a ranking
# metric and needs continuous scores, not hard 0/1 predictions.
y_proba = model.predict_proba(X_test_encoded)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

# Row 0 of the confusion matrix holds the true-negative-class samples:
# [0, 0] = true negatives, [0, 1] = false positives.
true_negatives = conf_matrix[0, 0]
false_positives = conf_matrix[0, 1]
specificity = true_negatives / (true_negatives + false_positives)

print("Test Set Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"AUC: {roc_auc:.4f}")
print("Confusion matrix: ", conf_matrix)

# features ranking and features interactions

In [None]:
# Install the SHAP package imported in the next cell.
!pip install shap

In [None]:
import shap

# Wrap the fitted XGBoost model in a SHAP explainer, then compute
# explanations for the held-out feature rows.
explainer = shap.Explainer(model)
shap_values = explainer(X_test_encoded)

In [None]:
# One row per explained sample, one column per encoded feature.
shap_df = pd.DataFrame(shap_values.values, columns=X_test_encoded.columns)

# Mean absolute SHAP value for each feature.
absolute_mean_shap = shap_df.abs().mean()

# Two-column table: feature name and its mean absolute SHAP value.
absolute_mean_shap_df = (
    absolute_mean_shap
    .rename_axis('feature')
    .reset_index(name='absolute_mean_shap')
)

In [None]:
# Bar-type SHAP summary plot for the held-out rows.
shap.summary_plot(
    shap_values,
    X_test_encoded,
    plot_type='bar',
)

In [None]:
# Dot-type SHAP summary plot for the held-out rows.
shap.summary_plot(
    shap_values,
    X_test_encoded,
    plot_type='dot',
)