In [8]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate

In [9]:
# 1 Load dataset

df = pd.read_csv("Dataset/dataset-uci.csv")
X = df.drop(columns=["Gallstone Status"])
y = df["Gallstone Status"]

In [10]:
# 2 Define 10-fold Cross Validation

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def cv_model(name, model, scaled=False):
    if scaled:
        pipe = Pipeline([("scaler", StandardScaler()), ("model", model)])
        estimator = pipe
    else:
        estimator = model

    scoring = {
        "accuracy": "accuracy",
        "precision": "precision",
        "recall": "recall",
        "f1": "f1",
        "roc_auc": "roc_auc"
    }

    scores = cross_validate(
        estimator, X, y,
        cv=cv,
        scoring=scoring,
        return_train_score=False
    )

    return {
        "Model": name,
        "Accuracy Mean": scores["test_accuracy"].mean(),
        "Accuracy Std": scores["test_accuracy"].std(),
        "Precision Mean": scores["test_precision"].mean(),
        "Recall Mean": scores["test_recall"].mean(),
        "F1 Mean": scores["test_f1"].mean(),
        "AUC Mean": scores["test_roc_auc"].mean(),
        "AUC Std": scores["test_roc_auc"].std()
    }

In [11]:
# 3 Six Models to Compare Based on 10-fold Cross Validation

results = []

results.append(cv_model("Logistic Regression", LogisticRegression(max_iter=300), scaled=True))
results.append(cv_model("SVM (Linear)", SVC(kernel="linear", probability=True), scaled=True))
results.append(cv_model("MLP Neural Network", MLPClassifier(hidden_layer_sizes=(32,16), max_iter=600), scaled=True))

results.append(cv_model("Random Forest", RandomForestClassifier(n_estimators=300)))
results.append(cv_model("Gradient Boosting", GradientBoostingClassifier()))
results.append(cv_model("XGBoost", XGBClassifier(
    n_estimators=300, learning_rate=0.05,
    max_depth=4, subsample=0.9, colsample_bytree=0.9,
    eval_metric="logloss"
)))

In [12]:
# 4 Models Ranked by Accuracy

results_df = pd.DataFrame(results)
results_df.sort_values("Accuracy Mean", ascending=False)

Unnamed: 0,Model,Accuracy Mean,Accuracy Std,Precision Mean,Recall Mean,F1 Mean,AUC Mean,AUC Std
1,SVM (Linear),0.808871,0.087594,0.84093,0.76,0.794853,0.882907,0.060758
3,Random Forest,0.796169,0.058272,0.809883,0.779583,0.788645,0.858825,0.05922
0,Logistic Regression,0.78377,0.089805,0.826988,0.715417,0.763403,0.86046,0.064797
5,XGBoost,0.783669,0.055013,0.802843,0.77375,0.778995,0.857255,0.056884
2,MLP Neural Network,0.774496,0.06321,0.78146,0.760417,0.765661,0.842521,0.070628
4,Gradient Boosting,0.767843,0.066589,0.792643,0.735833,0.757129,0.857914,0.051992
