In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# 1 Load dataset

df = pd.read_csv("Dataset/dataset-uci.csv")

target_col = "Gallstone Status"

X = df.drop(columns=[target_col])
y = df[target_col]

print("X shape:", X.shape)
print("y distribution:")
print(y.value_counts())

X shape: (319, 38)
y distribution:
Gallstone Status
0    161
1    158
Name: count, dtype: int64


In [3]:
# 2 Stratified to keep 50-50 balance

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (223, 38)
Test size: (96, 38)


In [4]:
#3 Model Evaluation Code

def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    # Some models (SVM) need probability estimates enabled
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)[:, 1]
    else:
        # fallback using decision_function
        try:
            proba = model.decision_function(X_test)
        except:
            proba = np.zeros_like(preds)
    
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds),
        "Recall": recall_score(y_test, preds),
        "F1": f1_score(y_test, preds),
        "ROC-AUC": roc_auc_score(y_test, proba)
    }

In [5]:
# 4 Model Definition and Hyperparameters

models = []

# Scaled models
scaled_models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "KNN": KNeighborsClassifier(),
    "SVM (Linear)": SVC(kernel="linear", probability=True),
    "SVM (RBF)": SVC(kernel="rbf", probability=True),
    "MLP Neural Network": MLPClassifier(hidden_layer_sizes=(32,16), max_iter=600)
}

# Tree & ensemble models
tree_models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(
        n_estimators=300, 
        learning_rate=0.05,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss"
    )
}

In [6]:
# 5 Results

results = []

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scaled models
for name, model in scaled_models.items():
    res = evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)
    results.append(res)

# Tree-based models
for name, model in tree_models.items():
    res = evaluate_model(name, model, X_train, X_test, y_train, y_test)
    results.append(res)

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC-AUC
0,Logistic Regression,0.78125,0.8,0.75,0.774194,0.884115
1,KNN,0.65625,0.674419,0.604167,0.637363,0.705512
2,SVM (Linear),0.833333,0.863636,0.791667,0.826087,0.914497
3,SVM (RBF),0.760417,0.777778,0.729167,0.752688,0.873264
4,MLP Neural Network,0.760417,0.790698,0.708333,0.747253,0.874132
5,Decision Tree,0.739583,0.744681,0.729167,0.736842,0.739583
6,Random Forest,0.8125,0.8,0.833333,0.816327,0.894314
7,Gradient Boosting,0.791667,0.791667,0.791667,0.791667,0.870226
8,XGBoost,0.84375,0.836735,0.854167,0.845361,0.902778


In [7]:
# 6 Rank Results

results_df.sort_values("Accuracy", ascending=False).reset_index(drop=True)  

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC-AUC
0,XGBoost,0.84375,0.836735,0.854167,0.845361,0.902778
1,SVM (Linear),0.833333,0.863636,0.791667,0.826087,0.914497
2,Random Forest,0.8125,0.8,0.833333,0.816327,0.894314
3,Gradient Boosting,0.791667,0.791667,0.791667,0.791667,0.870226
4,Logistic Regression,0.78125,0.8,0.75,0.774194,0.884115
5,SVM (RBF),0.760417,0.777778,0.729167,0.752688,0.873264
6,MLP Neural Network,0.760417,0.790698,0.708333,0.747253,0.874132
7,Decision Tree,0.739583,0.744681,0.729167,0.736842,0.739583
8,KNN,0.65625,0.674419,0.604167,0.637363,0.705512
