In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Read the dataset and separate the label column from the predictors.
file_path = "diabetes_dataset.csv"
df = pd.read_csv(file_path)
y = df['class']
X = df.drop(columns=['class'])

# Hold out 20% of the rows for testing; stratify so both splits keep the
# class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training split only, then apply the same
# transformation to the test split so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The three classifiers under comparison, all seeded for reproducibility.
# probability=True is required for SVC to expose predict_proba.
svm_model = SVC(probability=True, random_state=42)
lr_model = LogisticRegression(random_state=42, max_iter=200000)
xgb_model = XGBClassifier(
    eval_metric="logloss",
    random_state=42,
    max_depth=3,
    n_estimators=50,
)

# Fit every classifier on the scaled training data.
for _estimator in (svm_model, lr_model, xgb_model):
    _estimator.fit(X_train, y_train)

# Second column of predict_proba, used below as the positive-class score.
svm_probs, lr_probs, xgb_probs = (
    fitted.predict_proba(X_test)[:, 1]
    for fitted in (svm_model, lr_model, xgb_model)
)

# KS statistic calculation function
def ks_stat(y_true, probs):
    """Return the Kolmogorov-Smirnov statistic separating events from non-events.

    The statistic is the largest absolute gap between the cumulative
    distribution of scores for rows labelled 1 (events) and rows labelled 0
    (non-events).

    Args:
        y_true: Sequence of labels; rows equal to 1 count as events and rows
            equal to 0 count as non-events.
        probs: Sequence of scores, one per label.

    Returns:
        The KS statistic as a float in [0, 1], or NaN when there is no event
        or no non-event to compare (including empty input).
    """
    frame = pd.DataFrame({"y_true": y_true, "probs": probs}).sort_values("probs")
    labels = frame["y_true"].to_numpy()
    scores = frame["probs"].to_numpy()

    is_event = labels == 1
    is_non_event = labels == 0
    total_event = int(is_event.sum())
    total_non_event = int(is_non_event.sum())
    if total_event == 0 or total_non_event == 0:
        # One of the two distributions is empty, so the gap is undefined.
        return float("nan")

    event_rate = np.cumsum(is_event) / total_event
    non_event_rate = np.cumsum(is_non_event) / total_non_event

    # Rows sharing the same score cannot be separated by any threshold, so the
    # cumulative distributions are only comparable once a whole run of equal
    # scores has been consumed. Comparing row by row inside a tie would make
    # the result depend on the arbitrary order of the tied rows and could
    # report separation where there is none.
    tie_end = np.append(scores[1:] != scores[:-1], True)
    gaps = np.abs(event_rate - non_event_rate)
    return float(gaps[tie_end].max())

# KS statistic of each model's positive-class scores on the test split.
svm_ks, lr_ks, xgb_ks = (
    ks_stat(y_test, scores) for scores in (svm_probs, lr_probs, xgb_probs)
)

# Collect accuracy, AUC and KS for every model, keyed by display name.
results = {}
for _label, _fitted, _scores, _ks in (
    ("SVM", svm_model, svm_probs, svm_ks),
    ("Logistic Regression", lr_model, lr_probs, lr_ks),
    ("XGBoost", xgb_model, xgb_probs, xgb_ks),
):
    results[_label] = {
        "Accuracy": accuracy_score(y_test, _fitted.predict(X_test)),
        "AUC": roc_auc_score(y_test, _scores),
        "KS": _ks,
    }

# Only the logistic regression entry carries a per-class text report; it is
# stored alongside the metrics but not printed below.
results["Logistic Regression"]["Classification Report"] = classification_report(
    y_test, lr_model.predict(X_test), zero_division=0
)

# Emit one summary per model; the trailing "\n" argument together with
# print's own line ending leaves two blank lines after each summary.
for model, metrics in results.items():
    print(
        f"=== {model} ===",
        f"Accuracy: {metrics['Accuracy']:.4f}",
        f"AUC: {metrics['AUC']:.4f}",
        f"KS Statistic: {metrics['KS']:.4f}",
        "\n",
        sep="\n",
    )

=== SVM ===
Accuracy: 0.7532
AUC: 0.7924
KS Statistic: 0.4833


=== Logistic Regression ===
Accuracy: 0.7143
AUC: 0.8230
KS Statistic: 0.5419


=== XGBoost ===
Accuracy: 0.7597
AUC: 0.8213
KS Statistic: 0.5519


