# Diabetes classification with XGBoost

Predict whether a person has **any** form of diabetes (prediabetes or diabetes) using gradient-boosted trees (XGBoost) on the BRFSS 2015 health indicators dataset.

We follow the Phase II progress report methodology: merge prediabetes + diabetes into a single positive class, handle class imbalance, and evaluate using ROC-AUC, PR-AUC, recall, F1, and accuracy.


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
)

from xgboost import XGBClassifier

# Path to the BRFSS 2015 health indicators CSV with the 0/1/2 diabetes label.
DATA_PATH = Path("diabetes_012_health_indicators_BRFSS2015.csv")

df = pd.read_csv(DATA_PATH)

# Binary target: 1 for prediabetes (1) or diabetes (2), 0 otherwise.
df["Diabetes_binary"] = df["Diabetes_012"].gt(0).astype(int)

# Every column except the two label columns is used as a model input.
feature_cols = df.columns.drop(["Diabetes_012", "Diabetes_binary"]).tolist()
X = df[feature_cols].to_numpy()
y = df["Diabetes_binary"].to_numpy()

print(f"Features shape: {X.shape}")
print("Class distribution (0=no diabetes, 1=any diabetes):")
unique, counts = np.unique(y, return_counts=True)
n_samples = len(y)
for cls, cnt in zip(unique, counts):
    share = cnt / n_samples
    print(f"  class {cls}: {cnt} samples ({share:.3f} share)")


In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Ratio of negative to positive training labels, passed to XGBoost as
# scale_pos_weight to counter the class imbalance.
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos
pos_fraction = pos / (neg + pos)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
print(f"Positive class fraction in train: {pos_fraction:.3f}")
print(f"scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")


In [None]:
def evaluate_model(name, y_true, y_proba, threshold=0.5):
    """Print classification metrics for a set of predicted probabilities.

    Accuracy, precision, recall and F1 are computed on the hard labels
    obtained by thresholding ``y_proba``; ROC-AUC and PR-AUC (average
    precision) are computed on the probabilities themselves and therefore
    do not depend on ``threshold``. A confusion matrix and the full
    classification report are printed afterwards.

    Args:
        name: Label used in the printed header.
        y_true: True binary labels (0/1).
        y_proba: Predicted scores for the positive class. Any array-like
            (ndarray, list, tuple, pandas Series) is accepted.
        threshold: Scores greater than or equal to this value are
            predicted as the positive class.

    Returns:
        None. All results are printed.
    """
    # Coerce to an ndarray so that plain lists/tuples support the
    # element-wise comparison below instead of raising TypeError.
    y_proba = np.asarray(y_proba)
    y_pred = (y_proba >= threshold).astype(int)

    # Threshold-dependent metrics; zero_division=0 reports 0 instead of
    # warning when a class is never predicted.
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Ranking metrics use the raw scores, not the thresholded labels.
    roc = roc_auc_score(y_true, y_proba)
    pr_auc = average_precision_score(y_true, y_proba)

    print(f"{name} metrics (threshold={threshold:.2f}):")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}")
    print(f"  Recall   : {rec:.3f}")
    print(f"  F1-score : {f1:.3f}")
    print(f"  ROC-AUC  : {roc:.3f}")
    print(f"  PR-AUC   : {pr_auc:.3f}\n")

    print("Confusion matrix (rows=true, cols=pred):")
    print(confusion_matrix(y_true, y_pred))
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, digits=3))


In [None]:
# Base XGBoost classifier configuration
base_params = dict(
    objective="binary:logistic",
    tree_method="hist",  # efficient for tabular data
    eval_metric="logloss",
    # Keep each fit single-threaded: GridSearchCV below already runs the
    # candidate fits in parallel (n_jobs=-1). Parallelising both levels
    # oversubscribes the CPU and slows the whole search down.
    n_jobs=1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
)

xgb_clf = XGBClassifier(**base_params)

# Hyperparameter grid following the methodology in the report.
# Six parameters with two values each = 64 candidates; with 5-fold CV that
# is 320 fits, plus one final refit of the best candidate on all of X_train.
param_grid = {
    "max_depth": [3, 5],
    "learning_rate": [0.05, 0.1],
    "n_estimators": [300, 500],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "min_child_weight": [1, 5],
}

# Stratified folds keep the class ratio constant across folds; the fixed
# seed makes the fold assignment repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring="roc_auc",  # optimize ROC-AUC as primary metric
    cv=cv,
    n_jobs=-1,  # parallelise across candidate/fold fits
    verbose=1,
)

print("Starting XGBoost grid search (this can take a while on the full dataset)...")

grid.fit(X_train, y_train)

print("Best hyperparameters (by CV ROC-AUC):")
print(grid.best_params_)
print(f"Best cross-validated ROC-AUC: {grid.best_score_:.4f}")

# Best candidate, refit by GridSearchCV on the full training set.
best_model = grid.best_estimator_

# Evaluate on the held-out test set
print()
print("Evaluating best XGBoost model on the held-out test set...")

# Column 1 of predict_proba is the probability of the positive class.
y_test_proba = best_model.predict_proba(X_test)[:, 1]

# First, metrics at the standard 0.5 threshold
evaluate_model("XGBoost (test, threshold=0.5)", y_test, y_test_proba, threshold=0.5)

# Simple threshold tuning on the test set to illustrate the trade-off.
# NOTE: the threshold is selected on the same data it is then evaluated on,
# so the "tuned threshold" metrics below are optimistic. For a reportable
# number, pick the threshold on a validation split and apply it here.
candidate_thresholds = [0.4, 0.5, 0.6, 0.7, 0.8]
best_thr = 0.5
best_f1 = -1.0
for t in candidate_thresholds:
    y_pred_t = (y_test_proba >= t).astype(int)
    f1_t = f1_score(y_test, y_pred_t, zero_division=0)
    # Strict comparison: on ties the lowest threshold wins.
    if f1_t > best_f1:
        best_f1 = f1_t
        best_thr = t

print(f"Best threshold by F1 on the test set (for illustration): {best_thr:.2f}")
evaluate_model("XGBoost (test, tuned threshold)", y_test, y_test_proba, threshold=best_thr)
