In [5]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
import joblib

# --------------------------
# Configuration
# --------------------------
# Everything a reader might want to tune lives here so the rest of the
# cell contains no magic values.
DATA_PATH = "diabetes (2).csv"
MODEL_PATH = "xgb_diabetes_model.pkl"
TARGET_COL = "Outcome"
SEED = 42          # shared by the split and the booster for reproducibility
TEST_SIZE = 0.2    # fraction of rows held out for the final evaluation
CV_FOLDS = 3       # folds used inside the grid search

# --------------------------
# Load dataset
# --------------------------
df = pd.read_csv(DATA_PATH)

# Features & target: every column except the target is used as a feature.
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]

# NOTE(review): a random hold-out split assumes rows are independent. If the
# CSV contains duplicated rows, copies can land on both sides of the split
# and inflate the test metrics -- consider checking `df.duplicated().sum()`
# before trusting the scores below. TODO confirm against the data.

# --------------------------
# Train-test split
# --------------------------
# `stratify=y` keeps the class ratio of the target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# --------------------------
# Base XGBoost model
# --------------------------
# `use_label_encoder` is intentionally NOT passed: the installed XGBoost
# reports it as an unused parameter and emits a warning on every fit
# (once per grid-search fit). Dropping it does not change the model.
xgb_clf = xgb.XGBClassifier(
    eval_metric="logloss",
    random_state=SEED
)

# --------------------------
# Parameter grid for tuning
# --------------------------
# 3 * 3 * 3 * 2 * 2 = 108 candidate combinations.
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

# --------------------------
# Grid search (3-fold CV)
# --------------------------
# Candidates are ranked by ROC AUC; `n_jobs=-1` runs the fits in parallel
# on all available cores.
grid = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=CV_FOLDS,
    verbose=1,
    n_jobs=-1
)

# Only the training portion is seen by the search; the test set stays
# untouched until the final evaluation.
grid.fit(X_train, y_train)

# --------------------------
# Best model
# --------------------------
# With GridSearchCV's default `refit=True`, this estimator has already been
# re-trained on all of X_train using the best parameter combination.
best_xgb = grid.best_estimator_

# --------------------------
# Predictions
# --------------------------
y_pred = best_xgb.predict(X_test)               # hard class labels
y_prob = best_xgb.predict_proba(X_test)[:, 1]   # probability of class 1

# --------------------------
# Evaluation metrics
# --------------------------
# Accuracy uses the hard labels; ROC AUC needs the positive-class scores.
acc_xgb = accuracy_score(y_test, y_pred)
roc_xgb = roc_auc_score(y_test, y_prob)

print("Best Parameters:", grid.best_params_)
print("XGBoost Accuracy:", round(acc_xgb, 4))
print("XGBoost ROC AUC:", round(roc_xgb, 4))

# --------------------------
# Export the best model
# --------------------------
# The dump is a pickle-based file: only load it back from a trusted source.
joblib.dump(best_xgb, MODEL_PATH)
print(f"Model saved as '{MODEL_PATH}'")

Fitting 3 folds for each of 108 candidates, totalling 324 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 300, 'subsample': 1.0}
XGBoost Accuracy: 0.985
XGBoost ROC AUC: 0.9908
Model saved as 'xgb_diabetes_model.pkl'
