In [16]:
!pip -q install scikit-learn matplotlib pandas numpy

# --- Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.experimental import enable_hist_gradient_boosting  # no-op in recent sklearn, safe to keep
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
from scipy.stats import randint, loguniform
import joblib




In [17]:
# --- 1) Load a sample dataset (Iris) ---
# Pull the feature matrix and labels out of the loaded dataset object, together
# with the names used later for plot ticks and report rows.
iris = datasets.load_iris()
X, y = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

In [18]:
# --- 2) Train/test split ---
# A quarter of the rows is held out for the final evaluation. Stratifying on y
# keeps the class proportions the same in both parts; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.25,
)

In [19]:
# --- 3) Baseline HistGradientBoosting ---
# Reference model with hand-picked settings, fitted before any tuning so the
# tuned model has something to be compared against.
hgb_base = HistGradientBoostingClassifier(
    random_state=42,
    # Boosting schedule
    learning_rate=0.1,
    max_iter=300,             # upper bound on boosting rounds (like n_estimators)
    early_stopping=True,
    validation_fraction=0.1,  # share of the training data set aside for early stopping
    # Tree shape and regularization
    max_depth=None,           # no depth cap; 3-10 is a reasonable range if regularization is needed
    max_leaf_nodes=31,        # typical default
    min_samples_leaf=20,
    l2_regularization=0.0,
).fit(X_train, y_train)
y_pred_base = hgb_base.predict(X_test)

print("Baseline Accuracy:", accuracy_score(y_test, y_pred_base))
# Heading and report are separated by a newline, exactly as two consecutive
# print calls would emit them.
print(
    "\nBaseline Classification Report:",
    classification_report(y_test, y_pred_base, target_names=target_names),
    sep="\n",
)

Baseline Accuracy: 0.9210526315789473

Baseline Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.92      0.89        13
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38



In [None]:
# --- 4) Hyperparameter tuning (RandomizedSearchCV) ---
# Search space: learning rate and L2 penalty are drawn on a log scale (common
# for boosting); the integer ranges use scipy's randint, whose upper bound is
# exclusive.
param_dist = dict(
    learning_rate=loguniform(1e-3, 3e-1),
    max_depth=[None, 3, 5, 7, 10],
    max_leaf_nodes=randint(15, 65),
    min_samples_leaf=randint(1, 30),
    l2_regularization=loguniform(1e-10, 1e-2),
    max_iter=randint(150, 501),   # boosting rounds
)

# Template estimator; every sampled configuration is applied on top of it.
hgb = HistGradientBoostingClassifier(
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
)

rand = RandomizedSearchCV(
    hgb,
    param_dist,
    n_iter=40,               # raise for a more thorough search
    cv=3,                    # kept small for speed
    scoring="accuracy",
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
rand.fit(X_train, y_train)

print("\nBest Params:", rand.best_params_)
print("Best CV Accuracy:", rand.best_score_)

In [None]:
# --- 5) Evaluate best model ---
# Take the estimator selected by the randomized search and score it on the
# held-out test split.
best_model = rand.best_estimator_
y_pred = best_model.predict(X_test)

print("\nTest Accuracy (Best Model):", accuracy_score(y_test, y_pred))
# Heading and report are separated by a newline, exactly as two consecutive
# print calls would emit them.
print(
    "\nClassification Report (Best Model):",
    classification_report(y_test, y_pred, target_names=target_names),
    sep="\n",
)

# Confusion matrix of true vs. predicted classes, labelled with the class names.
disp = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    display_labels=target_names,
    xticks_rotation=45,
)
plt.title("Confusion Matrix - HistGradientBoosting (Best Model)")
plt.tight_layout()
plt.show()

In [None]:
# --- 6) Feature importance (permutation) ---
# Permutation importance only needs predictions, so it works even when the
# model does not expose tree-based importances cleanly.
perm = permutation_importance(
    best_model,
    X_test,
    y_test,
    n_repeats=10,
    n_jobs=-1,
    random_state=42,
)
imp = perm.importances_mean
indices = np.argsort(imp)  # ascending order, so the most important feature is drawn last

# One horizontal bar per feature, labelled with the matching feature name.
bar_positions = range(len(indices))
plt.figure(figsize=(6, 4.5))
plt.barh(bar_positions, imp[indices])
plt.yticks(bar_positions, [feature_names[idx] for idx in indices])
plt.xlabel("Mean Permutation Importance (Δ accuracy)")
plt.title("Permutation Feature Importances")
plt.tight_layout()
plt.show()

# --- 7) Save the trained model ---
# Persist the tuned estimator to disk with joblib.
joblib.dump(best_model, "hgb_best_model.joblib")
print("\nSaved best model to hgb_best_model.joblib")

In [None]:
# --- 8) Predict on a new sample (match feature order!) ---
# For Iris: [sepal length, sepal width, petal length, petal width]
sample = np.array([[5.4, 3.9, 1.7, 0.4]])  # 2-D input: a single row holding one sample
pred = best_model.predict(sample)[0]
proba = best_model.predict_proba(sample)[0]
print(f"\nNew sample predicted class: {target_names[pred]}")

# Convert names and probabilities to built-in str/float before printing.
# Iterating the NumPy arrays directly yields np.str_ / np.float64 scalars, which
# NumPy >= 2.0 prints as "np.str_('setosa'): np.float64(0.999)" inside a dict.
class_probabilities = dict(zip(map(str, target_names), np.round(proba, 3).tolist()))
print("Class probabilities:", class_probabilities)