In [1]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset as pandas objects (features, target).
data, target = fetch_california_housing(as_frame=True, return_X_y=True)
# Scale the target by a factor of 100.
# NOTE(review): presumably this converts the target to thousands of dollars —
# confirm against the dataset description.
target = target * 100

In [5]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingRegressor

# Baseline model: gradient boosting with 200 boosting stages, evaluated by
# cross-validation on the raw features.
# random_state is pinned so that re-running the notebook reproduces the same
# scores, in line with the seeded HistGradientBoostingRegressor used further
# down in this notebook.
gradient_boosting = GradientBoostingRegressor(n_estimators=200, random_state=0)
# The scorer is the *negative* mean absolute error, so the values stored under
# "test_score" are <= 0 and must be sign-flipped before being reported.
cv_results_gbdt = cross_validate(
    gradient_boosting,
    data,
    target,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

In [7]:
# Report the cross-validation results of the baseline gradient-boosting model.
# "test_score" holds negated errors, hence the sign flip on the mean.
gbdt_test_scores = cv_results_gbdt["test_score"]
gbdt_fit_times = cv_results_gbdt["fit_time"]
gbdt_score_times = cv_results_gbdt["score_time"]

print("Gradient Boosting Decision Trees")
print(f"Mean absolute error via cross-validation: {-gbdt_test_scores.mean()}+-{gbdt_test_scores.std()}")
print(f"Average fit time: {gbdt_fit_times.mean():.3f} seconds")
print(f"Average score time: {gbdt_score_times.mean():.3f} seconds")

Gradient Boosting Decision Trees
Mean absolute error via cross-validation: 46.41267864527025+-2.912825303108825
Average fit time: 6.553 seconds
Average score time: 0.011 seconds


In [8]:
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# Bin every feature into ordinal codes using quantile-based bin edges, asking
# for 256 bins per feature, then display the transformed array.
discretizer = KBinsDiscretizer(
    strategy="quantile",
    encode="ordinal",
    n_bins=256,
)
data_trans = discretizer.fit_transform(data)
data_trans



array([[249.,  39., 231., ...,  83., 162.,  30.],
       [248.,  19., 203., ...,  28., 161.,  30.],
       [242.,  49., 249., ..., 125., 160.,  29.],
       ...,
       [ 17.,  15., 126., ...,  49., 200.,  82.],
       [ 23.,  16., 136., ...,  29., 200.,  77.],
       [ 53.,  14., 130., ...,  93., 199.,  81.]])

In [9]:
# Number of distinct bin codes actually present in each transformed column.
[np.unique(data_trans[:, column_idx]).size for column_idx in range(data_trans.shape[1])]

[256, 50, 256, 253, 256, 256, 207, 235]

In [11]:
from sklearn.pipeline import make_pipeline 

# Gradient boosting preceded by the quantile discretizer: the binning step is
# part of the estimator handed to cross_validate, not applied to the data up
# front.
# random_state is pinned on the regressor so that re-running the notebook
# reproduces the same scores, in line with the seeded
# HistGradientBoostingRegressor used further down in this notebook.
# NOTE: this rebinds `gradient_boosting` and `cv_results_gbdt`, so the results
# of the un-binned baseline are no longer reachable under those names after
# this cell runs.
gradient_boosting = make_pipeline(
    discretizer, GradientBoostingRegressor(n_estimators=200, random_state=0)
)
# Scores are negative mean absolute errors; flip the sign before reporting.
cv_results_gbdt = cross_validate(
    gradient_boosting,
    data,
    target,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

In [13]:
# Report the cross-validation results of the discretizer + gradient-boosting
# pipeline. "test_score" holds negated errors, hence the sign flip on the mean.
binned_test_scores = cv_results_gbdt["test_score"]
binned_fit_times = cv_results_gbdt["fit_time"]
binned_score_times = cv_results_gbdt["score_time"]

print("Gradient Boosting Decision Trees with KBinsDiscretizer")
print(f"Mean absolute error via cross-validation: {-binned_test_scores.mean()}+-{binned_test_scores.std()}")
print(f"Average fit time: {binned_fit_times.mean():.3f} seconds")
print(f"Average score time: {binned_score_times.mean():.3f} seconds")

Gradient Boosting Decision Trees with KBinsDiscretizer
Mean absolute error via cross-validation: 45.79508666972005+-2.025908263929683
Average fit time: 3.413 seconds
Average score time: 0.007 seconds


In [14]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Histogram-based gradient boosting with 200 iterations and a fixed seed,
# cross-validated with the same scorer and parallelism as the other models.
histogram_gradient_boosting = HistGradientBoostingRegressor(
    max_iter=200,
    random_state=0,
)
# Scores are negative mean absolute errors; flip the sign before reporting.
cv_results_hgbdt = cross_validate(
    estimator=histogram_gradient_boosting,
    X=data,
    y=target,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

In [15]:
# Report the cross-validation results of the histogram gradient-boosting model.
# "test_score" holds negated errors, hence the sign flip on the mean.
hgbdt_test_scores = cv_results_hgbdt["test_score"]
hgbdt_fit_times = cv_results_hgbdt["fit_time"]
hgbdt_score_times = cv_results_hgbdt["score_time"]

print("Histogram Gradient Boosting Decision Tree")
print(f"Mean absolute error via cross-validation: {-hgbdt_test_scores.mean()}+-{hgbdt_test_scores.std()}")
print(f"Average fit time: {hgbdt_fit_times.mean():.3f} seconds")
print(f"Average score time: {hgbdt_score_times.mean():.3f} seconds")

Histogram Gradient Boosting Decision Tree
Mean absolute error via cross-validation: 43.75789403636745+-2.69445299316747
Average fit time: 0.987 seconds
Average score time: 0.015 seconds
