In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Fetch the California housing data as pandas objects (features and target).
data, target = fetch_california_housing(as_frame=True, return_X_y=True)
target = target * 100  # rescale the target in k$
# Hold out a test set; the fixed seed keeps the split identical across runs.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=0
)

In [2]:
# The second axis of the feature frame holds one column per feature.
print(f"In this case, n_features={data.shape[1]}")

In this case, n_features=8


In [3]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Candidate hyperparameter values for the random forest; every search
# iteration draws one value per key from these lists.
param_distributions = {
    "max_features": [1, 2, 3, 5, None],
    "max_leaf_nodes": [10, 100, 1000, None],
    "min_samples_leaf": [1, 2, 5, 10, 20, 50, 100],
}

# Seed the forest as well as the search: `random_state` on RandomizedSearchCV
# only fixes which candidates are drawn. An unseeded forest keeps its own
# randomness, so the scores below would change on every run.
search_cv = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=2, random_state=0),
    param_distributions=param_distributions,
    scoring="neg_mean_absolute_error",
    n_iter=10,
    n_jobs=2,
    random_state=0,
)
search_cv.fit(data_train, target_train)

# Keep only the sampled parameters and the error statistics for display.
columns = [f"param_{name}" for name in param_distributions]
columns += ["mean_test_error", "std_test_error"]
# The scorer is a negated MAE, so flip the sign to report a positive error.
cv_results = pd.DataFrame(search_cv.cv_results_).assign(
    mean_test_error=lambda results: -results["mean_test_score"],
    std_test_error=lambda results: results["std_test_score"],
)
cv_results[columns].sort_values(by="mean_test_error")

Unnamed: 0,param_max_features,param_max_leaf_nodes,param_min_samples_leaf,mean_test_error,std_test_error
3,2.0,,2,34.058092,0.470535
0,2.0,1000.0,10,36.765141,0.519856
7,,,20,37.267855,0.401033
4,5.0,100.0,2,39.935879,0.557504
8,,100.0,10,40.485881,0.55297
6,,1000.0,50,40.868152,0.470018
2,1.0,100.0,1,49.765355,1.210293
9,1.0,100.0,2,49.868023,0.56169
5,1.0,,100,54.510465,0.788365
1,3.0,10.0,10,55.232914,0.756751


In [4]:
# The search scores with a negated MAE, so flip the sign to get an error in k$.
neg_test_score = search_cv.score(data_test, target_test)
error = -neg_test_score
print(f"Test error: {error:.2f} k$")

Test error: 34.07 k$


In [5]:
from scipy.stats import loguniform
from sklearn.ensemble import HistGradientBoostingRegressor

# Search space for the gradient-boosting model: two discrete grids plus a
# learning rate drawn log-uniformly between 0.01 and 1.
param_distributions = {
    "max_iter": [3, 10, 30, 100, 300, 1000],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
    "learning_rate": loguniform(0.01, 1),
}

# Seed the estimator as well as the search: `random_state` on
# RandomizedSearchCV only fixes which candidates are drawn. The booster keeps
# its own randomness (e.g. the validation split used when early stopping is
# active), so an unseeded model would give different scores on every run.
search_cv = RandomizedSearchCV(
    HistGradientBoostingRegressor(random_state=0),
    param_distributions=param_distributions,
    scoring="neg_mean_absolute_error",
    n_iter=20,
    n_jobs=2,
    random_state=0,
)
search_cv.fit(data_train, target_train)

# Keep only the sampled parameters and the error statistics for display.
columns = [f"param_{name}" for name in param_distributions]
columns += ["mean_test_error", "std_test_error"]
# The scorer is a negated MAE, so flip the sign to report a positive error.
cv_results = pd.DataFrame(search_cv.cv_results_).assign(
    mean_test_error=lambda results: -results["mean_test_score"],
    std_test_error=lambda results: results["std_test_score"],
)
cv_results[columns].sort_values(by="mean_test_error")

Unnamed: 0,param_max_iter,param_max_leaf_nodes,param_learning_rate,mean_test_error,std_test_error
14,300,100,0.01864,31.048214,0.270048
6,300,20,0.047293,31.890472,0.36834
13,300,10,0.297739,32.590475,0.591019
2,30,50,0.176656,32.645565,0.335198
9,100,20,0.083745,33.102698,0.466412
19,100,10,0.215543,33.231881,0.39874
12,100,20,0.067503,33.623766,0.480144
16,300,5,0.05929,35.768925,0.490158
1,100,5,0.160519,36.42386,0.543631
0,1000,2,0.125207,40.788208,0.579395


In [6]:
# The search scores with a negated MAE, so flip the sign to get an error in k$.
neg_test_score = search_cv.score(data_test, target_test)
error = -neg_test_score
print(f"On average, our HGBT regressor makes an error of {error:.2f} k$")

On average, our HGBT regressor makes an error of 30.50 k$
