## Using and tuning different types of Gradient Boosting algorithms

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import loguniform

from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_boston

from alibi.explainers import ALE, plot_ale
from sklearn.inspection import PartialDependenceDisplay

2023-09-21 16:01:30.987516: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the California housing dataset (fetched over the network when it is not
# already cached locally) and keep handles on its column names.
housing_data = fetch_california_housing()
featName = housing_data.feature_names
responseName = housing_data.target_names
selcFeatName = ["MedInc", "AveOccup", "HouseAge", "AveRooms"]

# Feature matrix as a labelled DataFrame; the target stays a plain array.
X = pd.DataFrame(data=housing_data.data, columns=featName)
y = housing_data.target

# Single frame holding the features and the response side by side.
y_f = pd.DataFrame(data=housing_data.target, columns=responseName)
all_data = pd.concat([X, y_f], axis=1)

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

URLError: <urlopen error [Errno 60] Operation timed out>

In [None]:
# Baseline: gradient boosting with default hyper-parameters (seeded for repeatability).
gb = GradientBoostingRegressor(random_state=0)
gb.fit(X_train, y_train)

# A notebook cell only renders its last expression, so two bare `score(...)`
# calls would silently drop the training score. Return both in one labelled
# object so the train/test gap is visible at a glance.
pd.Series(
    {
        "train": gb.score(X_train, y_train),
        "test": gb.score(X_test, y_test),
    },
    name="score",
)

#### Types of hyper-param search
- Grid Search: GridSearchCV (model, parameter grid, scoring, cv, verbose, refit)
- Randomized Search: RandomizedSearchCV
- Bayesian Search: 
- Open question: I don't know of any way to combine early stopping with hyper-parameter search

In [None]:
# Exhaustive grid search over forest size and tree complexity.
# GridSearchCV tries every combination in `param_grid` (9 x 6 = 54 candidates),
# so it takes `param_grid` and has no `n_iter` / `random_state` arguments; those
# belong to RandomizedSearchCV and raise a TypeError when passed here.
param_grid = {
    "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
}
search_cv = GridSearchCV(
    # The seed lives on the estimator: the search itself is deterministic,
    # the random forest is not.
    RandomForestRegressor(n_jobs=2, random_state=0),
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)
search_cv.fit(X_train, y_train)

# The scorer is the *negated* mean absolute error, so flip the sign to get an
# error where lower is better, then list the candidates from best to worst.
columns = [f"param_{name}" for name in param_grid.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = cv_results["std_test_score"]
cv_results[columns].sort_values(by="mean_test_error")

In [19]:
# Randomized search for gradient boosting: the two tree-related parameters are
# drawn from discrete lists, the learning rate from a log-uniform range.
param_distributions = {
    "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
    "learning_rate": loguniform(0.01, 1),
}
search_cv = RandomizedSearchCV(
    GradientBoostingRegressor(),
    param_distributions=param_distributions,
    scoring="neg_mean_absolute_error",
    n_iter=20,       # number of sampled candidates
    random_state=0,  # fixes which candidates get sampled
    n_jobs=2,
)
search_cv.fit(X_train, y_train)

# Columns to report: one per searched parameter plus the error summary.
columns = [
    *(f"param_{name}" for name in param_distributions),
    "mean_test_error",
    "std_test_error",
]

# The scorer is the negated MAE, so negate it back into an error (lower is better).
cv_results = pd.DataFrame(search_cv.cv_results_).assign(
    mean_test_error=lambda frame: -frame["mean_test_score"],
    std_test_error=lambda frame: frame["std_test_score"],
)
cv_results.loc[:, columns].sort_values(by="mean_test_error")

Unnamed: 0,param_n_estimators,param_max_leaf_nodes,param_learning_rate,mean_test_error,std_test_error
1,200,20,0.160519,0.338966,0.004287
12,200,50,0.110585,0.347918,0.002917
17,500,5,0.771785,0.348007,0.005193
10,200,20,0.109889,0.350046,0.003753
6,500,100,0.709894,0.354575,0.002911
18,10,5,0.637819,0.425357,0.003381
3,500,2,0.0750195,0.434579,0.007046
4,100,5,0.0351004,0.465589,0.005786
19,5,20,0.202432,0.613872,0.00611
8,5,2,0.462636,0.65114,0.00847


In [None]:
#### Early stopping

#### References

- https://inria.github.io/scikit-learn-mooc/python_scripts/ensemble_hyperparameters.html