## Using different types of Gradient Boosting algorithms and their tuning

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import loguniform

from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_diabetes

In [2]:
# Load the California housing dataset and keep its column metadata at hand.
housing_data = fetch_california_housing()
featName = housing_data.feature_names
responseName = housing_data.target_names
# Hand-picked subset of feature names (defined here; not applied to X in this cell).
selcFeatName = ["MedInc", "AveOccup", "HouseAge", "AveRooms"]

# Features as a labelled DataFrame; target kept as the raw array.
X = pd.DataFrame(data=housing_data.data, columns=featName)
y = housing_data.target

# One frame holding the features side by side with the target column.
y_f = pd.DataFrame(data=y, columns=responseName)
all_data = pd.concat(objs=[X, y_f], axis="columns")

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)

In [7]:
# Scorer name reused by the cross-validated searches in the cells below.
scoring = "neg_root_mean_squared_error"

# Baseline: gradient boosting with default hyper-parameters and a fixed seed.
# `fit` returns the estimator itself, so construction and fitting can be chained.
clf = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

# Print the estimator's default score (R^2 for scikit-learn regressors):
# train split on the first line, test split on the second.
print(clf.score(X_train, y_train), clf.score(X_test, y_test), sep="\n")

0.806049178950939
0.7809004568284745


#### Types of hyper-param search
- Grid Search: GridSearchCV (model, parameter grid, scoring, cv, verbose, refit)
- Randomized Search: RandomizedSearchCV
- Bayesian Search: e.g. `BayesSearchCV` from scikit-optimize (not used in this notebook)
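- Open question: I don't know of a way to combine early stopping with hyper-parameter search (TODO: check whether the `n_iter_no_change` option of `GradientBoostingRegressor` covers this)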

#### Grid Search CV ####

In [10]:

# Candidate values for the exhaustive search (4 x 3 x 4 = 48 combinations).
param_grid = dict(
    n_estimators=[5, 10, 50, 100],
    max_leaf_nodes=[2, 5, 10],
    learning_rate=[0.01, 0.1, 0.5, 1],
)
search_cv = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=scoring)
search_cv.fit(X_train, y_train)

# The scores come from a negated-error scorer, so the mean is sign-flipped to
# read as an error (lower is better); the spread is carried over unchanged.
cv_results = pd.DataFrame(search_cv.cv_results_).assign(
    mean_test_error=lambda frame: -frame["mean_test_score"],
    std_test_error=lambda frame: frame["std_test_score"],
)

# Columns to display: one per searched parameter, followed by the error summary.
columns = ["param_" + name for name in param_grid] + ["mean_test_error", "std_test_error"]
cv_results.sort_values(by="mean_test_error")[columns]

Unnamed: 0,param_n_estimators,param_max_leaf_nodes,param_learning_rate,mean_test_error,std_test_error
35,100,10,0.5,0.501955,0.011639
31,100,5,0.5,0.507891,0.01174
34,50,10,0.5,0.516581,0.009251
30,50,5,0.5,0.52516,0.011947
43,100,5,1.0,0.53167,0.008738
23,100,10,0.1,0.535901,0.010088
46,50,10,1.0,0.536379,0.010992
42,50,5,1.0,0.540155,0.005852
47,100,10,1.0,0.542493,0.009193
19,100,5,0.1,0.549276,0.009547


#### Random Search CV ####

In [11]:
# Preview ten evenly spaced values between 30 and 100 (both endpoints included).
np.linspace(start=30, stop=100, num=10)

array([ 30.        ,  37.77777778,  45.55555556,  53.33333333,
        61.11111111,  68.88888889,  76.66666667,  84.44444444,
        92.22222222, 100.        ])

In [8]:
# Search space: integer candidates for the number and depth of trees, a fixed
# list of leaf-node caps, and a log-uniform distribution over the learning rate.
param_distributions = dict(
    n_estimators=list(map(int, np.linspace(30, 100, 10))),
    max_depth=list(map(int, np.linspace(3, 7, 5))),
    max_leaf_nodes=[2, 5, 10, 20, 50, 100],
    learning_rate=loguniform(0.01, 1),
)

# Draw 20 parameter settings; the fixed seed makes the draws repeatable.
search_cv = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_distributions,
    n_iter=20,
    random_state=0,
    scoring=scoring,
)
search_cv.fit(X_train, y_train)

# Tabulate the results; the mean score is sign-flipped so it reads as an error.
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results["mean_test_error"] = cv_results["mean_test_score"].mul(-1)
cv_results["std_test_error"] = cv_results["std_test_score"]

# Show one column per sampled parameter plus the error summary, best first.
columns = [
    *(f"param_{key}" for key in param_distributions),
    "mean_test_error",
    "std_test_error",
]
cv_results.loc[:, columns].sort_values("mean_test_error")

Unnamed: 0,param_n_estimators,param_max_depth,param_max_leaf_nodes,param_learning_rate,mean_test_error,std_test_error
16,53,7,50,0.185587,0.482604,0.011488
1,45,6,100,0.494884,0.518105,0.009252
3,68,3,5,0.383222,0.518334,0.006937
4,61,3,5,0.470065,0.518543,0.006285
9,84,6,10,0.088553,0.522896,0.009986
13,53,4,5,0.637819,0.524289,0.008463
12,92,4,10,0.771785,0.528699,0.012867
6,53,3,5,0.906226,0.529127,0.010128
14,30,6,10,0.202432,0.534943,0.010453
0,53,3,20,0.125207,0.555047,0.009095


#### References

- https://inria.github.io/scikit-learn-mooc/python_scripts/ensemble_hyperparameters.html