In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit

In [2]:
# Seed NumPy's legacy global random number generator so reruns are repeatable.
np.random.seed(seed=306)

In [3]:
# Cross-validation scheme reused for every model evaluated below:
# 10 shuffled splits, each holding out 20% of the rows, with a fixed seed.
cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Load the California housing features and target.
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)

# Rescale the target by a factor of 100; the error reports further down
# print the resulting values with a "k" suffix.
labels = labels * 100

# First split: combined training data vs. held-out test data.
com_train_features, test_features, com_train_labels, test_labels = train_test_split(
    features,
    labels,
    random_state=42,
)

# Second split: carve a dev set out of the combined training data.
train_features, dev_features, train_labels, dev_labels = train_test_split(
    com_train_features,
    com_train_labels,
    random_state=42,
)

# Training Different Regressors

In [5]:
def train_regressor(estimator, X_train, y_train, cv, name):
    """Cross-validate ``estimator`` and print its mean absolute error.

    Runs ``cross_validate`` with ``neg_mean_absolute_error`` scoring, negates
    the returned scores so they read as (positive) errors, and prints the
    mean and standard deviation across splits for both the training folds
    and the held-out folds.

    Parameters
    ----------
    estimator
        Estimator object forwarded to ``cross_validate``.
    X_train, y_train
        Features and targets forwarded to ``cross_validate``.
    cv
        Cross-validation splitter (or any value ``cross_validate`` accepts
        for its ``cv`` argument).
    name : str
        Label used for the model in the printed messages.

    Returns
    -------
    None
        The results are only printed.
    """
    # Fitted estimators are not requested: nothing here uses them, and
    # keeping one model per split would only hold extra memory.
    cv_results = cross_validate(
        estimator,
        X_train,
        y_train,
        cv=cv,
        scoring="neg_mean_absolute_error",
        return_train_score=True,
    )

    # The scorer reports negated errors; flip the sign back.
    cv_train_error = -1 * cv_results["train_score"]
    cv_test_error = -1 * cv_results["test_score"]

    # Both lines use the same ".3f" precision so the two reports line up.
    for errors, split in ((cv_train_error, "training"), (cv_test_error, "test")):
        print(f"On an average, {name} makes an error of "
              f"{errors.mean():.3f}k +/- {errors.std():.3f}k on the {split} set.")

In [6]:
# Gradient boosting with default hyperparameters, cross-validated on the
# combined training data.
train_regressor(
    GradientBoostingRegressor(),
    com_train_features,
    com_train_labels,
    cv,
    "GB",
)

On an average, GB makes an error of 35.394k +/- 0.273k on the training set.
On an average, GB makes an error of 36.774176k +/- 0.721k on the test set.


# XGBoost

Extreme gradient boosting (XGBoost) is a more recent boosting technique. It is a more regularized form of gradient boosting. Thanks to this regularization, it is
often able to achieve better generalization performance than plain gradient boosting.

In [9]:
from xgboost import XGBRegressor

# XGBoost regressor configured with the squared-error objective.
xgb_regressor = XGBRegressor(objective="reg:squarederror")

In [10]:
# Cross-validate the XGBoost model on the same data and splits as above.
train_regressor(
    estimator=xgb_regressor,
    X_train=com_train_features,
    y_train=com_train_labels,
    cv=cv,
    name="XGBoostRegressor",
)

On an average, XGBoostRegressor makes an error of 17.660k +/- 0.246k on the training set.
On an average, XGBoostRegressor makes an error of 31.340163k +/- 0.791k on the test set.
