In [4]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, ShuffleSplit
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

from sklearn.pipeline import Pipeline

In [5]:
# Seed NumPy's global random number generator so repeated runs draw the same numbers.
SEED = 306
np.random.seed(SEED)

In [2]:
# Cross-validation strategy shared by every model below: ten shuffle splits,
# each holding out 20% of the rows, with a fixed random_state for repeatability.
cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Fetch the California housing data as (features, labels) with as_frame=True.
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)

# Rescale the target to get values in $1000.
labels = labels * 100

# First split: separate a held-out test set from the complete training data.
com_train_features, test_features, com_train_labels, test_labels = train_test_split(
    features,
    labels,
    random_state=42,
)

# Second split: carve a dev set out of the complete training data.
train_features, dev_features, train_labels, dev_labels = train_test_split(
    com_train_features,
    com_train_labels,
    random_state=42,
)

#### Helper functions

In [8]:
def train_regressor(estimator, X_train, y_train, cv, name):
    """Cross-validate ``estimator`` and print its mean absolute error.

    Runs ``cross_validate`` with ``neg_mean_absolute_error`` scoring and prints
    the mean and standard deviation of the error across splits, once for the
    training folds and once for the held-out (test) folds.

    Parameters
    ----------
    estimator
        Estimator handed to ``cross_validate``.
    X_train, y_train
        Features and labels handed to ``cross_validate``.
    cv
        Cross-validation splitter handed to ``cross_validate``.
    name : str
        Label used for the model in the printed report.

    Returns
    -------
    None
        The results are only printed.
    """
    cv_results = cross_validate(
        estimator, X_train, y_train, cv=cv,
        scoring="neg_mean_absolute_error",
        return_train_score=True, return_estimator=True
    )

    # The scorer yields negated errors; flip the sign to report positive errors.
    cv_train_error = -1 * cv_results['train_score']
    cv_test_error = -1 * cv_results['test_score']

    print(f"On an average, {name} makes an error of "
          f"{cv_train_error.mean():.3f}k +/- {cv_train_error.std():.3f}k on the training set.")

    # This line reports 'test_score', i.e. the held-out folds, not the training folds.
    print(f"On an average, {name} makes an error of "
          f"{cv_test_error.mean():.3f}k +/- {cv_test_error.std():.3f}k on the test set.")

#### AdaBoost

In [9]:
# Cross-validate AdaBoost with its default hyper-parameters. The last argument
# is the label shown in the printed report, so it names the model being run.
train_regressor(
    AdaBoostRegressor(), com_train_features,
    com_train_labels, cv, 'AdaBoost regressor'
)

On an average, decision tree regressor makes an error of 73.263k +/- 6.031k on the training set.
On an average, decision tree regressor makes an error of 73.623k +/- 6.057k on the training set.


#### Gradient Boosting



In [11]:
# Cross-validate gradient boosting with explicitly spelled-out hyper-parameters.
# The last argument is the label shown in the printed report, so it names the
# model being run.
train_regressor(
    GradientBoostingRegressor(
        loss = 'squared_error', learning_rate = 0.1, n_estimators = 100,
        criterion = 'friedman_mse'
    ), com_train_features,
    com_train_labels, cv, 'gradient boosting regressor'
)

On an average, decision tree regressor makes an error of 35.394k +/- 0.273k on the training set.
On an average, decision tree regressor makes an error of 36.771k +/- 0.722k on the training set.


#### XGBoost

In [12]:
# The interactive `?XGBRegressor` help lookup was removed from this cell: it is
# IPython-only syntax and opens the pager on every Restart & Run All.
# Run `XGBRegressor?` in a scratch cell when the parameter list is needed.

In [13]:
# Cross-validate XGBoost with a squared-error objective. The last argument is
# the label shown in the printed report, so it names the model being run.
train_regressor(
    XGBRegressor(objective = 'reg:squarederror'), com_train_features,
    com_train_labels, cv, 'XGBoost regressor'
)

On an average, decision tree regressor makes an error of 17.660k +/- 0.246k on the training set.
On an average, decision tree regressor makes an error of 31.340k +/- 0.791k on the training set.
