In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from sklearn.model_selection import cross_validate, train_test_split, RandomizedSearchCV, ShuffleSplit
from sklearn.metrics import mean_absolute_error

from sklearn.tree import DecisionTreeRegressor

In [2]:
np.random.seed(306)

In [3]:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

In [4]:
features, labels = fetch_california_housing(as_frame=True, return_X_y = True)
# to get values in $1000
labels *= 100

com_train_features, test_features, com_train_labels, test_labels = train_test_split(features, labels, random_state=42)

train_features, dev_features, train_labels, dev_labels =  train_test_split(com_train_features, com_train_labels, random_state=42)

helper functions

In [6]:
def train_regressor(estimator, X_train, y_train, cv, name):
    cv_results = cross_validate(
        estimator, X_train, y_train, cv=cv,
        scoring="neg_mean_absolute_error",
        return_train_score=True, return_estimator=True
    )

    cv_train_error = -1*cv_results['train_score']
    cv_test_error = -1*cv_results['test_score']

    print(f"On an average, {name} makes an error of "
            f"{cv_train_error.mean():.3f}k +/- {cv_train_error.std():.3f}k on the training set.")

    print(f"On an average, {name} makes an error of "
            f"{cv_test_error.mean():.3f}k +/- {cv_test_error.std():.3f}k on the training set.")

Decision tree regressor

In [7]:
# overfitting
train_regressor(
    DecisionTreeRegressor(), com_train_features,
    com_train_labels, cv, 'decision tree regressor'
)

On an average, decision tree regressor makes an error of 0.000k +/- 0.000k on the training set.
On an average, decision tree regressor makes an error of 47.259k +/- 1.142k on the training set.


Bagging regressor

In [8]:
# bagging helps us to reduce variance / overfitting in the base classifier
train_regressor(
    BaggingRegressor(), com_train_features,
    com_train_labels, cv, 'bagging regressor'
)

On an average, bagging regressor makes an error of 14.377k +/- 0.196k on the training set.
On an average, bagging regressor makes an error of 35.217k +/- 0.608k on the training set.


random forest regressor

In [9]:
train_regressor(
    RandomForestRegressor(), com_train_features,
    com_train_labels, cv, 'random forest regressor'
)

On an average, random forest regressor makes an error of 12.642k +/- 0.071k on the training set.
On an average, random forest regressor makes an error of 33.198k +/- 0.718k on the training set.


Parameter search for random forest regressor

In [10]:
parameter_distributions = {
    "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100]
}

search_cv = RandomizedSearchCV(
    RandomForestRegressor(n_jobs=2), param_distributions=parameter_distributions,
    scoring="neg_mean_absolute_error", n_iter=10, random_state=0, n_jobs=2
)

search_cv.fit(com_train_features, com_train_labels)

columns = [f"param_{name}" for name in parameter_distributions.keys()]
columns += ["mean_test_error", "std_test_error"]
cv_results = pd.DataFrame(search_cv.cv_results_)
cv_results["mean_test_error"] = -cv_results["mean_test_score"]
cv_results["std_test_error"] = -cv_results["std_test_score"]
cv_results[columns].sort_values(by="mean_test_error")

Unnamed: 0,param_n_estimators,param_max_leaf_nodes,mean_test_error,std_test_error
0,500,100,40.616043,-0.76421
2,10,100,41.167172,-0.880918
7,100,50,43.660362,-0.856177
8,1,100,46.733375,-0.996448
1,100,20,49.50473,-1.103308
6,50,20,49.553503,-1.071306
9,10,20,49.960918,-0.981662
3,500,10,55.027453,-1.038467
4,5,5,61.461462,-1.305595
5,5,2,73.047106,-1.107615


In [11]:
error = -search_cv.score(test_features, test_labels)
print(f"On average, our random forest regressor makes an error of {error:.2f}k$")

On average, our random forest regressor makes an error of 40.46k$
