# Bagging and Random Forest Regressor on California Housing Dataset



In [56]:
from sklearn.datasets import fetch_california_housing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from sklearn.pipeline import Pipeline

In [6]:
# Fix NumPy's global random seed; the train/test split and the estimators
# below are created without an explicit random_state of their own.
np.random.seed(306)

## Loading the California Housing Dataset

In [15]:
# Load the dataset as pandas objects: predictors in `features`, target in `labels`.
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)

# Multiply the labels by 100 to get the value in "thousand dollars".
labels = labels * 100

# Hold out a test split (split sizes are left at train_test_split's defaults).
X_train, X_test, y_train, y_test = train_test_split(features, labels)

In [16]:
# Shared cross-validation scheme: 10 shuffled splits, each holding out 20% of
# the rows it is given; random_state is fixed so the splits repeat across runs.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

## Training Different Regressors

In [53]:
def train_regressors(estimator, X_train, y_train, cv, name):
  """Cross-validate a regressor and print its mean absolute error.

  The estimator is evaluated with ``cross_validate`` using the
  'neg_mean_absolute_error' scorer. The mean and standard deviation of the
  error across the splits are printed for both the training part and the
  held-out part of each split (the latter is labelled "test dataset" in the
  printed message).

  Parameters
  ----------
  estimator : the regressor to evaluate. It is NOT fitted by this function:
      cross_validate fits its own copies, so a separate fit on the full
      training data would be wasted work.
  X_train, y_train : feature matrix and target passed to cross_validate.
  cv : cross-validation splitter (or anything cross_validate accepts as `cv`).
  name : human-readable model name used in the printed messages.

  Returns
  -------
  None. The results are only printed.
  """
  # cross_validate (rather than cross_val_score) is used to get train_score
  # in addition to test_score. The fitted per-split estimators are not
  # requested because nothing below reads them.
  cv_results = cross_validate(
      estimator,
      X_train,
      y_train,
      cv=cv,
      scoring='neg_mean_absolute_error',
      return_train_score=True,
  )

  # The scorer reports the negated MAE, so flip the sign to get an error.
  cv_train_error = -1 * cv_results['train_score']
  cv_test_error = -1 * cv_results['test_score']

  print(f'On an average the {name} makes an error of {cv_train_error.mean():.3f} +/- {cv_train_error.std():.3f} on the training dataset')
  print(f'On an average the {name} makes an error of {cv_test_error.mean():.3f} +/- {cv_test_error.std():.3f} on the test dataset')

### Decision Tree Regressor

In [54]:
# Baseline: a single, unconstrained decision tree.
train_regressors(
    estimator=DecisionTreeRegressor(),
    X_train=X_train,
    y_train=y_train,
    cv=cv,
    name='Decision Tree Regressor',
)

On an average the Decision Tree Regressor makes an error of 0.000 +/- 0.000 on the training dataset
On an average the Decision Tree Regressor makes an error of 47.083 +/- 0.752 on the test dataset


As you can see, the decision tree regressor achieves **zero error on the training set**. This is a case of an **overfitted model**: it obtains zero error on the training data but a much higher error on the held-out test folds.

This is precisely the problem that *bagging* and *random forests* are designed to address.

### Bagging Regressor

In [44]:
# Bagging ensemble with its default settings.
train_regressors(
    estimator=BaggingRegressor(),
    X_train=X_train,
    y_train=y_train,
    cv=cv,
    name='Bagging Regressor',
)

On an average the Bagging Regressor makes an error of 14.343 +/- 0.173 on the training dataset
On an average the Bagging Regressor makes an error of 36.135 +/- 0.704 on the test dataset


### Random Forest Regressor

In [50]:
# Random forest with its default settings.
train_regressors(
    estimator=RandomForestRegressor(),
    X_train=X_train,
    y_train=y_train,
    cv=cv,
    name='Random Forest Regressor',
)

On an average the Random Forest Regressor makes an error of 12.562 +/- 0.125 on the training dataset
On an average the Random Forest Regressor makes an error of 33.936 +/- 0.630 on the test dataset


As you can see, bagging helps reduce the variance (overfitting) of the base regressors: both ensembles reach a lower test error than the single decision tree.

## Parameter Search for Random Forest Regressor

In [58]:
# Candidate values for the two hyper-parameters explored by the search.
param_distributions = {
    'n_estimators': [1, 2, 5, 10, 20, 50, 100, 200, 500],
    'max_leaf_nodes': [2, 5, 10, 20, 50, 100],
}

# Randomized search over 10 sampled combinations, scored by negated MAE.
search_cv = RandomizedSearchCV(
    estimator=RandomForestRegressor(n_jobs=2),
    param_distributions=param_distributions,
    n_iter=10,
    scoring='neg_mean_absolute_error',
    n_jobs=2,
    random_state=0,
)

search_cv.fit(X_train, y_train)

RandomizedSearchCV(estimator=RandomForestRegressor(n_jobs=2), n_jobs=2,
                   param_distributions={'max_leaf_nodes': [2, 5, 10, 20, 50,
                                                           100],
                                        'n_estimators': [1, 2, 5, 10, 20, 50,
                                                         100, 200, 500]},
                   random_state=0, scoring='neg_mean_absolute_error')

In [59]:
# Show the raw results dictionary recorded by the search (one array entry per
# sampled parameter combination).
search_cv.cv_results_

{'mean_fit_time': array([48.97458858,  6.74607306,  1.24673705, 30.65742054,  0.44886131,
         0.27350922,  3.18712454,  7.37808309,  0.24530902,  0.81616149]),
 'mean_score_time': array([0.46840425, 0.12847791, 0.13214116, 0.35109878, 0.1291018 ,
        0.13505988, 0.129006  , 0.12024517, 0.00506806, 0.12918344]),
 'mean_test_score': array([-40.48727206, -49.50199406, -41.42554272, -55.28289325,
        -61.78084541, -73.9068453 , -49.69478645, -43.85637238,
        -46.54914759, -50.06968649]),
 'param_max_leaf_nodes': masked_array(data=[100, 20, 100, 10, 5, 2, 20, 50, 100, 20],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[500, 100, 10, 500, 5, 5, 50, 100, 1, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object

In [62]:
# Columns to report: one per searched hyper-parameter, then the error summary.
columns = [f'param_{each}' for each in param_distributions] + [
    'mean_test_error',
    'std_test_error',
]

# The search stores negated MAE scores; flip the sign of the mean to get an
# error, and carry the standard deviation over unchanged.
cv_results = pd.DataFrame(search_cv.cv_results_).assign(
    mean_test_error=lambda frame: -frame['mean_test_score'],
    std_test_error=lambda frame: frame['std_test_score'],
)

# Best (lowest error) candidates first.
cv_results[columns].sort_values(by='mean_test_error')

Unnamed: 0,param_n_estimators,param_max_leaf_nodes,mean_test_error,std_test_error
0,500,100,40.487272,0.797443
2,10,100,41.425543,0.95833
7,100,50,43.856372,0.830426
8,1,100,46.549148,1.065403
1,100,20,49.501994,0.715232
6,50,20,49.694786,0.677652
9,10,20,50.069686,0.827382
3,500,10,55.282893,0.50168
4,5,5,61.780845,0.815793
5,5,2,73.906845,0.818109


In [66]:
# The search was configured with scoring='neg_mean_absolute_error', so the
# value returned by score() is a negated error; flip the sign to report it.
error = -1 * search_cv.score(X_test, y_test)
# The tuned model is a RandomForestRegressor, so the message says "Regressor".
print(f'On an average, Random Forest Regressor makes an error of {error:.3f}k $ on the test dataset.')

On an average, Random Forest Classifier makes an error of 39.331k $ on the test dataset.
