# Bagging Regressor

In [94]:
import numpy as np
from sklearn.datasets import fetch_california_housing

In [95]:
import pandas as pd

In [96]:
housing = fetch_california_housing()

In [97]:
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df['target'] = housing.target

In [98]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [99]:
df = df.sample(500)

In [100]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
8522,5.5133,37.0,4.59322,0.889831,355.0,3.008475,33.9,-118.34,2.273
15108,4.1716,11.0,5.472144,1.010692,5247.0,2.952729,32.85,-116.96,1.661
2432,2.5562,43.0,6.072464,1.15942,1056.0,3.826087,36.65,-119.53,0.932
6350,6.2654,17.0,6.570637,1.00554,3538.0,3.266851,34.16,-117.95,2.738
4438,2.2222,41.0,4.261792,1.011792,1699.0,4.007075,34.08,-118.2,1.26


In [101]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [102]:
X = df.drop('target', axis=1)
y = df['target']

In [103]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=2)

In [104]:
lr = LinearRegression()
dt = DecisionTreeRegressor()
knn = KNeighborsRegressor()

In [105]:
lr.fit(X_train,y_train)
dt.fit(X_train,y_train)
knn.fit(X_train,y_train)

In [106]:
y_pred1 = lr.predict(X_test)
y_pred2 = dt.predict(X_test)
y_pred3 = knn.predict(X_test)

In [107]:
# linear regression 
r2_score(y_test,y_pred1)

0.7390926776584247

In [108]:
# decision tree 
r2_score(y_test,y_pred2)

0.4700609269669298

In [109]:
# knn 
r2_score(y_test,y_pred3)

-0.26345309376443526

In [110]:
from sklearn.ensemble import BaggingRegressor

In [111]:
bag_regressor = BaggingRegressor(random_state=1)
bag_regressor.fit(X_train,y_train)

In [112]:
y_preds = bag_regressor.predict(X_test)

In [113]:
bag_regressor.score(X_train,y_train)

0.9335183733031172

In [114]:
bag_regressor.score(X_test,y_test)

0.7532118636264806

In [115]:
n_samples = df.shape[0]
n_features = df.shape[1]

In [116]:
params = {'estimator': [None, LinearRegression(),KNeighborsRegressor()],
          'n_estimators': [20,50,100],
          'max_features': [0.5,1.0],
          'max_samples':[0.5,1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True,False]}

bagging_regressor_grid = GridSearchCV(BaggingRegressor(random_state=1,n_jobs=-1),param_grid=params, cv=3, n_jobs=-1, verbose=1)
bagging_regressor_grid.fit(X_train,y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


In [117]:
# train r2 score 
bagging_regressor_grid.best_estimator_.score(X_train,y_train)

0.9547630150123463

In [118]:
# test r2
bagging_regressor_grid.best_estimator_.score(X_test,y_test)

0.7974591983892014

In [119]:
# grid search r2 score
bagging_regressor_grid.best_score_

0.6621291805589972

In [120]:
# best parameters
bagging_regressor_grid.best_params_

{'bootstrap': True,
 'bootstrap_features': False,
 'estimator': None,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 100}