In [7]:
from sklearn import datasets
from sklearn.datasets import fetch_california_housing
import pandas as pd

In [8]:
# Fetch the California housing dataset and split it into the feature matrix
# and the regression target.
# NOTE: the "_boston" suffix is a leftover name; the data bound here comes from
# fetch_california_housing(). The names are kept because later cells use them.
california = fetch_california_housing()
X_boston = california.data
Y_boston = california.target

In [9]:
# Quick sanity check of what was loaded: column names plus the shapes of the
# feature matrix and the target vector.
print(f'Dataset features names : {california.feature_names}')
print(f'Dataset features size : {X_boston.shape}')
print(f'Dataset target size : {Y_boston.shape}')

Dataset features names : ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Dataset features size : (20640, 8)
Dataset target size : (20640,)


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor

In [11]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_boston,
    Y_boston,
    train_size=0.80,
    test_size=0.20,
    random_state=123,
)
# Report the shape of each of the four resulting arrays, in the order they
# were returned.
print(
    'Train/Test Sets Sizes : ',
    *(part.shape for part in (X_train, X_test, Y_train, Y_test)),
)

Train/Test Sets Sizes :  (16512, 8) (4128, 8) (16512,) (4128,)


In [12]:
# Baseline regressors, all created with their default hyper-parameters:
# linear regression, a single decision tree, and k-nearest neighbours.
lr, dt, knn = (
    LinearRegression(),
    DecisionTreeRegressor(),
    KNeighborsRegressor(),
)

In [17]:
# Train the linear-regression baseline on the training split. The call stays
# the cell's last expression so its return value is still displayed.
lr.fit(X=X_train, y=Y_train)

In [16]:
# Train the decision-tree baseline on the training split. The call stays the
# cell's last expression so its return value is still displayed.
dt.fit(X=X_train, y=Y_train)

In [15]:
# Train the k-nearest-neighbours baseline on the training split. The features
# are passed in unscaled, exactly as loaded.
knn.fit(X=X_train, y=Y_train)

In [14]:
# Test-set predictions from each baseline model. These cells require the
# three models to have been fitted first.
y_pred1 = lr.predict(X=X_test)    # linear regression
y_pred2 = dt.predict(X=X_test)    # decision tree
y_pred3 = knn.predict(X=X_test)   # k-nearest neighbours

In [20]:
# Coefficient of determination (R²) of each baseline on the held-out test set.
print("R2 score for LR", r2_score(y_true=Y_test, y_pred=y_pred1))
print("R2 score for DT", r2_score(y_true=Y_test, y_pred=y_pred2))
print("R2 score for KNN", r2_score(y_true=Y_test, y_pred=y_pred3))

R2 score for LR 0.6104546894797871
R2 score for DT 0.6038044541494385
R2 score for KNN 0.16261917827057237


## Bagging Regressor

In [None]:

# Bagging ensemble with default settings; random_state pins the resampling so
# repeated runs give the same ensemble.
bag_regressor = BaggingRegressor(random_state=1)
# Kept as the cell's last expression so its return value is still displayed.
bag_regressor.fit(X=X_train, y=Y_train)

In [None]:
# Test-set predictions of the default bagging ensemble.
Y_preds = bag_regressor.predict(X=X_test)

In [23]:
# R² of the default bagging ensemble on the data it was trained on versus the
# held-out data; the gap between the two indicates how much it overfits.
print('Training Coefficient of R^2 : %.3f' % (bag_regressor.score(X=X_train, y=Y_train),))
print('Test Coefficient of R^2 : %.3f' % (bag_regressor.score(X=X_test, y=Y_test),))

Training Coefficient of R^2 : 0.963
Test Coefficient of R^2 : 0.792


## Grid Search with Bagging Regressor

In [35]:
# Start of a manual stopwatch for the grid-search section. time.time() returns
# a wall-clock timestamp, so the later (end - start) difference is elapsed
# real time, not CPU time.
# NOTE(review): the measurement only covers the grid search if this cell is run
# immediately before the fit cell and the cells are executed top to bottom.
import time
start = time.time()

In [25]:
# Row and column counts of the full (unsplit) feature matrix, which is 2-D.
n_samples, n_features = X_boston.shape

In [32]:
# Hyper-parameter grid for BaggingRegressor.
# 3 * 3 * 2 * 2 * 2 * 2 = 144 candidate combinations in total.
params = {
    # Base learner; None selects BaggingRegressor's default base estimator
    # (a decision tree, per the scikit-learn documentation).
    "estimator": [None, LinearRegression(), KNeighborsRegressor()],
    # Number of base learners in the ensemble.
    "n_estimators": [20, 50, 100],
    # Fraction of training rows drawn for each base learner.
    "max_samples": [0.5, 1.0],
    # Fraction of feature columns drawn for each base learner.
    "max_features": [0.5, 1.0],
    # Whether rows are drawn with replacement.
    "bootstrap": [True, False],
    # Whether feature columns are drawn with replacement.
    "bootstrap_features": [True, False],
}

In [33]:
# Exhaustive search over `params` using 3-fold cross-validation. No `scoring`
# is given, so candidates are ranked with the estimator's own score method.
# NOTE(review): both the search and the inner BaggingRegressor request
# n_jobs=-1; nested parallelism may oversubscribe CPU cores — confirm whether
# parallelising only one of the two levels is faster on the target machine.
bagging_regressor_grid = GridSearchCV(
    estimator=BaggingRegressor(random_state=1, n_jobs=-1),
    param_grid=params,
    cv=3,
    verbose=1,
    n_jobs=-1,
)

In [34]:
# Run the grid search on the training split only, so the test split stays
# unseen during model selection.
bagging_regressor_grid.fit(X=X_train, y=Y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


In [36]:
# Summary of the grid search: train and test R² of the best estimator found,
# the best cross-validated score, and the winning parameter combination.
print('Train R^2 Score : %.3f' % (bagging_regressor_grid.best_estimator_.score(X=X_train, y=Y_train),))
print('Test R^2 Score : %.3f' % (bagging_regressor_grid.best_estimator_.score(X=X_test, y=Y_test),))
print('Best R^2 Score Through Grid Search : %.3f' % (bagging_regressor_grid.best_score_,))
print('Best Parameters : ', bagging_regressor_grid.best_params_)

Train R^2 Score : 0.973
Test R^2 Score : 0.816
Best R^2 Score Through Grid Search : 0.801
Best Parameters :  {'bootstrap': True, 'bootstrap_features': True, 'estimator': None, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100}


In [37]:
# Stop the manual stopwatch. `start` was captured with time.time() in an
# earlier cell, so the difference below is wall-clock time elapsed between
# that cell and this one (including any pause between running them). It is
# not CPU time, so the label says "wall-clock" rather than "CPU".
end = time.time()
print("Elapsed wall-clock time: %.2f seconds" % (end - start))

CPU Time: 46.08 seconds
