In [3]:
import pandas as pd
import numpy as np

# Load the Boston Housing dataset from the original StatLib source.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regular expression, so it is written as a raw string:
# "\s" inside a normal string literal is an invalid escape sequence and
# triggers a warning on current Python versions.
# The first 22 lines of the file are skipped before parsing.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Reconstruct the dataset: every record is spread over two consecutive rows.
# Even rows contribute all of their columns and odd rows contribute their
# first two columns to the feature matrix; the third column of each odd row
# is the target value.
X_boston = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
Y_boston = raw_df.values[1::2, 2]

# Feature names (same as the original dataset)
feature_names = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"
]

# Fail loudly if the remote file layout ever stops matching the slicing above.
assert X_boston.shape == (len(Y_boston), len(feature_names)), (
    "reconstructed feature matrix does not line up with targets / feature names"
)

print("Dataset feature names : " + str(feature_names))
print("Dataset features size : " + str(X_boston.shape))
print("Dataset target size   : " + str(Y_boston.shape))


Dataset feature names : ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
Dataset features size : (506, 13)
Dataset target size   : (506,)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_boston,
    Y_boston,
    train_size=0.80,
    test_size=0.20,
    random_state=123,
)
# Report the shape of every partition on a single line.
print(
    "train / test sets sizes :",
    *(part.shape for part in (X_train, X_test, Y_train, Y_test)),
)

train / test sets sizes : (404, 13) (102, 13) (404,) (102,)


In [8]:
# Three baseline regressors, all with library-default hyperparameters.
lr = LinearRegression()
# The tree is seeded so that refitting it on the same data gives the same
# model (and therefore the same score) on every run; the seed matches the one
# used for the train/test split.
dt = DecisionTreeRegressor(random_state=123)
knn = KNeighborsRegressor()

In [9]:
# Fit each baseline regressor on the training split only; the test split is
# not touched here. The last call is also the cell's final expression.
lr.fit(X_train, Y_train)
dt.fit(X_train, Y_train)
knn.fit(X_train, Y_train)

In [10]:
# Test-set predictions, in the order: linear regression, decision tree, k-NN.
y_pred1, y_pred2, y_pred3 = (
    fitted.predict(X_test) for fitted in (lr, dt, knn)
)

In [12]:
# Print the test-set R² of each baseline, one line per model.
for caption, predictions in (
    ('R2score for LR', y_pred1),
    ('R2score for dt', y_pred2),
    ('R2score for knn', y_pred3),
):
    print(caption, r2_score(Y_test, predictions))

R2score for LR 0.6592466510354094
R2score for dt 0.3942515115311318
R2score for knn 0.5475962186976784


In [13]:
from sklearn.ensemble import BaggingRegressor
# Bagging ensemble with default settings (only the seed is fixed), trained on
# the training split. `fit` returns the estimator itself, so construction and
# training can be chained.
bag_regressor = BaggingRegressor(random_state=1).fit(X_train, Y_train)
# Keep the fitted estimator as the cell's displayed value.
bag_regressor

In [16]:
# Test-set predictions of the default bagging ensemble.
y_preds = bag_regressor.predict(X_test)

# Score the ensemble on both partitions via the estimator's own `score`,
# then report the two numbers.
bag_train_r2 = bag_regressor.score(X_train, Y_train)
bag_test_r2 = bag_regressor.score(X_test, Y_test)
print('training coefficient of r2 :', bag_train_r2)
print('test coefficient of r2 :', bag_test_r2)

training coefficient of r2 : 0.9799359879973576
test coefficient of r2 : 0.8184644795411804


In [20]:
from sklearn.datasets import fetch_openml

# Download version 1 of the "boston" dataset from OpenML as pandas objects.
boston = fetch_openml(name="boston", version=1, as_frame=True)

# Expose the feature table and the target under short names.
X = boston.data
y = boston.target


In [25]:
%%time

n_samples = boston.data.shape[0]
n_features = boston.data.shape[1]

params ={'estimator':[None, LinearRegression(), KNeighborsRegressor()],
         'n_estimators': [20,50,100],
         'max_samples': [0.5,1.0],
         'max_features': [0.5,1.0],
         'bootstrap': [True,False],
         'bootstrap_features': [True, False]}

bagging_regressor_grid = GridSearchCV(BaggingRegressor(random_state=1, n_jobs=-1), param_grid = params, cv=3 , n_jobs=-1, verbose=1)
bagging_regressor_grid.fit(X_train, Y_train)


print('train r2 score :', bagging_regressor_grid.best_estimator_.score(X_train, Y_train))
print('test r2 score :', bagging_regressor_grid.best_estimator_.score(X_test, Y_test))

print('best r2 score through grid search ',bagging_regressor_grid.best_score_)
print('best parameters :',bagging_regressor_grid.best_params_)




Fitting 3 folds for each of 144 candidates, totalling 432 fits
train r2 score : 0.9832238020638931
test r2 score : 0.805126651796481
best r2 score through grid search  0.8713456857539184
best parameters : {'bootstrap': True, 'bootstrap_features': False, 'estimator': None, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 50}
CPU times: total: 1.17 s
Wall time: 19.8 s
