## Bagging - [Bootstrap Aggregation]

> A general purpose procedure to reduce the Variance of a learning model. Here we gonna use Bagging in the context of Decision trees

> To Increase the prediction accuracy of a statistical learning method:
1. Take many training sets from the population
2. Build a separate prediction model using each training set
3. Average the resulting predictions

In [2]:
import pandas as pd
import numpy as np

In [7]:
boston_df = pd.read_csv("../data/Boston.csv", index_col=0)
boston_df.head(n=10)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
6,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [10]:
corr = boston_df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
crim,1.0,-0.200469,0.406583,-0.0558916,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621,-0.388305
zn,-0.200469,1.0,-0.533828,-0.0426967,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995,0.360445
indus,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038,-0.483725
chas,-0.0558916,-0.0426967,0.062938,1.0,0.0912028,0.0912512,0.0865178,-0.0991758,-0.00736824,-0.0355865,-0.121515,0.0487885,-0.0539293,0.17526
nox,0.420972,-0.516604,0.763651,0.0912028,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879,-0.427321
rm,-0.219247,0.311991,-0.391676,0.0912512,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808,0.69536
age,0.352734,-0.569537,0.644779,0.0865178,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339,-0.376955
dis,-0.37967,0.664408,-0.708027,-0.0991758,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996,0.249929
rad,0.625505,-0.311948,0.595129,-0.00736824,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676,-0.381626
tax,0.582764,-0.314563,0.72076,-0.0355865,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993,-0.468536


In [35]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error

In [50]:
X = boston_df[boston_df.columns.difference(['medv'])]
y = boston_df.medv

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)

print(X.columns)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

Index(['age', 'black', 'chas', 'crim', 'dis', 'indus', 'lstat', 'nox',
       'ptratio', 'rad', 'rm', 'tax', 'zn'],
      dtype='object')
(404, 13)
(102, 13)
(404,)
(102,)


In [51]:
model_bagging_regressor = BaggingRegressor(n_estimators=1000,max_features=13)

model_bagging_regressor.fit(X_train,y_train)

BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=13, max_samples=1.0,
         n_estimators=1000, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [52]:
predict_vals = model_bagging_regressor.predict(X_test)

In [58]:
predict_vals


array([24.1941, 28.2251, 22.0773, 10.9475, 20.8848, 20.682 , 21.1739,
       20.0532, 20.4803, 18.8036,  8.4734, 15.052 , 14.8819,  8.5186,
       47.3799, 34.0342, 21.1199, 34.7005, 25.6131, 20.9618, 23.6974,
       21.854 , 19.701 , 24.5824, 20.3752, 17.764 , 18.6672, 16.1719,
       44.1258, 19.0929, 14.8776, 17.4326, 20.1911, 21.5313, 22.9481,
       17.9676,  8.7105, 27.7249, 14.495 , 15.4156, 22.7648, 20.8852,
       22.5557, 15.2454, 23.7862, 22.5025, 21.2409, 16.8644, 14.4841,
       25.2229, 16.4178, 19.7271, 21.7622, 39.4714, 14.9821, 21.3385,
       19.6804, 18.992 , 21.9308, 20.0626, 21.5298, 21.7838, 33.2562,
       27.9878, 18.2822, 26.79  , 16.0812, 21.2288, 17.2316, 22.033 ,
       20.4538, 22.8618, 24.2261, 30.98  , 29.7272,  8.7305, 43.1584,
       22.52  , 22.8059, 20.1389, 26.4774, 18.2859, 22.5142, 42.4156,
       41.62  , 24.3692, 22.9139, 14.7594, 26.5595, 16.0982, 19.2011,
       11.9986, 22.542 , 29.9968, 21.0775, 21.8709, 11.798 , 23.3753,
       14.7493, 19.1

In [57]:
y_test

330    22.6
372    50.0
220    23.0
404     8.3
79     21.2
16     19.9
488    20.6
341    18.7
311    16.1
103    18.6
419     8.8
412    17.2
447    14.9
387    10.5
163    50.0
300    29.0
481    23.0
197    33.3
176    29.4
38     21.0
321    23.8
172    19.1
108    20.4
279    29.1
46     19.3
368    23.1
22     19.6
154    19.4
98     38.7
114    18.7
       ... 
66     23.5
345    31.2
482    23.7
388     7.4
234    48.3
207    24.4
91     22.6
498    18.3
240    23.3
138    17.1
408    27.9
225    44.8
226    50.0
327    23.0
97     21.4
427    10.2
160    23.3
392    23.2
55     18.9
436    13.4
255    21.9
301    24.8
506    11.9
247    24.3
375    13.8
57     24.7
456    14.1
61     18.7
214    28.1
109    19.8
Name: medv, Length: 102, dtype: float64

In [59]:
mean_absolute_error(y_test, predict_vals, multioutput='raw_values')

array([2.66847647])

In [43]:
y_test,dtype

NameError: name 'dtype' is not defined

In [38]:
predict_vals

array([14.08, 15.04, 17.19, 14.59,  6.57, 23.73, 49.31, 33.84,  9.37,
       21.77, 34.61, 24.2 , 28.18, 25.71, 34.24, 14.17, 18.59, 18.18,
       25.59, 44.1 , 25.64, 18.75, 24.29, 28.74, 27.26, 14.22, 25.85,
       20.11, 19.82, 19.45, 24.5 , 16.13, 11.16, 17.77, 30.37, 27.04,
       12.88, 18.37,  9.47,  9.83, 15.54, 26.36, 45.95, 14.2 , 34.64,
        9.98, 15.68, 44.82,  9.33, 22.71, 45.06, 23.9 , 24.39, 23.16,
       10.97, 22.84, 23.79, 14.63, 24.23, 30.04, 20.18, 35.08, 24.53,
       30.63,  7.73, 24.29, 19.21, 22.89, 24.31, 21.23, 20.72,  8.5 ,
       10.95, 19.49, 19.45, 19.25, 23.95, 25.68, 21.91, 24.42, 13.98,
       20.52, 11.48, 27.72, 18.84, 22.24, 18.69, 49.48, 14.95, 24.25,
       29.05, 25.53, 20.25, 28.42, 34.61, 20.46, 20.66, 23.  , 14.08,
       20.05, 19.57, 45.43, 14.54, 20.86, 17.13, 14.66, 19.19, 21.5 ,
       27.62, 20.62, 31.12, 23.23, 20.81, 45.89, 24.39, 20.22, 46.48,
       19.64, 19.39, 24.5 , 24.73, 29.41, 15.76, 45.65, 16.82, 15.79,
       14.7 ])