In [72]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

In [73]:
# Generate a synthetic classification dataset: 10,000 rows and 10 feature
# columns, 3 of which are informative. The seed makes the data reproducible.
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_informative=3,
    random_state=42,
)
print(X.shape, y.shape, sep='\n')

(10000, 10)
(10000,)


In [74]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [75]:
# Show the sizes of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(8000, 10)
(2000, 10)


In [76]:
# Baseline: a single, seeded decision tree scored on the held-out test set.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = dt.predict(X_test)

print('Decision Tree Accuracy: ', accuracy_score(y_test, y_pred))

Decision Tree Accuracy:  0.9265


# Bagging using Decision Tree

In [77]:
# Bagging: 500 decision trees, each trained on a bootstrap sample (drawn with
# replacement) whose size is 25% of X_train.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42,
)

In [78]:
# Train the ensemble, then score its predictions on the test set.
y_pred = bag.fit(X_train, y_train).predict(X_test)

print('Bagging Accuracy: ', accuracy_score(y_test, y_pred))


Bagging Accuracy:  0.945


In [84]:
len(bag.estimators_samples_)    # estimators_samples_ holds one array of sampled training-row indices per fitted estimator, so its length equals n_estimators

500

In [85]:
# Row indices (into X_train) of the samples drawn for the first estimator
bag.estimators_samples_[0]

array([2523, 3113, 7114, ..., 4291, 4472, 3620], dtype=int32)

In [86]:
# Number of rows drawn for the first estimator: max_samples=0.25 of the 8000 training rows
bag.estimators_samples_[0].shape

(2000,)

In [87]:
# Feature indices used by the first estimator; this ensemble was built without
# column sampling, so all 10 features are kept
bag.estimators_features_[0].shape


(10,)

# Bagging with SVM

In [88]:
# Bagging with a support vector classifier as the base estimator:
# 500 SVC models, each fitted on a bootstrap sample (drawn with replacement)
# sized at 25% of X_train.
bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42,
)

In [89]:
# Train the SVC ensemble, then score its predictions on the test set.
y_pred = bag.fit(X_train, y_train).predict(X_test)

print('Bagging using SVM Accuracy: ', accuracy_score(y_test, y_pred))

Bagging using SVM Accuracy:  0.9125


# Pasting

In [92]:
# Pasting: same setup as bagging, except each tree's 25% row sample is drawn
# WITHOUT replacement (bootstrap=False).
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    random_state=42,
    verbose=1,   # emit progress messages while fitting and predicting
    n_jobs=1,    # run as a single job
)

y_pred = bag.fit(X_train, y_train).predict(X_test)

print('Pasting Accuracy: ', accuracy_score(y_test, y_pred))


Pasting Accuracy:  0.946


# Random Subspaces

In [94]:
# Random Subspaces: every tree is given 100% of the training rows
# (max_samples=1.0, no row bootstrapping) but only a random draw of the
# columns. max_features=0.5 draws half as many columns as there are features,
# and bootstrap_features=True draws them with replacement.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=False,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42,
)

y_pred = bag.fit(X_train, y_train).predict(X_test)

print('Random Subspaces Accuracy: ', accuracy_score(y_test, y_pred))

Random Subspaces Accuracy:  0.9415


In [96]:
# Column indices drawn for the first estimator; an index can repeat because
# bootstrap_features=True samples columns with replacement
bag.estimators_features_[0]

array([9, 2, 9, 7, 7], dtype=int32)

# Random Patches

In [97]:
# Random Patches: sample BOTH rows and columns for every tree.
#   rows    -> 25% of X_train, drawn without replacement (bootstrap=False)
#   columns -> max_features=0.5, drawn with replacement (bootstrap_features=True)
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42,
)

y_pred = bag.fit(X_train, y_train).predict(X_test)

print('Random Patches Accuracy: ', accuracy_score(y_test, y_pred))

Random Patches Accuracy:  0.9385


# OOB SCORE

In [99]:
# Out-of-bag evaluation: each tree is fitted on a bootstrap sample sized at
# 25% of X_train, and oob_score=True asks the ensemble to compute oob_score_
# during fit.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42,
)
bag.fit(X_train, y_train)

# Report the out-of-bag estimate next to the held-out test accuracy.
print('OOB Score: ', bag.oob_score_)
y_pred = bag.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))

OOB Score:  0.942875
Accuracy:  0.945


# Bagging Tips <br>
- Bagging generally gives better results than Pasting 
- Good results come around the 25% to 50% row sampling mark 
- Random patches and subspaces should be used while dealing with high dimensional data 
- To find good hyperparameter values, use GridSearchCV or RandomizedSearchCV

# GridSearchCV

In [104]:
from sklearn.model_selection import GridSearchCV

In [105]:
# Hyperparameter grid for the BaggingClassifier search.
# Every combination of the listed values is cross-validated, so each list
# should contain distinct values: a repeated entry is simply evaluated again.
parameters = {
    # Ensemble sizes to compare. 500 used to be listed twice; 50 is assumed to
    # be the intended low end -- TODO confirm.
    'n_estimators': [50, 100, 500],
    # Fraction of training rows drawn for each base estimator.
    'max_samples': [0.1, 0.4, 0.7, 1.0],
    # True = rows drawn with replacement (bagging), False = without (pasting).
    'bootstrap': [True, False],
    # Fraction of feature columns drawn for each base estimator.
    'max_features': [0.1, 0.4, 0.7, 1.0],
}

In [107]:
# Exhaustive 5-fold cross-validated search over `parameters`.
# - random_state seeds the ensemble so repeated runs give the same scores,
#   consistent with the seeded ensembles used elsewhere in this notebook.
# - n_jobs=-1 runs the candidate fits in parallel on all available cores; the
#   grid is large (every combination x 5 folds), so a serial run is very slow.
# - verbose=1 prints progress so a long search is not mistaken for a hang.
search = GridSearchCV(
    BaggingClassifier(random_state=42),
    parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train)

KeyboardInterrupt: 

In [103]:
# Best hyperparameter combination found by the search; the attribute exists
# only once search.fit has run to completion
search.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
# Mean cross-validated score of the best hyperparameter combination
search.best_score_