In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings('ignore')

In [2]:
# Synthetic binary classification problem: 10,000 rows, 10 features,
# 3 of which are informative. random_state pins the generated data so the
# scores in this notebook are reproducible under Restart & Run All.
X,y=make_classification(n_samples=10000,n_features=10,n_informative=3,
                        random_state=42)

# Hold out 20% of the rows for evaluation.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,
                                               random_state=42)

# Baseline: a single decision tree, used as the reference for the ensembles.
dt=DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)
y_pred=dt.predict(X_test)
print("Decision Tree",accuracy_score(y_test,y_pred))

Decision Tree 0.896


In [3]:
# (rows, features) of the full generated dataset.
X.shape

(10000, 10)

### 1.   Bagging using Decision Trees

In [4]:
# n_estimators = number of trees in the ensemble.
# max_samples  = each tree is trained on 25% of the rows of X_train.
# bootstrap = True, rows are drawn with replacement.

# The base learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs on both.
bag=BaggingClassifier(
            DecisionTreeClassifier(),
            n_estimators=10,
            max_samples=0.25,
            bootstrap=True, # sampling with replacement 
            random_state=42)


In [5]:
# Train the ensemble on the training split; the cell displays the fitted estimator.
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  random_state=42)

In [6]:
# (rows, features) of the training split -- 80% of the 10,000 generated rows.
X_train.shape

(8000, 10)

In [7]:
# Evaluate the bagged trees on the held-out test split.
y_predict = bag.predict(X_test)
bagged_accuracy = accuracy_score(y_test, y_predict)
print("Accuracy score -", bagged_accuracy)

Accuracy score - 0.8925


In [8]:

# 10 trees are created, each trained on 2000 drawn row indices
# (max_samples=0.25 of the 8000 training rows).

# To check the size of one tree's sample: bag.estimators_samples_[0].shape

# One array of training-row indices per fitted estimator.
bag.estimators_samples_

[array([2523, 3113, 7114, ..., 4291, 4472, 3620]),
 array([4782,  663, 7155, ..., 5963,  495, 1767]),
 array([5462, 6574, 4896, ..., 3979, 7827,   37]),
 array([2848, 2629, 1591, ..., 7723, 1314, 1565]),
 array([3821, 6494, 1606, ..., 5686, 7870, 2558]),
 array([2261, 7922, 3649, ..., 4478, 6286, 6943]),
 array([ 652, 1676, 2291, ..., 2723, 7007, 6344]),
 array([2478, 4107, 1958, ..., 7979, 5695, 7854]),
 array([5800, 3548, 6540, ..., 3899,  831,   55]),
 array([5256, 7181, 3409, ..., 5286, 7535, 1335])]

In [9]:
# No column sampling was requested (max_features was not set), so each tree
# sees all 10 features.
bag.estimators_features_[0].shape

(10,)

### 2.   Bagging using SVM

In [10]:
# Same bagging setup, but with 50 support-vector classifiers as base learners.
# The base learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs on both.
bag=BaggingClassifier(
            SVC(),
            n_estimators=50,
            max_samples=0.25,
            bootstrap=True,  # sampling with replacement 
            random_state=42)

bag.fit(X_train,y_train)
y_predict=bag.predict(X_test)
print("Accuracy score -",accuracy_score(y_test,y_predict))

Accuracy score - 0.882


### 3.  Pasting

In [11]:
# Pasting: like bagging, but each tree's 25% row sample is drawn WITHOUT
# replacement (bootstrap=False).
# The base learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs on both.
bag=BaggingClassifier(
            DecisionTreeClassifier(),
            n_estimators=50,
            max_samples=0.25,
            bootstrap=False,  # Sampling without replacement
            random_state=42,
            verbose=1,  # To provide info about the training
            n_jobs=-1)  # Use all available cores for fit and predict

bag.fit(X_train,y_train)
y_predict=bag.predict(X_test)
print("Accuracy score -",accuracy_score(y_test,y_predict))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy score - 0.9085


[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    1.3s remaining:    4.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.0s finished


### 4.   Random Subspaces

In [12]:

# Random subspaces: no row sampling (every tree gets all rows, max_samples=1.0
# without replacement), only column sampling (max_features=0.5).

# The base learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs on both.
bag=BaggingClassifier(
            DecisionTreeClassifier(),
            n_estimators=500,
            max_samples=1.0,
            bootstrap=False,  # Sampling without replacement
            random_state=42,
            max_features=0.5, # Column sampling 
            n_jobs=-1)  # Use all available cores for fit and predict

bag.fit(X_train,y_train)
y_predict=bag.predict(X_test)
print("Accuracy score -",accuracy_score(y_test,y_predict))

Accuracy score - 0.908


In [13]:
# Number of features given to each tree:
# 10 features in total, max_features=0.5 -> 5 features per tree.

bag.estimators_features_[0].shape

(5,)

### 5.  Random Patches

In [14]:

# Random patches: both row sampling (max_samples=0.25, with replacement)
# and column sampling (max_features=0.5).

# The base learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs on both.
bag=BaggingClassifier(
            DecisionTreeClassifier(),
            n_estimators=500,
            max_samples=0.25,
            bootstrap=True,  # Sampling with replacement
            random_state=42,
            max_features=0.5, # Column sampling 
            n_jobs=-1)  # Use all available cores for fit and predict

bag.fit(X_train,y_train)
y_predict=bag.predict(X_test)
print("Accuracy score -",accuracy_score(y_test,y_predict))

Accuracy score - 0.896


### 6.  OOB - Out of Bag Sampling 



- When rows are sampled randomly with replacement, some rows are never selected, while other rows are selected multiple times.


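- On average, a bootstrap sample the same size as the training set contains only about 63% of the distinct rows. The remaining ~37% that a given estimator never sees are called its out-of-bag (OOB) samples.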



In [15]:

# oob_score=True: after fitting, each training row is scored using only the
# trees whose bootstrap sample did not contain it. The resulting accuracy is
# stored in bag.oob_score_ and serves as a validation estimate that needs no
# separate hold-out set. It requires bootstrap=True.

# The base learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs on both.
bag=BaggingClassifier(
            DecisionTreeClassifier(),
            n_estimators=500,
            max_samples=0.25,
            bootstrap=True,  # Sampling with replacement
            random_state=42,
            oob_score=True)

bag.fit(X_train,y_train)
y_predict=bag.predict(X_test)
print("Accuracy score -",accuracy_score(y_test,y_predict))

Accuracy score - 0.9085


In [16]:
# Accuracy estimated on the out-of-bag training rows; compare with the test accuracy above.
bag.oob_score_

0.9075

### Bagging Tips 

- Bagging generally gives better results than Pasting.


- Good results come around 25% to 50% row sampling mark.


- Random patches and subspaces should be used while dealing with high dimensional data


- To find good hyperparameter values we can use GridSearchCV or RandomizedSearchCV.


### Grid Search CV

In [17]:
# Hyperparameter grid for BaggingClassifier:
# 3 x 3 x 2 x 3 = 54 candidate combinations.
parameters={
    'n_estimators':[50,100,500],   # ensemble size
    'max_samples':[0.1,0.4,1.0],   # fraction of rows per estimator
    'bootstrap':[True,False],      # with / without replacement
    'max_features':[0.1,0.4,1.0]   # fraction of columns per estimator
    }

In [None]:
# Exhaustive search over the `parameters` grid with 10-fold cross-validation.
# random_state pins the bagging randomness so the selected parameters are
# reproducible; n_jobs=-1 spreads the 54 candidates x 10 folds across all
# cores, since this is by far the most expensive cell in the notebook.
search=GridSearchCV(BaggingClassifier(random_state=42),param_grid=parameters,
                    cv=10,n_jobs=-1)
search.fit(X_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
search.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination.
# The attribute is `best_score_` (singular); `best_scores_` does not exist on
# GridSearchCV and raises AttributeError.
search.best_score_

### THE END 