In [2]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

In [10]:
X,y = make_classification(n_samples=10000, n_features=10,n_informative=3)

In [12]:
X

array([[-0.13632931, -1.11984779, -1.32921283, ..., -0.34135509,
         0.22686666,  0.34120334],
       [ 0.80882663,  0.24515986, -0.26107378, ..., -0.01964765,
        -1.74254124,  0.5472156 ],
       [ 0.49762249, -1.08912592,  1.30501737, ..., -0.56698907,
        -0.33048243,  1.22192614],
       ...,
       [ 1.45164249,  0.48777916,  0.82408332, ..., -0.60011389,
        -0.821151  ,  0.88168252],
       [-1.22571025,  0.63882085,  1.84273179, ..., -0.64333226,
        -2.75698048,  2.55413603],
       [-0.56300798,  1.59181227,  1.8170343 , ..., -1.11987059,
        -1.58924018,  0.72433876]])

In [13]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
dt=DecisionTreeClassifier(random_state=42)

In [15]:
dt.fit(X_train,y_train)

DecisionTreeClassifier(random_state=42)

In [16]:
y_pred=dt.predict(X_test)

In [17]:
print("Decision Tree accurracy",accuracy_score(y_pred,y_test))

Decision Tree accurracy 0.85


# Bagging Technique

In [32]:
bag=BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.5,
    bootstrap=True,
    random_state=42
)

In [33]:
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=500, random_state=42)

In [34]:
y_pred1=bag.predict(X_test)

In [35]:
print("Bagging accurracy",accuracy_score(y_pred1,y_test))

Bagging accurracy 0.8965


In [36]:
bag.estimators_samples_[0].shape

(4000,)

In [37]:
bag.estimators_features_[0].shape

(10,)

# Bagging using SVC

In [40]:
bag1=BaggingClassifier(
    base_estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42
)

In [41]:
bag1.fit(X_train,y_train)

BaggingClassifier(base_estimator=SVC(), max_samples=0.25, n_estimators=500,
                  random_state=42)

In [42]:
y_pred2=bag1.predict(X_test)

In [43]:
print("Bagging accurracy",accuracy_score(y_pred2,y_test))

Bagging accurracy 0.8765


# Pasting

In [45]:
bag2=BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=False,
    random_state=42,
    verbose=1,
    n_jobs=1
    
)

In [46]:
bag2.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s finished


BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.25, n_estimators=500, n_jobs=1, random_state=42,
                  verbose=1)

In [47]:
y_pred3=bag2.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [48]:
print("Bagging accurracy",accuracy_score(y_pred3,y_test))

Bagging accurracy 0.8905


# Random Subspaces

In [50]:
bag3=BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1,
    bootstrap=False,
    random_state=42,
    max_features=0.5,
    n_jobs=1
    
)

In [51]:
bag3.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=0.5, max_samples=1, n_estimators=500, n_jobs=1,
                  random_state=42)

In [52]:
y_pred4=bag2.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [53]:
print("Bagging accurracy",accuracy_score(y_pred4,y_test))

Bagging accurracy 0.8905


In [55]:
bag3.estimators_features_[0]

array([6, 7, 5, 8, 3])

# OOB Score

In [56]:
bag5=BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=.25,
    bootstrap=True,
    random_state=42,
    oob_score=True,
    n_jobs=1
    )

In [57]:
bag5.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, n_jobs=1, oob_score=True, random_state=42)

In [58]:
bag5.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, n_jobs=1, oob_score=True, random_state=42)

In [59]:
y_pred5=bag5.predict(X_test)

In [60]:
print("Bagging accurracy",accuracy_score(y_pred5,y_test))

Bagging accurracy 0.8895


In [61]:
bag5.oob_score_

0.896875

# Bagging 

Bagging generally gives better result than Pasting
Good result comes around 25% to 50% row sampling
Random patches  and subspaces should be used while dealing with high dimensional data
To find the correct hyperparameter values we can do Grid Search CV

In [62]:
from sklearn.model_selection import GridSearchCV

In [63]:
parameters = {
    'n_estimators': [50,100,500], 
    'max_samples': [0.1,0.4,0.7,1.0],
    'bootstrap' : [True,False],
    'max_features' : [0.1,0.4,0.7,1.0]
    }

In [67]:
search=GridSearchCV(BaggingClassifier(),parameters,cv=5)

In [68]:
search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=BaggingClassifier(),
             param_grid={'bootstrap': [True, False],
                         'max_features': [0.1, 0.4, 0.7, 1.0],
                         'max_samples': [0.1, 0.4, 0.7, 1.0],
                         'n_estimators': [50, 100, 500]})