## Ensemble Methods With Sklearn

Ensemble SKlearn

### Content

- [Packages](#pack)
- [Voting Hard And Soft](#sklearn1)
- [Bagging and Pasting](#sklearn2)
- [RandomForest](#sklearn3)
- [Extra Trees](#sklearn4)
- [Adaptive Boosting](#sklearn5)
- [Gradient Boosting !!!](#sklearn6)
- [Stacking](#sklearn7)

## <a id = 'pack'>Packages </a>

In [188]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_absolute_error, mean_squared_error
from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor, AdaBoostClassifier,AdaBoostRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, GradientBoostingClassifier, GradientBoostingRegressor

## <a id = 'sklearn1'>  Hard Voting and Soft Voting </a>

In [2]:
data, target = load_breast_cancer().data, load_breast_cancer().target

In [18]:
data2, target2 = load_diabetes().data, load_diabetes().target

In [3]:
data.shape, target.shape

((569, 30), (569,))

In [4]:
pd.DataFrame(data).isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2)

In [6]:
log_clf = LogisticRegression(C = 1, solver = 'liblinear')
svc_clf = SVC(C=1, kernel= 'linear', gamma = 'auto', probability = True)
naive_clf = GaussianNB()

In [7]:
for clf in (log_clf,svc_clf,naive_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test,y_pred))

LogisticRegression 0.956140350877193
SVC 0.9649122807017544
GaussianNB 0.9912280701754386


In [8]:
clf_vote = VotingClassifier(estimators= [('lr',log_clf),
                                         ('svc',svc_clf),
                                          ('naive',naive_clf)],
                voting='hard',
                weights=None,
                n_jobs=5,
                flatten_transform=True,
                verbose=True,)
clf_vote.fit(X_train,y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1, solver='liblinear')),
                             ('svc',
                              SVC(C=1, gamma='auto', kernel='linear',
                                  probability=True)),
                             ('naive', GaussianNB())],
                 n_jobs=5, verbose=True)

In [9]:
clf_vote.estimators

[('lr', LogisticRegression(C=1, solver='liblinear')),
 ('svc', SVC(C=1, gamma='auto', kernel='linear', probability=True)),
 ('naive', GaussianNB())]

In [10]:
accuracy_score(y_test,clf_vote.predict(X_test))

0.9649122807017544

In [11]:
clf_vote = VotingClassifier(estimators= [('lr',log_clf),
                                         ('svc',svc_clf),
                                          ('naive',naive_clf)],
                voting='soft',
                weights=[0.4,0.3,0.3],
                n_jobs=5,
                flatten_transform=True,
                verbose=True,)
clf_vote.fit(X_train,y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1, solver='liblinear')),
                             ('svc',
                              SVC(C=1, gamma='auto', kernel='linear',
                                  probability=True)),
                             ('naive', GaussianNB())],
                 n_jobs=5, verbose=True, voting='soft',
                 weights=[0.4, 0.3, 0.3])

In [12]:
accuracy_score(y_test,clf_vote.predict(X_test))

0.9736842105263158

## <a id = 'sklearn2'>  Boosting and Pasting </a>

In [34]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(data2, target2, test_size = 0.2)

In [54]:
bag_reg = BaggingRegressor(
                            DecisionTreeRegressor(),
                            n_estimators = 1000,
                            bootstrap = False, #pasting if false
                            max_samples = 1,
                            bootstrap_features= True, #for features
                            max_features= 0.8,
                            n_jobs = 5,
                            oob_score = False
)
bag_reg.fit(X_train2,y_train2)

BaggingRegressor(base_estimator=DecisionTreeRegressor(), bootstrap=False,
                 bootstrap_features=True, max_features=0.8, max_samples=1,
                 n_estimators=1000, n_jobs=5)

In [55]:
mean_absolute_error(y_train2, bag_reg.predict(X_train2)),mean_absolute_error(y_test2, bag_reg.predict(X_test2))

(64.7991926345609, 68.92668539325842)

In [70]:
bag_clf = BaggingClassifier(
                            DecisionTreeClassifier(),
                            n_estimators = 1000,
                            bootstrap = True, #pasting if false
                            max_samples = 0.7,
                            bootstrap_features= True, #for features
                            max_features= 0.8,
                            n_jobs = 5,
                            oob_score = True
)
bag_clf.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=0.8, max_samples=0.7,
                  n_estimators=1000, n_jobs=5, oob_score=True)

In [71]:
accuracy_score(y_train, bag_clf.predict(X_train)),accuracy_score(y_test,bag_clf.predict(X_test))

(0.9978021978021978, 0.9736842105263158)

In [72]:
bag_clf.oob_score_

0.9582417582417583

## <a id = 'sklearn3'>  Random Forest </a>

In [74]:
rnd_cls = RandomForestClassifier(n_estimators= 600, max_leaf_nodes= 12, n_jobs= 5)
rnd_cls.fit(X_train,y_train)

RandomForestClassifier(max_leaf_nodes=12, n_estimators=600, n_jobs=5)

In [75]:
accuracy_score(y_train, rnd_cls.predict(X_train)),accuracy_score(y_test,rnd_cls.predict(X_test))

(0.9912087912087912, 0.9736842105263158)

In [185]:
warm_random = RandomForestClassifier(max_leaf_nodes= 10, n_jobs= 4, warm_start= True)

In [186]:
min_error_thres = 0
error_in_row = 0

for n_estimators in range(1,1000):
    warm_random.n_estimators = n_estimators
    warm_random.fit(X_train,y_train)
    
    pred = warm_random.predict(X_test)
    error = accuracy_score(y_test,pred)
    
    print('Estimator', n_estimators)
    print('Acc',error)
    
    if error > min_error_thres:
        min_error_thres = error
        error_in_row = 0
    else:
        error_in_row += 1
        if error_in_row == 10:
            break

Estimator 1
Acc 0.9824561403508771
Estimator 2
Acc 0.9736842105263158
Estimator 3
Acc 0.9736842105263158
Estimator 4
Acc 0.9736842105263158
Estimator 5
Acc 0.9736842105263158
Estimator 6
Acc 0.9824561403508771
Estimator 7
Acc 0.9736842105263158
Estimator 8
Acc 0.9736842105263158
Estimator 9
Acc 0.9736842105263158
Estimator 10
Acc 0.9736842105263158
Estimator 11
Acc 0.9736842105263158


## <a id = 'sklearn4'> Extra Trees </a>

In [77]:
extra_reg = ExtraTreesRegressor(n_estimators=600, max_leaf_nodes= 12, n_jobs = 5)
extra_reg.fit(X_train2,y_train2)

ExtraTreesRegressor(max_leaf_nodes=12, n_estimators=600, n_jobs=5)

In [78]:
mean_absolute_error(y_train2, extra_reg.predict(X_train2)),mean_absolute_error(y_test2, extra_reg.predict(X_test2))

(41.29473328568838, 48.74414362872022)

In [85]:
extra_class = ExtraTreesClassifier(n_estimators=600, max_leaf_nodes= 12, n_jobs = 5)
extra_class.fit(X_train,y_train)

ExtraTreesClassifier(max_leaf_nodes=12, n_estimators=600, n_jobs=5)

In [86]:
accuracy_score(y_train, extra_class.predict(X_train)),accuracy_score(y_test,extra_class.predict(X_test))

(0.9736263736263736, 0.9824561403508771)

## <a id = 'sklearn5'> AdaBoost </a>

In [125]:
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth= 3),
                           n_estimators= 400,
                           learning_rate = 0.5)


ada_reg.fit(X_train2,y_train2)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=3),
                  learning_rate=0.5, n_estimators=400)

In [126]:
mean_absolute_error(y_train2, ada_reg.predict(X_train2)),mean_absolute_error(y_test2, ada_reg.predict(X_test2))

(40.56527190993425, 49.146343172747194)

In [127]:
ada_reg.estimator_weights_

array([0.61534854, 0.57370358, 0.52987145, 0.38452574, 0.5854403 ,
       0.37391868, 0.34610847, 0.38998908, 0.35827511, 0.33552915,
       0.19033028, 0.33757719, 0.3032106 , 0.25220635, 0.37965646,
       0.3602717 , 0.37781332, 0.31148735, 0.40914015, 0.15719728,
       0.42949303, 0.41209538, 0.24231872, 0.3988128 , 0.35924391,
       0.27105767, 0.52263581, 0.12556577, 0.16627611, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [132]:
ada_cls = AdaBoostClassifier(DecisionTreeClassifier(max_depth= 1),
                           n_estimators= 200,
                             algorithm= 'SAMME.R',
                           learning_rate = 0.5)


ada_cls.fit(X_train,y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [133]:
accuracy_score(y_train, ada_cls.predict(X_train)),accuracy_score(y_test,ada_cls.predict(X_test))

(1.0, 0.9912280701754386)

## <a id = 'sklearn6'> Gradient </a>

In [163]:
gb_reg = GradientBoostingRegressor(max_depth = 1, n_estimators= 50, learning_rate= 0.1)
gb_reg.fit(X_train2,y_train2)

GradientBoostingRegressor(max_depth=1, n_estimators=50)

In [164]:
mean_squared_error(y_train2, gb_reg.predict(X_train2)),mean_squared_error(y_test2, gb_reg.predict(X_test2))

(2683.1832538209114, 3267.6349505867233)

In [169]:
gbr = GradientBoostingRegressor(max_depth=1, warm_start= True)

In [170]:
min_val_error = float("inf")
error_increasing = 0

for n_estimators in range(1,1000):
    gbr.n_estimators = n_estimators
    gbr.fit(X_train2,y_train2)
    
    y_pred = gbr.predict(X_test2)
    val_error = mean_squared_error(y_test2,y_pred)
    
    print('No. of estimators:',gbr.n_estimators_)
    print('Validation error',val_error)
    
    if val_error < min_val_error:
        min_val_error = val_error
        error_increasing = 0
    else:
        error_increasing += 1
        if error_increasing == 10:
            break

No. of estimators: 1
Validation error 6120.301563637372
No. of estimators: 2
Validation error 5818.16174050051
No. of estimators: 3
Validation error 5570.086037896211
No. of estimators: 4
Validation error 5339.606026556624
No. of estimators: 5
Validation error 5148.374140750805
No. of estimators: 6
Validation error 4974.005085720683
No. of estimators: 7
Validation error 4821.540252525674
No. of estimators: 8
Validation error 4696.773148517174
No. of estimators: 9
Validation error 4569.567077650752
No. of estimators: 10
Validation error 4483.3671480196435
No. of estimators: 11
Validation error 4393.316343570109
No. of estimators: 12
Validation error 4303.877784543185
No. of estimators: 13
Validation error 4248.9312295592335
No. of estimators: 14
Validation error 4174.1830267214145
No. of estimators: 15
Validation error 4083.761029846394
No. of estimators: 16
Validation error 4023.75182788264
No. of estimators: 17
Validation error 3979.289294956679
No. of estimators: 18
Validation error 

## <a id = 'sklearn7'> Stacking </a>

In [206]:
x_train, x_hold_out, x_test = np.split(data, [int(.7*len(data)),int(.9*len(data))])  # 70 - 20 -10
y_train, y_hold_out, y_test = np.split(target, [int(.7*len(target)),int(.9*len(target))])

In [194]:
x_train.shape

(398, 30)

In [195]:
x_hold_out.shape

(114, 30)

In [197]:
x_test.shape

(57, 30)

In [198]:
data.shape

(569, 30)

In [207]:
clf1 = KNeighborsClassifier(n_neighbors= 10)
clf2 = RandomForestClassifier(n_estimators = 50)
clf3 = GaussianNB()

In [209]:
for clf in (clf1,clf2,clf3):
    clf.fit(x_train,y_train)

In [211]:
def get_predictions(x,y):
    pred_results = pd.DataFrame()
    
    i = 1
    for clf in (clf1,clf2,clf3):
        y_pred = clf.predict(x)
        
        print(clf.__class__.__name__, accuracy_score(y,y_pred))
        
        pred_results.insert(i-1, 'y_pred_' + str(i), y_pred)
        
        i +=1
        
    return pred_results


In [212]:
pred_result = get_predictions(x_hold_out,y_hold_out)

KNeighborsClassifier 0.9298245614035088
RandomForestClassifier 0.9736842105263158
GaussianNB 0.9736842105263158


In [213]:
pred_result.head()

Unnamed: 0,y_pred_1,y_pred_2,y_pred_3
0,1,1,1
1,1,1,1
2,0,0,0
3,1,1,1
4,1,1,1


In [214]:
x_stack_train = pred_result
y_stack_train = y_hold_out

In [215]:
clf_stack = LogisticRegression(solver = 'lbfgs', C = 1, max_iter = 200)

clf_stack.fit(x_stack_train, y_stack_train)

LogisticRegression(C=1, max_iter=200)

In [216]:
pred_results_test = get_predictions(x_test,y_test)

KNeighborsClassifier 0.9649122807017544
RandomForestClassifier 0.9824561403508771
GaussianNB 0.9473684210526315


In [218]:
pred_results_test.head()

Unnamed: 0,y_pred_1,y_pred_2,y_pred_3
0,0,0,0
1,1,1,1
2,0,0,1
3,1,1,1
4,0,0,0


In [219]:
x_stack_test = pred_results_test
y_stack_pred = clf_stack.predict(x_stack_test)

In [220]:
accuracy_score(y_test,y_stack_pred)

0.9473684210526315