# CH7 Ensemble Learning and Random Forests


#### Voting


In [1]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Build a noisy two-moons toy dataset and hold out a test split.
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Three different model families, each seeded for reproducibility, combined
# into a single voting ensemble.
base_estimators = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVC', SVC(random_state=42)),
]
voting_clf = VotingClassifier(estimators=base_estimators)

# Fitting trains a clone of every base estimator on the training split.
voting_clf.fit(X_train, y_train)

In [3]:
# The (name, estimator) pairs exactly as they were passed to the constructor.
voting_clf.estimators

[('Logistic Regression', LogisticRegression(random_state=42)),
 ('Random Forest', RandomForestClassifier(random_state=42)),
 ('SVC', SVC(random_state=42))]

In [19]:
# `voting_clf.estimators` holds the original estimator objects handed to the
# constructor; only their clones were fitted, so scoring one of the originals
# is expected to fail with NotFittedError.
try:
    voting_clf.estimators[0][1].score(X_test, y_test)
except NotFittedError:
    # Catch only the expected failure: a bare `except:` would also hide
    # unrelated problems (e.g. a NameError) behind this message.
    print("NotFittedError: This LogisticRegression instance is not fitted yet.\n Call 'fit' with appropriate arguments before using this estimator.")

NotFittedError: This LogisticRegression instance is not fitted yet.
 Call 'fit' with appropriate arguments before using this estimator.


In [21]:
# `estimators_` (trailing underscore) holds the fitted clones, so scoring works here.
voting_clf.estimators_[0].score(X_test, y_test)

0.864

> Notice the difference between the clones (fitted estimators), which are in **clf.estimators\_**, and the originals (not fitted), which are in **clf.estimators**.


Let's score each individual estimator.


In [22]:
# Mapping from each estimator's name to its fitted clone.
voting_clf.named_estimators_

{'Logistic Regression': LogisticRegression(random_state=42),
 'Random Forest': RandomForestClassifier(random_state=42),
 'SVC': SVC(random_state=42)}

In [27]:
# Report the test-set score of every fitted base estimator, one line per model.
for name, clf in voting_clf.named_estimators_.items():
    test_score = clf.score(X_test, y_test)
    print(name, 'Score = ', test_score)

Logistic Regression Score =  0.864
Random Forest Score =  0.896
SVC Score =  0.896


In [30]:
# Ensemble prediction for the first test sample only; `[[0]]` keeps the input 2-D.
voting_clf.predict(X_test[[0]])

array([1], dtype=int64)

In [33]:
# Show how each fitted base estimator votes on the first test sample.
first_sample = X_test[[0]]
for name, clf in voting_clf.named_estimators_.items():
    print(name, 'predicts:  ', *clf.predict(first_sample))

Logistic Regression predicts:   1
Random Forest predicts:   1
SVC predicts:   0


voting score:


In [34]:
# Score of the whole voting ensemble on the held-out test split.
voting_clf.score(X_test, y_test)

0.912

> Notice that the default voting type of the voting classifier is <font color='orange'>hard voting</font>. Let's try <font color='orange'>soft voting</font>, which we expect to **outperform** hard voting.

The main reason behind this assumption is that soft voting gives more weight to confident predictions than to uncertain ones. Example: with predicted probabilities for class 1 of .90, .40 and .49, hard voting turns them into the votes 1, 0, 0 and predicts class 0, whereas soft voting averages them, (.90 + .40 + .49) / 3 ≈ .60, and predicts class 1.


In [35]:
# Switch the existing ensemble from hard voting to soft voting.
# NOTE: this mutates `voting_clf` in place, so the hard-voting results shown
# above can only be reproduced by re-running the earlier cells.
voting_clf.voting = 'soft'
# Soft voting needs class probabilities from every member; SVC only exposes
# them when probability=True, so enable it on the original (unfitted) SVC.
voting_clf.named_estimators['SVC'].probability = True
# Refit so the new settings are applied to freshly trained clones.
voting_clf.fit(X_train, y_train)

In [36]:
# Test-set score after refitting with soft voting.
voting_clf.score(X_test, y_test)

0.92

wow!


#### Bagging and Pasting


In [37]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [41]:
# Bagging: an ensemble of 500 decision trees, each trained on 100 instances
# sampled from the training set (bootstrap is left at its default).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    n_jobs=-1,
    random_state=42,
)

bag_clf.fit(X_train, y_train)

In [42]:
# Score of the bagging ensemble on the held-out test split.
bag_clf.score(X_test, y_test)

0.904

##### Out Of Bag Evaluation


In [49]:
# Same bagging configuration as before, with out-of-bag scoring switched on
# so the ensemble can be evaluated without a separate validation set.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    n_jobs=-1,
    random_state=42,
    oob_score=True,
)

In [50]:
# Train the OOB-enabled ensemble; the fitted attributes read in the next cells come from this fit.
bag_clf.fit(X_train, y_train)

In [51]:
# Score estimated from out-of-bag instances, i.e. training samples a given tree never saw.
bag_clf.oob_score_

0.9253333333333333

In [54]:
# Out-of-bag class probabilities for the first five training instances (one column per class).
bag_clf.oob_decision_function_[:5]

array([[0.35579515, 0.64420485],
       [0.43513514, 0.56486486],
       [1.        , 0.        ],
       [0.01030928, 0.98969072],
       [0.03174603, 0.96825397]])

##### Pasting

In [57]:
# Pasting: the same ensemble setup, but each tree's 100 training instances are
# drawn without replacement.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    n_jobs=-1,
    random_state=42,
    bootstrap=False,  # without replacement
)

In [58]:
# Train the pasting ensemble on the training split.
bag_clf.fit(X_train, y_train)

In [59]:
# Score of the pasting ensemble on the held-out test split.
bag_clf.score(X_test, y_test)

0.92

---
