# Ensemble Learning


In [1]:
# Preparing the data
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Noisy two-moons toy dataset; both the generator and the split are seeded so
# the same samples and the same train/test partition come back on every run.
X, y = make_moons(
    n_samples=1000,
    noise=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [2]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different base learners combined by hard voting: the ensemble
# predicts the class chosen by the majority of its members.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svm', svm_clf),
    ],
    voting='hard',
)

# Left as the final expression so the fitted estimator is displayed.
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svm', SVC())])

In [4]:
from sklearn.metrics import accuracy_score
# Compare each base classifier with the hard-voting ensemble on the test set.
# Every estimator is (re)fitted here: VotingClassifier trains clones of its
# members, so fitting voting_clf alone leaves log_clf/rnd_clf/svm_clf unfitted.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # accuracy_score's signature is (y_true, y_pred): ground truth goes first.
    # Accuracy is symmetric, so the printed values are unaffected.
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.852
RandomForestClassifier 0.916
SVC 0.916
VotingClassifier 0.912


In [7]:
# Soft voting combines the members' predicted class probabilities, so every
# member must expose predict_proba; SVC only does so with probability=True.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

voting_soft_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svm', svm_clf),
    ],
    voting='soft',
)

# Left as the final expression so the fitted estimator is displayed.
voting_soft_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('svm', SVC(probability=True))],
                 voting='soft')

In [9]:
# Test-set accuracy of the soft-voting ensemble.
y_pred = voting_soft_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.92

# Bagging & Pasting
Ensemble learning techniques that train multiple models with the same training algorithm, each on a different random sample of the training dataset.
<br>
If the sampling is done with replacement (bootstrapping), it is called *Bagging*; if it is done without replacement, it is called *Pasting*.

In [10]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees. Each tree is fitted on 100 training
# instances drawn with replacement (bootstrap=True); n_jobs=-1 uses all cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [11]:
# Same bagging setup, but with oob_score=True so that each tree is also
# evaluated on the training instances left out of its bootstrap sample.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)

# Left as the final expression so the fitted estimator is displayed.
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1, oob_score=True)

In [12]:
# Out-of-bag score; available because bag_clf was built with oob_score=True.
bag_clf.oob_score_

0.9093333333333333

In [13]:
# Test-set accuracy of the bagging ensemble, for comparison with its
# out-of-bag score.
y_pred = bag_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# is unchanged, but the order now matches the documented signature.
accuracy_score(y_test, y_pred)

0.924

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to 16 leaf nodes, trained on all
# available cores (n_jobs=-1).
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [17]:
# Per-feature importances of the forest fitted on the moons training set.
rnd_clf.feature_importances_

array([0.42821343, 0.57178657])

In [19]:
from sklearn.datasets import load_iris
# Fit a 500-tree forest on the full iris dataset and print each feature's
# importance as a percentage.
# NOTE: this rebinds rnd_clf, replacing the forest trained on the moons data.
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1).fit(
    iris.data, iris.target
)

for name, score in zip(iris.feature_names, rnd_clf.feature_importances_):
    print(name, 100 * score)

sepal length (cm) 9.45477591662616
sepal width (cm) 2.440994057251642
petal length (cm) 42.533873116921214
petal width (cm) 45.57035690920099


# AdaBoost

In [20]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over 200 decision stumps (depth-1 trees) with a 0.5 learning rate.
# The explicit algorithm="SAMME.R" argument is dropped: it was already the
# default when this notebook was run (the recorded repr omits it), and newer
# scikit-learn releases deprecate and then reject that value.
# NOTE(review): on releases where SAMME.R no longer exists the default
# algorithm differs, so scores there may not match the original run.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)