# Ensemble Learning and Random Forests

## Voting Classifiers

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# Three different base learners that will be combined into one ensemble.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
# probability=True so that the SVC can take part in soft voting.
svm_clf = SVC(probability=True)

# Voting ensemble over the three classifiers above, using soft voting.
voting_clf = VotingClassifier(
    estimators=[
        ("lr", log_clf),
        ("rf", rnd_clf),
        ("svc", svm_clf),
    ],
    voting="soft",
)

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# iris = datasets.load_iris()
# X = iris["data"][:,(2,3)]
# y = (iris["target"]==2)

from sklearn.datasets import make_moons

# Synthetic two-moons dataset; the fixed random_state values make both the
# generated data and the train/test split repeatable across runs.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

In [3]:
from sklearn.metrics import accuracy_score

# Fit each base classifier, and finally the voting ensemble, on the training
# split and print its accuracy on the test split (one line per model, in the
# order listed in the tuple). The loop fits voting_clf itself, so no separate
# up-front voting_clf.fit(...) call is needed.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(accuracy_score(y_test, y_pred))

0.864
0.896
0.888
0.904


## Bagging and Pasting

- 为了得到一系列不同的分类器，一种做法是使用不同的分类算法，另一种是使用不同的训练子集
- 抽取子集中，有放回抽样称为 bagging ，无放回称为 pasting

### Bagging and Pasting in Scikit-Learn

In [4]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees; each tree is trained on 100
# training instances sampled with bootstrap=True.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
# fit() returns the fitted estimator, so the prediction can be chained.
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, y_pred)

0.92000000000000004

### Out-of-Bag Evaluation

In [5]:
# Same bagging setup as before, with oob_score=True so that an out-of-bag
# score is computed during fitting.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=100, n_estimators=500, n_jobs=-1, oob_score=True,
         random_state=None, verbose=0, warm_start=False)

In [6]:
# Out-of-bag score of the bagging ensemble (available because it was
# constructed with oob_score=True).
bag_clf.oob_score_

0.92266666666666663

In [7]:
# Out-of-bag decision function of the fitted ensemble; the output below shows
# one row of two class scores per entry.
bag_clf.oob_decision_function_

array([[ 0.32901554,  0.67098446],
       [ 0.36314363,  0.63685637],
       [ 1.        ,  0.        ],
       [ 0.00540541,  0.99459459],
       [ 0.00795756,  0.99204244],
       [ 0.11253197,  0.88746803],
       [ 0.3655914 ,  0.6344086 ],
       [ 0.0835443 ,  0.9164557 ],
       [ 0.95115681,  0.04884319],
       [ 0.80901857,  0.19098143],
       [ 0.56657963,  0.43342037],
       [ 0.05583756,  0.94416244],
       [ 0.75      ,  0.25      ],
       [ 0.83739837,  0.16260163],
       [ 0.92287234,  0.07712766],
       [ 0.1       ,  0.9       ],
       [ 0.0483871 ,  0.9516129 ],
       [ 0.91232877,  0.08767123],
       [ 0.65364583,  0.34635417],
       [ 0.95348837,  0.04651163],
       [ 0.06015038,  0.93984962],
       [ 0.23469388,  0.76530612],
       [ 0.91002571,  0.08997429],
       [ 0.98691099,  0.01308901],
       [ 0.9507772 ,  0.0492228 ],
       [ 0.00255102,  0.99744898],
       [ 0.96883117,  0.03116883],
       [ 1.        ,  0.        ],
       [ 0.02872063,

## Random Patches and Random Subspaces

## Random Forests

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to at most 16 leaf nodes.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
# fit() returns the fitted estimator, so the prediction can be chained.
y_pred_rf = rnd_clf.fit(X_train, y_train).predict(X_test)

In [9]:
# Test-set accuracy of the random forest predictions (y_pred_rf).
accuracy_score(y_test, y_pred_rf)

0.92000000000000004

### Extra-Trees
### Feature Importance

In [10]:
from sklearn.datasets import load_iris
# Train a 500-tree random forest on the full iris dataset and print the
# importance score the forest assigns to each feature.
iris = load_iris()
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
)
rnd_clf.fit(iris.data, iris.target)
# One line per feature: "<feature name> <importance>".
for name, score in zip(iris.feature_names, rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.0927775993862
sepal width (cm) 0.0222623951091
petal length (cm) 0.42929411052
petal width (cm) 0.455665894985


## Boosting

### AdaBoost

In [12]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 200 depth-1 decision trees, with a learning rate of 0.5
# and the SAMME.R algorithm variant.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None)

### Gradient Boosting

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

# Tiny gradient-boosted ensemble: 3 trees of depth 2 with learning_rate=1.0,
# fitted on the full moons dataset (X, y).
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out a validation set. NOTE: this rebinds X_train / y_train, replacing
# the earlier train/test split for every cell that runs after this one.
X_train, X_val, y_train, y_val = train_test_split(X, y)

# Train a deliberately large ensemble first.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

# Validation MSE after each boosting stage: errors[i] is the error of the
# ensemble made of the first i + 1 trees.
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]

# np.argmin returns a 0-based stage index, so the matching number of trees is
# index + 1. Without the +1 the retrained model would have one tree too few,
# and n_estimators would be 0 (invalid) whenever the first stage is the best.
best_n_estimators = int(np.argmin(errors)) + 1

# Retrain with the best ensemble size; kept as the cell's last expression so
# the fitted estimator is displayed.
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=49, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [19]:
# Early stopping by growing the ensemble one tree at a time.
# warm_start=True is set so that repeated fit() calls with a larger
# n_estimators can build on the trees that were already trained.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
# Number of consecutive iterations without a new best validation error.
# (Originally misspelled as `error_going_uo`, which left this counter
# undefined before the loop.)
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        # New best: remember it and reset the patience counter.
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        # Stop after 5 iterations in a row without improvement.
        if error_going_up == 5:
            break

## Stacking