In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

In [14]:
moons = make_moons(n_samples=1000, noise=0.4)

In [15]:
X = moons[0]
y = moons[1]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [17]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

### Hard Voting

In [18]:
voting_hard_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting="hard"
)

In [19]:
voting_hard_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
for clf in (log_clf, rnd_clf, svm_clf, voting_hard_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.85
RandomForestClassifier 0.835
SVC 0.9
VotingClassifier 0.885


### Soft Voting 

In [22]:
svm_prob_clf = SVC(probability=True)

In [23]:
voting_soft_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_prob_clf)],
    voting="soft",
)

In [24]:
voting_soft_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('svc', SVC(probability=True))],
                 voting='soft')

In [25]:
for clf in (log_clf, rnd_clf, svm_clf, voting_hard_clf, voting_soft_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.85
RandomForestClassifier 0.83
SVC 0.9
VotingClassifier 0.88
VotingClassifier 0.875


In [26]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [27]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=1
)

In [28]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=1)

In [29]:
y_pred = bag_clf.predict(X_test)

In [30]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True
)

In [31]:
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.84125

In [32]:
from sklearn.metrics import accuracy_score

In [33]:
y_pred = bag_clf.predict(X_test)

In [34]:
accuracy_score(y_test, y_pred)

0.83

In [35]:
bag_clf.oob_decision_function_

array([[0.61797753, 0.38202247],
       [1.        , 0.        ],
       [1.        , 0.        ],
       ...,
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.03592814, 0.96407186]])

# Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=-1)

In [39]:
y_pred_rf = rnd_clf.predict(X_test)

In [40]:
accuracy_score(y_test, y_pred_rf)

0.89

In [43]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter='random', max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1
)

In [44]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_leaf_nodes=16,
                                                        splitter='random'),
                  n_estimators=500, n_jobs=-1)

In [45]:
bg_pred = bag_clf.predict(X_test)
accuracy_score(y_test, bg_pred)

0.9

## Feature Importances    

In [46]:
from sklearn.datasets import load_iris

In [47]:
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10452952160803752
sepal width (cm) 0.02448962088119502
petal length (cm) 0.44201516248972295
petal width (cm) 0.4289656950210446


# Boosting

## AdaBoost

In [51]:
from sklearn.ensemble import AdaBoostClassifier

In [49]:
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [50]:
ada_pred = ada_clf.predict(X_test)
accuracy_score(y_test, ada_pred)

0.885

## Gradient Boosting

In [59]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

In [61]:
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [62]:
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2)

In [63]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2)

In [64]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2)

In [66]:
y_pred = sum(tree.predict(np.array([[0.8]])) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred

array([0.75026781])

In [67]:
from sklearn.ensemble import GradientBoostingRegressor

In [68]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

### Early Stopping

In [70]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [71]:
X_train, X_val, y_train, y_val = train_test_split(X, y)

In [72]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [76]:
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1
bst_n_estimators

85

In [77]:
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=85)

#### Warm Start

Warm Start performs incremental learning, i.e it will keep the previous Decision Trees that have been fit

In [91]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

In [92]:
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    print("Validation Error:", val_error)
    if val_error < min_val_error:
        min_val_error = val_error
        print("Previous Minimum Validation Error:", min_val_error)
        print("Minimum Validation Error:", min_val_error, "\n")
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break

Validation Error: 0.04853645651402395
Previous Minimum Validation Error: 0.04853645651402395
Minimum Validation Error: 0.04853645651402395 

Validation Error: 0.040640268941000264
Previous Minimum Validation Error: 0.040640268941000264
Minimum Validation Error: 0.040640268941000264 

Validation Error: 0.034251627375732265
Previous Minimum Validation Error: 0.034251627375732265
Minimum Validation Error: 0.034251627375732265 

Validation Error: 0.030860253423403018
Previous Minimum Validation Error: 0.030860253423403018
Minimum Validation Error: 0.030860253423403018 

Validation Error: 0.026963941731481037
Previous Minimum Validation Error: 0.026963941731481037
Minimum Validation Error: 0.026963941731481037 

Validation Error: 0.023139718761101085
Previous Minimum Validation Error: 0.023139718761101085
Minimum Validation Error: 0.023139718761101085 

Validation Error: 0.02118892032500619
Previous Minimum Validation Error: 0.02118892032500619
Minimum Validation Error: 0.02118892032500619 

## XgBoost

In [97]:
import xgboost
from sklearn.metrics import mean_squared_error

In [98]:
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
np.sqrt(mean_squared_error(y_val, y_pred))

0.06070300146020461

In [100]:
xgb_reg.fit(X_train, y_train,
            eval_set=[(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.22055
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.16547
[2]	validation_0-rmse:0.12243
[3]	validation_0-rmse:0.10044
[4]	validation_0-rmse:0.08467
[5]	validation_0-rmse:0.07344
[6]	validation_0-rmse:0.06728
[7]	validation_0-rmse:0.06383
[8]	validation_0-rmse:0.06125
[9]	validation_0-rmse:0.05959
[10]	validation_0-rmse:0.05902
[11]	validation_0-rmse:0.05852
[12]	validation_0-rmse:0.05844
[13]	validation_0-rmse:0.05801
[14]	validation_0-rmse:0.05747
[15]	validation_0-rmse:0.05772
[16]	validation_0-rmse:0.05778
Stopping. Best iteration:
[14]	validation_0-rmse:0.05747



# Excercise

## Ans.8

In [103]:
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

In [104]:
mnist = fetch_openml('mnist_784', version=1, cache=True)

In [105]:
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [107]:
X_train, X_val, y_train, y_val = train_test_split(mnist['data'][:60000], mnist['target'][:60000], test_size=0.2)
X_test, y_test = mnist['data'][60000:], mnist['target'][60000:]

In [113]:
for data_type, data in zip(("Train", "Validation", "Test"), ((X_train, y_train), (X_val, y_val), (X_test, y_test))):
    print(f"--{data_type}--")
    print("Data:", data[0].shape)
    print("Target:", data[1].shape, "\n")

--Train--
Data: (48000, 784)
Target: (48000,) 

--Validation--
Data: (12000, 784)
Target: (12000,) 

--Test--
Data: (10000, 784)
Target: (10000,) 



In [116]:
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import VotingClassifier

In [122]:
randf_clf = RandomForestClassifier(n_estimators=100)
extrat_clf = ExtraTreesClassifier(n_estimators=100)
svm_clf = OneVsRestClassifier(SVC(probability=True)) 

In [125]:
voting_ensemble = VotingClassifier(
                            estimators=[('rf', randf_clf), ('et', extrat_clf), ('svc', svm_clf)],
                            voting="soft",
)

In [126]:
voting_ensemble.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier()),
                             ('et', ExtraTreesClassifier()),
                             ('svc',
                              OneVsRestClassifier(estimator=SVC(probability=True)))],
                 voting='soft')