# Generate Moon data:
In this assignment, we generate a synthetic dataset with scikit-learn's `make_moons` and apply a random forest to it to get hands-on experience with ensemble learning.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
import numpy as np

# Build a noisy two-moons toy dataset of 500 points; the fixed seed keeps it reproducible
X, y = make_moons(
    n_samples=500,
    noise=0.4,
    random_state=42,
)

# Hold out a test set. No test_size is passed, so scikit-learn's default
# split proportion applies; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Create a Forest of trees: Two ways

1- Use `BaggingClassifier` and manually define a random forest by setting the number of estimators, setting the number of samples per tree equal to the size of the training set, and turning bootstrap on

2- Use RandomForestClassifier model itself

### Create a forest with 500 trees, max number of leaves = 16, and max training size for each tree equal to the size of the training data, randomly sampling instances with replacement

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Hand-built forest: 500 bagged decision trees, each limited to 16 leaves.
# max_samples=1.0 together with bootstrap=True means every tree is fitted on a
# with-replacement sample as large as the training set; n_jobs=-1 uses all cores.
# The base estimator is passed positionally, exactly as before.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(
        splitter="random",  # choose the best among randomly drawn splits
        max_leaf_nodes=16,
        random_state=42,
    ),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Train the bagging ensemble, then predict the held-out instances.
# fit() returns the fitted estimator itself, so the two calls can be chained.
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Report accuracy on the data the ensemble was fitted on and on the held-out
# split, so the gap between the two is visible side by side.
print(
    "Accuracy of bagging classifier on train dataset is:",
    accuracy_score(y_true=y_train, y_pred=bag_clf.predict(X_train)),
)
print(
    "Accuracy of bagging classifier on test dataset is:",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)

 

Accuracy of bagging classifier on train dataset is: 0.88
Accuracy of bagging classifier on test dataset is: 0.84


# This time, let's use RandomForestClassifier from sklearn

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same budget as the bagging ensemble (500 trees, at most 16 leaves each),
# but using scikit-learn's dedicated random-forest estimator.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator

# Predictions on the held-out split, kept for the accuracy report
y_pred_rf = rnd_clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Report the random forest's accuracy on the training data and on the
# held-out split.
print(
    "Accuracy of RF classifier on train dataset is:",
    accuracy_score(y_true=y_train, y_pred=rnd_clf.predict(X_train)),
)
print(
    "Accuracy of RF classifier on test dataset is:",
    accuracy_score(y_true=y_test, y_pred=y_pred_rf),
)

Accuracy of RF classifier on train dataset is: 0.928
Accuracy of RF classifier on test dataset is: 0.856


# Wait a minute, what happened?

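Because scikit-learn's `RandomForestClassifier` searches for the best split among a random subset of the features at each node, it gets better results here than our hand-built bagging ensemble. Note, however, that its training accuracy improved more than its test accuracy, so you still need to check for overfitting and regularize the model.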

# Which feature plays the most important role?

In [None]:
from sklearn.datasets import load_iris
# Switch to the iris dataset to inspect per-feature importance.
iris = load_iris()

# NOTE(review): this rebinds `rnd_clf`, replacing the forest fitted on the
# moons data; the name is kept because later cells read it.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
).fit(iris["data"], iris["target"])  # fit() returns the fitted estimator

# One line per feature: its name followed by the forest's importance score
importance_by_feature = zip(iris["feature_names"], rnd_clf.feature_importances_)
for name, score in importance_by_feature:
    print(name, score)

sepal length (cm) 0.11249225099876375
sepal width (cm) 0.02311928828251033
petal length (cm) 0.4410304643639577
petal width (cm) 0.4233579963547682


In [None]:
# Display the raw importance array of the forest fitted on iris
# (one entry per feature, in the order of iris["feature_names"]).
rnd_clf.feature_importances_

array([0.11249225, 0.02311929, 0.44103046, 0.423358  ])

## Out-of-Bag evaluation: Let's turn OOB on and experiment

In [None]:
# Bagging ensemble of 500 unconstrained decision trees with out-of-bag scoring
# enabled, so each training instance is evaluated by trees that did not see it.
# NOTE(review): this rebinds `bag_clf` and uses random_state=40 for the
# ensemble, whereas the other estimators in this notebook use 42 — confirm
# that the different seed is intentional.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    random_state=40,
).fit(X_train, y_train)  # fit() returns the fitted estimator

# Last expression of the cell: show the out-of-bag accuracy estimate
bag_clf.oob_score_

0.8346666666666667

In [None]:
from sklearn.metrics import accuracy_score
# Score the OOB-enabled ensemble on the held-out split so the result can be
# compared with the out-of-bag estimate.
y_pred = bag_clf.predict(X_test)

# Last expression of the cell: displayed as the cell output
accuracy_score(y_true=y_test, y_pred=y_pred)

0.808