# Random Forests

In [1]:
# import packages
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

import matplotlib.pyplot as plt
%matplotlib inline

# Create toy data: 1000 samples, 10 features, 5 of them informative and none
# redundant. With shuffle=False scikit-learn keeps the generated column order,
# so the informative features occupy the first columns of X.
X, y = make_classification(n_samples=1000, n_features=10,
                           n_informative=5, n_redundant=0,
                           random_state=123, shuffle=False)

# 70% training and 30% test. The split is seeded: without random_state every
# run draws a different partition, so the accuracies reported further down
# would change on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)

In [2]:
# Create base model: a single depth-limited decision tree. random_state is
# fixed because scikit-learn randomly permutes the candidate features at every
# split, so an unseeded tree can differ between runs when splits tie.
base = DecisionTreeClassifier(max_depth=5, random_state=7)

# Bagging ensemble of 100 such trees, each fitted on a bootstrap sample.
ensemble = BaggingClassifier(estimator=base, n_estimators=100, random_state=7)

# Fit the single tree and the ensemble on the same training split.
base.fit(X_train,y_train)
ensemble.fit(X_train,y_train)

# .score() reports mean accuracy on the held-out test split.
print("Accuracy base:",base.score(X_test, y_test))
print("Accuracy ensemble:",ensemble.score(X_test, y_test))

Accuracy base: 0.94
Accuracy ensemble: 0.95


### Importance scores

A useful property of the Random Forest ensemble method in sklearn is that it lets you print importance scores for the features in the dataset.

In [3]:
# Fit a seeded 100-tree random forest on the training split; fit() returns
# the fitted estimator, so construction and training are chained.
forest = RandomForestClassifier(n_estimators=100, random_state=7).fit(X_train, y_train)

# Rank the importance scores from highest to lowest. The Series index is the
# feature's column position in the training matrix.
feature_imp = (
    pd.Series(forest.feature_importances_)
    .sort_values(ascending=False)
)
feature_imp

0    0.407711
1    0.242605
2    0.107312
3    0.099349
4    0.052257
8    0.021870
7    0.018615
6    0.018100
9    0.016355
5    0.015827
dtype: float64

At the start of this notebook, we specified that this dataset has 10 features, of which 5 are informative. Because we passed `shuffle=False`, the informative features are the first five columns (indices 0–4). The classifier indeed relied far less on the remaining five features (indices 5–9): each of them has an importance score below 0.03. An advantage of investigating the importance of features is that irrelevant features can be removed. This removal of noise tends to improve performance and reduce training time.

In [5]:
# select important features: keep only the first five columns of X.
# Re-running this cell is harmless (slicing the first five columns of a
# five-column array is a no-op), but note that X no longer holds all ten
# features afterwards.
X = X[:, :5]

print(X)

# retrain
# 70% training and 30% test. The split is seeded so the accuracies below are
# reproducible; an unseeded split draws different rows on every run, which
# makes any difference from earlier scores impossible to attribute to the
# feature selection. For a like-for-like comparison, use the same seed for
# the earlier split as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)

# Refit the existing estimators in place on the reduced feature set.
base.fit(X_train,y_train)
ensemble.fit(X_train,y_train)

print("Accuracy base:",base.score(X_test, y_test))
print("Accuracy ensemble:",ensemble.score(X_test, y_test))

[[-1.15631196  2.05252632  1.20259144 -0.6226919   0.71206718]
 [ 0.62624848  0.73932366 -0.72880265 -1.68051774  2.12639824]
 [ 0.57775307 -0.9339956  -1.76987208 -1.74278971  4.33145289]
 ...
 [ 3.19822875  1.35540207 -0.63373262 -0.72458723 -0.90014444]
 [ 1.43323912  1.26716707 -1.63542329  0.49421537 -0.22291577]
 [ 0.83443132  1.28253953 -1.37187708  1.21331601 -0.98042084]]
Accuracy base: 0.93
Accuracy ensemble: 0.9466666666666667
