# Ensemble methods: Tree Bagging; Random Forests; Adaboost from Scratch

In [21]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import mode 
from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 

# For producing decision tree diagrams.
from IPython.core.display import Image, display
from six import StringIO

Leverage the *Boston housing data set* to try out ensemble methods. First, binarize the target (above vs. not above the median) so that the task becomes classification.

In [22]:
# Fetch the Boston housing features and the continuous target.
boston = load_boston()
X = boston.data
Y = boston.target

# Turn the target into a binary label: 1 where it exceeds the median, else 0.
Y = (Y > np.median(Y)).astype(int)

# Draw one seeded permutation of the row indices and apply it to both arrays,
# so every feature row stays paired with its own label.
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X = X[shuffle]
Y = Y[shuffle]

# The first 350 shuffled rows are used for training; the remainder is held out.
train_data, test_data = X[:350], X[350:]
train_labels, test_labels = Y[:350], Y[350:]

Create a function that prints the fitted tree as if/else pseudocode, as a text alternative to the GraphViz diagram that is also produced below.

In [23]:
def recurse(left, right, threshold, features, value, node):
    """Print the subtree rooted at ``node`` as nested if/else pseudocode.

    Parameters
    ----------
    left, right : indexable of int
        Child node index for every node; ``-1`` marks a missing child.
    threshold : indexable of float
        Split threshold for every node (only read for split nodes).
    features : indexable of str
        Name of the feature tested at every node (only read for split nodes).
    value : indexable
        Per-node value; printed for leaves.
    node : int
        Index of the node to print.

    Returns
    -------
    None
        All output is written to stdout.
    """
    # A node is a leaf when it has no children. The threshold is deliberately
    # NOT used as the leaf marker: -2.0 is also a perfectly valid split
    # threshold, and testing it would print such a split as a leaf and drop
    # its whole subtree.
    if left[node] == -1 and right[node] == -1:
        print ("return " + str(value[node]))
        return

    print ("if ( " + features[node] + " <= " + str(threshold[node]) + " ) {")
    if left[node] != -1:
        recurse(left, right, threshold, features, value, left[node])
    print ("} else {")
    if right[node] != -1:
        recurse(left, right, threshold, features, value, right[node])
    print ("}")

def get_code(tree, feature_names):
    """Print a fitted decision tree as nested if/else pseudocode.

    Parameters
    ----------
    tree : fitted estimator
        Must expose ``tree_`` with ``children_left``, ``children_right``,
        ``threshold``, ``feature`` and ``value`` arrays. Inside this function
        the parameter name shadows the imported ``sklearn.tree`` module.
    feature_names : sequence of str
        Name of each feature, indexed by feature number.

    Returns
    -------
    None
        The pseudocode is written to stdout by ``recurse``.
    """
    fitted = tree.tree_

    # Leaf nodes carry a negative sentinel in ``feature``. Indexing
    # ``feature_names`` with it would silently wrap around to the end of the
    # sequence (or raise IndexError when there are fewer than two names), so
    # leaves get a placeholder that is never printed for a well-formed tree.
    features = [
        feature_names[i] if i >= 0 else "undefined"
        for i in fitted.feature
    ]

    # Node 0 is the root.
    recurse(
        fitted.children_left,
        fitted.children_right,
        fitted.threshold,
        features,
        fitted.value,
        0,
    )

In [24]:
# Fit an entropy-based decision tree on the training split (fit returns the
# estimator itself), then print its learned rules as if/else pseudocode.
dt = DecisionTreeClassifier(
    criterion="entropy",
    splitter="best",
    random_state=0,
).fit(train_data, train_labels)
get_code(dt, boston.feature_names)

if ( LSTAT <= 7.684999942779541 ) {
if ( PTRATIO <= 21.5 ) {
return [[ 0. 94.]]
} else {
return [[1. 0.]]
}
} else {
if ( LSTAT <= 14.045000076293945 ) {
if ( RM <= 6.315499782562256 ) {
if ( INDUS <= 13.360000133514404 ) {
if ( CRIM <= 0.05717500112950802 ) {
return [[14.  0.]]
} else {
if ( CRIM <= 0.5821950137615204 ) {
if ( TAX <= 404.5 ) {
if ( INDUS <= 4.2200000286102295 ) {
return [[0. 4.]]
} else {
if ( AGE <= 28.049999237060547 ) {
return [[0. 3.]]
} else {
if ( RM <= 5.92300009727478 ) {
return [[8. 0.]]
} else {
if ( AGE <= 45.39999961853027 ) {
if ( TAX <= 255.0 ) {
return [[1. 0.]]
} else {
return [[0. 5.]]
}
} else {
if ( B <= 395.9150085449219 ) {
if ( NOX <= 0.47050000727176666 ) {
return [[4. 0.]]
} else {
if ( DIS <= 2.746250033378601 ) {
return [[2. 0.]]
} else {
return [[0. 5.]]
}
}
} else {
return [[8. 0.]]
}
}
}
}
}
} else {
return [[4. 0.]]
}
} else {
return [[7. 0.]]
}
}
} else {
if ( B <= 390.7099914550781 ) {
if ( NOX <= 0.6010000109672546 ) {
return [[4. 0.]]

In [25]:
# Write the fitted tree to 'tree.dot' in GraphViz DOT format. This cell only
# produces the .dot file; the tree.png shown below is presumably rendered from
# it in a separate step.
tree.export_graphviz(dt, out_file='tree.dot')

<img src="tree.png">

## Ensemble Methods

First, compare the performance of a single decision tree with that of a random forest and an AdaBoost ensemble of decision stumps.

In [26]:
# Held-out accuracy of the single decision tree fit earlier.
print ('Accuracy (a decision tree):', dt.score(test_data, test_labels))

# Random forest with 1000 trees, fit on the same training split.
rfc = RandomForestClassifier(n_estimators=1000).fit(train_data, train_labels)
print ('Accuracy (a random forest):', rfc.score(test_data, test_labels))

# AdaBoost over 100 depth-1 trees (decision stumps) with a 0.2 learning rate.
abc = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=0.2,
).fit(train_data, train_labels)
print ('Accuracy (adaboost with decision trees):', abc.score(test_data, test_labels))

Accuracy (a decision tree): 0.8782051282051282
Accuracy (a random forest): 0.9102564102564102
Accuracy (adaboost with decision trees): 0.9166666666666666


### Bagging from Scratch

Create the bagging functions:

In [43]:
# Seed NumPy's global random state so that the bootstrap samples drawn with
# np.random.choice in bootstrap_tree are repeatable from this point on.
np.random.seed(1)

def bootstrap_tree(bs_tree, train_data, train_labels, test_data):
    '''
    Fit bs_tree on one bootstrap resample of the training set and return its
    predictions for test_data.

    The resample has as many rows as train_data and is drawn with replacement
    through NumPy's global random state. bs_tree is fit in place, so after the
    call it holds the model trained on this resample.
    '''
    n_rows = train_data.shape[0]

    # Row indices sampled with replacement; the same indices select the labels
    # so each resampled row keeps its own label.
    picked = np.random.choice(np.arange(n_rows), size=n_rows, replace=True)

    bs_tree.fit(train_data[picked, :], train_labels[picked])
    return bs_tree.predict(test_data)

def bagging(bs_tree, num_trees, train_data, train_labels, test_data):
    '''
    Majority-vote ensemble: fit bs_tree on num_trees independent bootstrap
    samples and return, for every row of test_data, the most frequent prediction.

    The result is a plain Python list with one entry per test row.
    '''
    n_test = test_data.shape[0]

    # One row of votes per fitted tree, one column per test sample.
    votes = np.zeros((num_trees, n_test))
    for i in range(num_trees):
        votes[i] = bootstrap_tree(bs_tree, train_data, train_labels, test_data)

    # The column-wise mode is the majority vote for each test sample.
    majority, _ = mode(votes, axis=0)
    return majority.ravel().tolist()

In [54]:
# Ensemble size and the base learner shared by both runs below.
num_trees = 100
clf = DecisionTreeClassifier(criterion="entropy", splitter="best")

# Majority-vote predictions from num_trees bootstrapped trees.
bagged_trees_preds = bagging(clf, num_trees, train_data, train_labels, test_data)

# Predictions from one tree fit on a single bootstrap sample, as the baseline.
bs_tree_preds = bootstrap_tree(clf, train_data, train_labels, test_data)

# Accuracy = fraction of test labels predicted correctly.
print("Accuracy score for a single tree:", np.mean(bs_tree_preds == test_labels))
print("Accuracy score for a bagged forest:", np.mean(bagged_trees_preds == test_labels))

Accuracy score for a single tree: 0.8333333333333334
Accuracy score for a bagged forest: 0.9038461538461539


### Random Forests from Scratch

Random forests add the twist of subsampling features at each node: each split considers only $p' = \sqrt{p}$ of the $p$ features. This can be implemented by updating the `DecisionTreeClassifier` parameters.

In [55]:
# Set the number of trees and the two base learners.
num_trees = 100

# Plain tree: every split may consider all features.
clf = DecisionTreeClassifier(criterion="entropy", splitter="best")

# Random-forest style tree: every split considers only sqrt(n_features)
# features. "sqrt" is spelled out instead of "auto": for classifiers "auto"
# was only an alias for the same setting, and it is deprecated and rejected by
# newer scikit-learn releases.
random_clf = DecisionTreeClassifier(criterion="entropy", splitter="best", max_features="sqrt", random_state=0)

# Create a single bagged tree as the baseline.
bs_tree_preds = bootstrap_tree(clf, train_data, train_labels, test_data)

# Generate the bagged trees and the majority vote classification for the predictions
bagged_trees_preds = bagging(clf, num_trees, train_data, train_labels, test_data)

# Generate the random forest trees and the majority vote classification for the predictions
random_forest_preds = bagging(random_clf, num_trees, train_data, train_labels, test_data)

# Compare the results: accuracy = fraction of test labels predicted correctly.
print("Accuracy score for a single tree:", sum(bs_tree_preds == test_labels) / len(test_labels))
print("Accuracy score for a bagged forest:", sum(bagged_trees_preds == test_labels) / len(test_labels))
print("Accuracy score for a random forest:", sum(random_forest_preds == test_labels) / len(test_labels))

Accuracy score for a single tree: 0.8205128205128205
Accuracy score for a bagged forest: 0.8910256410256411
Accuracy score for a random forest: 0.9102564102564102


### AdaBoost from Scratch


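Create a single iteration of AdaBoost: fit one weighted decision stump, compute its weighted error rate and vote weight, then reweight the training points.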

In [56]:
# Map the {0, 1} labels onto {-1, +1}.
train_labels_pm = 2 * train_labels - 1
test_labels_pm = 2 * test_labels - 1

# Start from a uniform weight of 1/n on every training point.
data_weights = np.full(train_data.shape[0], 1.0 / train_data.shape[0])

# Fit a depth-1 tree (a stump) using the current point weights.
bdtc = DecisionTreeClassifier(max_depth=1,criterion="entropy", splitter="best")
bdtc.fit(train_data, train_labels_pm, sample_weight=data_weights)

# Stump predictions on the training set and on the test set.
bdtc_predictions = bdtc.predict(train_data)
bdtc_predictions_test = bdtc.predict(test_data)

# Weighted error rate: total weight of the misclassified training points.
bdtc_weighted_error_rate = np.sum(data_weights * (bdtc_predictions != train_labels_pm).astype("float"))

# Vote weight of this stump: alpha = 0.5 * ln((1 - err) / err).
error_rate_alpha = 0.5 * np.log((1 - bdtc_weighted_error_rate) / bdtc_weighted_error_rate)

# Reweight the points: prediction * label is +1 when correct and -1 when wrong,
# so correct points are scaled by exp(-alpha) and wrong ones by exp(+alpha).
# The weights are then renormalized to sum to one.
data_weights_updated = data_weights * np.exp(-error_rate_alpha * bdtc_predictions * train_labels_pm)
data_weights_updated = data_weights_updated / sum(data_weights_updated)
data_weights = data_weights_updated

In [57]:
# Score the fitted stump on the held-out test set against the +/-1 encoded
# labels; as the cell's last expression, the value is displayed as its output.
bdtc.score(test_data, test_labels_pm)

0.8269230769230769