# Random Forest Classifier using Entropy and Information Gain
Ryan Miller

In [1]:
from DecisionTree import entropy,partition_classes,information_gain,DecisionTree

from scipy import stats
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from time import time

ModuleNotFoundError: No module named 'DecisionTree'

### Reading in and Splitting the Data 

In [2]:
#load the Pima Indians Diabetes dataset from disk
data = pd.read_csv('../Data/diabetes.csv')

#the last column is the label; keep it as an (n_samples, 1) column vector
target = data.iloc[:, -1].to_numpy(copy=True).reshape(-1, 1)

#every other column is a predictor, standardised to mean 0 and unit variance
ss = StandardScaler()
features = ss.fit_transform(data.iloc[:, :-1].to_numpy())

#append a constant column of ones (intercept term) as the right-most feature
features = np.hstack([features, np.ones((len(features), 1))])

#hold out 20% of the rows as a test set; random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

### Implementing the Random Forest Classifier

In [37]:
class RandomForest(object):
    """Random forest built from ``DecisionTree`` learners on bootstrap samples.

    Each tree is trained on its own bootstrap sample (rows drawn with
    replacement). ``voting`` predicts a record by majority vote of the trees
    whose bootstrap sample does not contain that record (out-of-bag trees),
    falling back to all trees when no tree is out-of-bag for the record.

    All state lives on the instance. Keeping the lists below as class-level
    attributes would share them between every ``RandomForest`` object, so a
    second forest would silently reuse the first forest's bootstrap samples.
    """

    def __init__(self):
        #list to contain the decision trees made during fitting
        self.decision_trees = []

        #the bootstrapping datasets for trees
        #bootstraps_datasets is a list of lists, where each list in
        #bootstraps_datasets is a bootstrapped dataset.
        self.bootstraps_datasets = []

        #the true class labels, corresponding to records in the bootstrapping datasets
        #bootstraps_labels is a list of lists, where the 'i'th list contains the
        #labels corresponding to records in the 'i'th bootstrapped dataset.
        self.bootstraps_labels = []

        #number of trees; set by fitting()
        self.num_trees = 0

        #private random generator so sampling is reproducible without touching
        #NumPy's global random state; re-seeded by fitting()
        self._rng = np.random.RandomState(0)

    def _bootstrapping(self, X, y, boot_size):
        """Draw one bootstrap sample from ``X`` and ``y``.

        Parameters
        ----------
        X : 2-D numpy array of records (indexed as ``X[i, :]``).
        y : sequence of labels aligned with the rows of ``X``.
        boot_size : fraction of ``len(X)`` to draw, with replacement.

        Returns
        -------
        (samples, labels) : two lists of equal length; ``samples`` holds each
            drawn row as a list and ``labels`` holds the matching ``y[i]``.

        Raises
        ------
        ValueError : if ``X`` has no rows.
        """
        n_records = len(X)
        if n_records == 0:
            raise ValueError("cannot draw a bootstrap sample from an empty dataset")
        #finding random sample of indices
        #the generator is seeded once (in __init__/fitting), not here: re-seeding
        #on every call would hand every tree the identical sample
        idxs = self._rng.randint(low=0, high=n_records, size=int(boot_size * n_records))
        #creating the bootstrapped features and labels
        samples = [list(X[i, :]) for i in idxs]
        labels = [y[i] for i in idxs]
        return (samples, labels)

    def bootstrapping(self, X, y, boot_size):
        """Create one bootstrapped dataset per tree (``self.num_trees`` in total).

        Does nothing when ``self.bootstraps_datasets`` is already populated.
        """
        #checking to see if bootstraps_datasets is already populated
        if len(self.bootstraps_datasets) > 0:
            return
        #initializing one bootstrapped dataset for each tree
        for _ in range(self.num_trees):
            data_sample, data_label = self._bootstrapping(X, y, boot_size)
            self.bootstraps_datasets.append(data_sample)
            self.bootstraps_labels.append(data_label)

    def fitting(self, X, y, num_trees = 20, max_depth = 15, boot_size = 0.20, random_state = 0):
        """Train ``num_trees`` decision trees, each on its own bootstrap sample.

        Parameters
        ----------
        X, y : training records and labels (see ``_bootstrapping``).
        num_trees : number of trees in the forest.
        max_depth : forwarded to ``DecisionTree.learn``.
        boot_size : fraction of the data drawn for each bootstrap sample.
        random_state : seed for the bootstrap sampler; ``None`` lets NumPy
            pick an unpredictable seed.
        """
        #initializing decision trees
        self.num_trees = num_trees
        self._rng = np.random.RandomState(random_state)
        self.decision_trees = [DecisionTree() for _ in range(num_trees)]
        #discarding samples from any previous fit so the trees are always
        #trained on bootstraps of the data passed to this call
        self.bootstraps_datasets = []
        self.bootstraps_labels = []
        #creating bootstrapped datasets
        self.bootstrapping(X, y, boot_size)
        #training the decision trees using the bootstrapped datasets
        for tree, samples, labels in zip(self.decision_trees,
                                         self.bootstraps_datasets,
                                         self.bootstraps_labels):
            tree.learn(X=samples, y=labels, max_depth=max_depth)

    @staticmethod
    def _tree_vote(tree, record):
        """Return one tree's predicted label for ``record`` as a plain int.

        ``np.ravel(...)[0]`` accepts both a bare label and a label wrapped in a
        length-1 sequence, so both voting paths extract the vote the same way.
        """
        return int(np.ravel(tree.classify_one(record))[0])

    def voting(self, X):
        """Predict a label for every record in ``X`` by majority vote.

        For each record only the out-of-bag trees vote (trees whose bootstrap
        sample does not contain the record). If the record appears in every
        bootstrap sample, all trees vote. Ties go to the smallest label
        (``np.argmax`` over ``np.bincount``), so labels must be non-negative
        integers.

        Returns
        -------
        1-D float numpy array with one predicted label per record.

        Raises
        ------
        ValueError : if there are no trees to vote (forest not fitted).
        """
        #one set of row-tuples per bootstrap sample: constant-time membership
        #checks instead of scanning every sampled row for every record
        bag_lookup = [set(map(tuple, dataset)) for dataset in self.bootstraps_datasets]

        predictions = []
        #looping over all observations in X
        for record in X:
            key = tuple(record)
            #getting the votes from the out-of-bag trees
            votes = [self._tree_vote(tree, record)
                     for tree, bag in zip(self.decision_trees, bag_lookup)
                     if key not in bag]

            #if the record is not an out-of-bag sample for any of the trees
            #take the majority vote of all the trees
            if not votes:
                votes = [self._tree_vote(tree, record) for tree in self.decision_trees]
            if not votes:
                raise ValueError("voting requires a fitted forest with at least one tree")

            predictions.append(np.argmax(np.bincount(votes)))
        return np.array(predictions, dtype=float)

### Comparing Performance to Sklearn's RandomForestClassifier
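The accuracy of my remade Random Forest Classifier is comparable to Sklearn's implementation. Note that both figures below are out-of-bag estimates computed on the full dataset rather than scores on the held-out test split. Unlike Sklearn's version, my implementation is capable of handling categorical data without requiring preprocessing beforehand. The main downside is the noticeably slower runtime.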

In [52]:
#sklearn
#fit scikit-learn's random forest with 20 entropy-criterion trees of depth <= 15;
#oob_score=True exposes the out-of-bag accuracy as rf.oob_score_ after fitting
start = time()
rf = RandomForestClassifier(n_estimators = 20,criterion="entropy",random_state=0,max_depth=15,oob_score=True)
rf.fit(features,np.ravel(target))
end = time()
#the labels name what is actually measured: a RandomForestClassifier scored
#on its out-of-bag samples (no held-out test set is used here)
print("Sklearn's RandomForestClassifier OOB Accuracy:",np.round(100*rf.oob_score_,3),'%')
print("Sklearn's RandomForestClassifier Runtime:",np.round(end-start,6),'seconds')

Sklearn's DecisionTreeClassifier Test Accuracy: 73.698 %
Sklearn's DecisionTreeClassifier Runtime: 0.057855 seconds


In [53]:
#self-made
#the timed region covers both fitting the forest and predicting every record
start = time()
randomForest = RandomForest()
randomForest.fitting(features, target, max_depth=15, num_trees=20)
#voting() predicts each record with the trees that did not see it in their
#bootstrap sample, so the score below is an out-of-bag estimate
y_predicted = randomForest.voting(features)
end = time()
#the labels name what is actually measured: a random forest scored by
#out-of-bag voting on the full dataset (no held-out test set is used here)
print("Self-Made Random Forest Classifier OOB Accuracy:",np.round(100*np.mean(y_predicted == np.ravel(target)),3),'%')
print("Self-Made Random Forest Classifier Runtime:",np.round(end-start,6),'seconds')

Self-Made Decision Tree Classifier Test Accuracy: 75.391 %
Self-Made Decision Tree Classifier Runtime: 1.590771 seconds
