# Random Forest Classifier using Entropy and Information Gain
Python module contained [here](https://github.com/ryan-kp-miller/Machine-Learning-Algorithms/tree/master/RandomForest).

In [2]:
from RandomForest.RandomForest import RandomForest

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from time import time

### Reading in and Splitting the Data 

In [3]:
#reading in the Pima Indians Diabetes dataset
data = pd.read_csv('../Data/diabetes.csv')

#splitting data into features and target variables
target = np.array(data.iloc[:,-1]).reshape((-1,1))
features = data.iloc[:,:-1]

#scaling the features to mean 0 and unit variance
ss = StandardScaler()
features = ss.fit_transform(np.array(features))

#adding intercept column to features
features = np.append(features,np.ones((features.shape[0],1)),axis=1)

#splitting the data into train and test sets
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size=0.2,random_state=0)

### Comparing Performance to Sklearn's RandomForestClassifier
The test accuracy of my Random Forest Classifier implementation is comparable to Sklearn's. Unlike Sklearn's version, it is capable of handling categorical data without requiring preprocessing beforehand but has a noticeably slower runtime.

In [4]:
#sklearn
start = time()
rf = RandomForestClassifier(n_estimators = 20,criterion="entropy",random_state=0,max_depth=15,oob_score=True)
rf.fit(features,np.ravel(target))
end = time()
print("Sklearn's DecisionTreeClassifier Test Accuracy:",np.round(100*rf.oob_score_,3),'%')
print("Sklearn's DecisionTreeClassifier Runtime:",np.round(end-start,6),'seconds')

Sklearn's DecisionTreeClassifier Test Accuracy: 73.568 %
Sklearn's DecisionTreeClassifier Runtime: 0.035954 seconds


In [5]:
#self-made
start = time()
randomForest = RandomForest()
randomForest.fitting(features, target, max_depth=15, num_trees=20)
y_predicted = randomForest.voting(features)
end = time()
print("Self-Made Decision Tree Classifier Test Accuracy:",np.round(100*np.mean(y_predicted == np.ravel(target)),3),'%')
print("Self-Made Decision Tree Classifier Runtime:",np.round(end-start,6),'seconds')

Self-Made Decision Tree Classifier Test Accuracy: 74.219 %
Self-Made Decision Tree Classifier Runtime: 1.390332 seconds
