In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules (such as
# the local `tree` module) are reloaded before each cell executes and edits to
# them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris

from tree import DecisionTreeClassifier, DecisionTreeRegressor

import warnings
warnings.simplefilter('ignore')

In [3]:
# Build a DataFrame holding the iris measurements, one column per feature,
# followed by a trailing 'target' column with the class labels.
data = load_iris()
dataset = dict(zip(data.feature_names, data.data.T))
dataset['target'] = data.target

df = pd.DataFrame(dataset)

# Features are every column except the label; the label is the last column.
X = df.drop(columns='target')
y = df['target']

In [4]:
class RandomForestClassifier:
    """Ensemble of ``DecisionTreeClassifier`` models combined by majority vote.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the ensemble.
    criterion, min_samples_split, max_depth
        Forwarded unchanged to every ``DecisionTreeClassifier``.
    """

    def __init__(self, n_estimators=10, criterion='entropy',
                 min_samples_split=2, max_depth=5):

        self.estimators = [DecisionTreeClassifier(criterion=criterion,
                                                  min_samples_split=min_samples_split,
                                                  max_depth=max_depth)
                           for _ in range(n_estimators)]

    def fit(self, X, y):
        """Fit every tree on its own random half of the training rows.

        ``X`` and ``y`` are indexed with an integer array, so they must
        support NumPy-style fancy indexing (pass arrays, not DataFrames).
        Rows are drawn without replacement from NumPy's global RNG; call
        ``np.random.seed`` beforehand for reproducible forests.
        """
        self.sample_size = int(0.5 * len(y))
        for estimator in self.estimators:
            rand_samples = np.random.permutation(len(y))[:self.sample_size]
            estimator.fit(X[rand_samples], y[rand_samples])

    @staticmethod
    def _majority_label(votes):
        """Return the most frequent label in ``votes`` (smallest label on ties)."""
        labels, counts = np.unique(votes, return_counts=True)
        # np.unique returns labels sorted ascending and argmax picks the first
        # maximum, so ties resolve deterministically to the smallest label.
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Predict the class of each sample by majority vote over the trees.

        Class labels are categorical, so they are counted rather than
        averaged: averaging votes of 0, 0 and 2 would yield class 1, which no
        tree predicted.

        Returns
        -------
        numpy.ndarray
            One label per sample, with the same shape and dtype as a single
            tree's prediction.
        """
        preds = np.array([estimator.predict(X) for estimator in self.estimators])
        if preds.shape[0] == 0:
            raise ValueError("Cannot predict with an empty ensemble (n_estimators=0).")

        # One row per tree, one column per predicted value.
        flat = preds.reshape(preds.shape[0], -1)
        winners = [self._majority_label(flat[:, j]) for j in range(flat.shape[1])]
        return np.array(winners, dtype=preds.dtype).reshape(preds.shape[1:])
    
class RandomForestRegressor:
    """Ensemble of ``DecisionTreeRegressor`` models whose outputs are averaged.

    ``criterion``, ``min_samples_split`` and ``max_depth`` are forwarded
    unchanged to each of the ``n_estimators`` trees.
    """

    def __init__(self, n_estimators=10, criterion='var',
                 min_samples_split=2, max_depth=5):
        tree_options = dict(criterion=criterion,
                            min_samples_split=min_samples_split,
                            max_depth=max_depth)
        self.estimators = []
        for _ in range(n_estimators):
            self.estimators.append(DecisionTreeRegressor(**tree_options))

    def fit(self, X, y):
        """Fit each tree on a random half of the rows, drawn without replacement.

        ``X`` and ``y`` are indexed with an integer array, so they must
        support NumPy-style fancy indexing.
        """
        n_rows = len(y)
        self.sample_size = int(0.5 * n_rows)
        for tree in self.estimators:
            # A fresh permutation per tree gives each one its own subsample.
            chosen = np.random.permutation(n_rows)[:self.sample_size]
            tree.fit(X[chosen], y[chosen])

    def predict(self, X):
        """Return the mean of the individual tree predictions for ``X``."""
        per_tree = [tree.predict(X) for tree in self.estimators]
        return np.mean(np.array(per_tree), axis=0)

### RandomForestClassifier

In [5]:
# Seed NumPy's global RNG: fit() draws each tree's row subsample from it, so
# without a seed the accuracy printed below changes on every run.
np.random.seed(42)

rf = RandomForestClassifier(n_estimators=3, criterion='entropy',
                            min_samples_split=5, max_depth=5)
rf.fit(X.values, y.values)
preds = rf.predict(X.values)
# Accuracy measured on the training data itself (there is no held-out split).
print((preds == y).mean())

0.9733333333333334


In [6]:
# Reference implementation from scikit-learn, fitted on the same data.
# Imported under an alias so the RandomForestClassifier class defined earlier
# in this notebook is not overwritten (re-running the previous cell would
# otherwise silently use scikit-learn's class).
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier

# random_state pins the bootstrap sampling so the score is reproducible.
rf = SklearnRandomForestClassifier(n_estimators=3, criterion='entropy',
                                   min_samples_split=5, max_depth=5,
                                   random_state=42)
rf.fit(X, y)
preds = rf.predict(X)
# Accuracy measured on the training data itself (there is no held-out split).
print((preds == y).mean())

0.98


### RandomForestRegressor

In [7]:
# Read the first two comma-separated columns of the data file; with
# unpack=True each file column comes back as one row of `data`.
filename = "data/ex1data1.txt"
data = np.loadtxt(filename, delimiter=',', usecols=(0, 1), unpack=True)

# Every row of `data` except the last is a feature and the last row is the
# target; transposing turns both into 2-D arrays with one sample per row.
feature_rows, target_rows = data[:-1], data[-1:]
X = feature_rows.T.copy()
y = target_rows.T.copy()

In [8]:
# Seed NumPy's global RNG: fit() draws each tree's row subsample from it, so
# without a seed the MSE printed below changes on every run.
np.random.seed(42)

rfr = RandomForestRegressor(n_estimators=3, criterion='var', min_samples_split=5, max_depth=5)

rfr.fit(X, y)
preds = rfr.predict(X)
# y is a column vector; flatten it so the subtraction is element-wise instead
# of broadcasting to an n-by-n matrix. Error is measured on the training data.
print("MSE: ", np.mean((preds - y.ravel()) ** 2))

MSE:  8.390376269934999


In [9]:
# Sklearn way
# Imported under an alias so the RandomForestRegressor class defined earlier in
# this notebook is not overwritten (re-running the previous cell would
# otherwise silently use scikit-learn's class).
from sklearn.ensemble import RandomForestRegressor as SklearnRandomForestRegressor

# `criterion` is left at its default, which is mean squared error in every
# scikit-learn release; the explicit spelling 'mse' was removed in 1.2.
# random_state pins the bootstrap sampling so the score is reproducible.
rfr = SklearnRandomForestRegressor(n_estimators=3, min_samples_split=5, max_depth=5,
                                   random_state=42)
rfr.fit(X, y.ravel())  # scikit-learn expects a 1-D target, y is a column vector
preds = rfr.predict(X)

# Error measured on the training data itself (there is no held-out split).
print("MSE: ", np.mean((preds - y.ravel()) ** 2))

MSE:  6.924107025589401
