# Random Forests Implementation from Scratch 
- Try to analyse what is happening at each step
- Don't get stuck for long


In [None]:
import random
import pandas as pd
import numpy as np

# Define a Decision Tree class
class DecisionTree:
    """Binary classification tree grown greedily by information gain.

    The fitted tree is stored in ``self.tree`` as nested dictionaries.
    A leaf is ``{'label': value}``; an internal node is
    ``{'feature_idx', 'split_point', 'left_split', 'right_split'}`` where
    samples with ``x[feature_idx] <= split_point`` go to ``left_split``.

    Args:
        max_depth: Maximum number of splits on any root-to-leaf path.
            ``None`` means the depth is not limited.
        min_samples_split: A node holding fewer samples than this becomes
            a leaf.
    """

    def __init__(self, max_depth, min_samples_split):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = {}

    def entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels ``y``.

        An empty label set has entropy ``0.0``.
        """
        y = np.asarray(y)
        if y.size == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / counts.sum()
        # Every probability is > 0 because np.unique only reports labels
        # that actually occur, so log2 never sees zero.
        return float(np.sum(probabilities * -np.log2(probabilities)))

    def information_gain(self, X_column, y, threshold):
        """Return the entropy reduction from splitting at ``threshold``.

        Each child's entropy is weighted by the fraction of samples it
        receives. A split that leaves one side empty yields ``0.0``.
        """
        X_column = np.asarray(X_column)
        y = np.asarray(y)
        left_idxs, right_idxs = self.split(X_column, threshold)
        n_left, n_right = len(left_idxs), len(right_idxs)
        if n_left == 0 or n_right == 0:
            return 0.0
        n_total = n_left + n_right
        child_entropy = (
            (n_left / n_total) * self.entropy(y[left_idxs])
            + (n_right / n_total) * self.entropy(y[right_idxs])
        )
        return self.entropy(y) - child_entropy

    def split(self, X_column, threshold):
        """Return ``(left_idxs, right_idxs)`` for ``<=`` / ``>`` threshold."""
        X_column = np.asarray(X_column)
        left_idxs = np.flatnonzero(X_column <= threshold)
        right_idxs = np.flatnonzero(X_column > threshold)
        return left_idxs, right_idxs

    def most_common_label(self, y):
        """Return the most frequent label in ``y``.

        Ties are resolved in favour of the smallest label, because
        ``np.unique`` returns labels sorted and ``np.argmax`` picks the
        first maximum.

        Raises:
            ValueError: If ``y`` is empty.
        """
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("cannot take the most common label of an empty set")
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def find_split_point(self, X, y):
        """Search every feature and threshold for the best split.

        Candidate thresholds are the distinct values of each column except
        the largest one, so both children are always non-empty.

        Returns:
            ``(column, threshold)`` of the split with the highest
            information gain, or ``(None, None)`` when no column has at
            least two distinct values.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        best_gain = -1.0
        best_column, best_threshold = None, None
        for column in range(X.shape[1]):
            values = np.unique(X[:, column])
            # Splitting at the maximum would send everything left.
            for threshold in values[:-1]:
                gain = self.information_gain(X[:, column], y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_column, best_threshold = column, threshold
        return best_column, best_threshold

    def build_tree(self, X, y, depth=0):
        """Fit the tree on ``X`` (n_samples, n_features) and labels ``y``.

        The result replaces ``self.tree``. ``depth`` is the depth assigned
        to the root and counts towards ``max_depth``.

        Raises:
            ValueError: If the training set is empty, ``X`` is not 2-D, or
                ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        if y.size == 0:
            raise ValueError("cannot build a tree from an empty training set")
        self.tree = self._grow(X, y, depth)

    def _grow(self, X, y, depth):
        """Recursively build and return the node dictionary for ``(X, y)``."""
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        too_small = len(y) < self.min_samples_split
        is_pure = np.unique(y).size == 1
        if depth_exhausted or too_small or is_pure:
            return {'label': self.most_common_label(y)}

        column, threshold = self.find_split_point(X, y)
        if column is None:
            # All remaining samples are identical in every feature.
            return {'label': self.most_common_label(y)}

        left_idxs, right_idxs = self.split(X[:, column], threshold)
        return {
            'feature_idx': column,
            'split_point': threshold,
            'left_split': self._grow(X[left_idxs], y[left_idxs], depth + 1),
            'right_split': self._grow(X[right_idxs], y[right_idxs], depth + 1),
        }

    def predict_tree(self, X_test):
        """Predict the label for one sample, or for each row of a 2-D array.

        Args:
            X_test: A 1-D feature vector, or a 2-D array with one sample
                per row.

        Returns:
            A single label for 1-D input, or an array of labels for 2-D
            input.

        Raises:
            ValueError: If ``build_tree`` has not been called yet.
        """
        if not self.tree:
            raise ValueError("the tree has not been built; call build_tree first")
        X_test = np.asarray(X_test)
        if X_test.ndim == 2:
            return np.array([self.predict_tree(row) for row in X_test])

        node = self.tree
        # Only leaves carry a 'label' key; walk down until one is reached.
        while 'label' not in node:
            if X_test[node['feature_idx']] <= node['split_point']:
                node = node['left_split']
            else:
                node = node['right_split']
        return node['label']

# Define a Random Forest class
class RandomForest:
    """Ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Each tree is trained on a bootstrap sample of the rows and on a random
    subset of ``max_features`` columns. The column subset of every tree is
    remembered in ``self.feature_idxs`` so the same columns are fed to
    that tree at prediction time.

    Args:
        n_trees: Number of trees to grow.
        max_depth: Passed through to every ``DecisionTree``.
        min_samples_split: Passed through to every ``DecisionTree``.
        max_features: Number of feature columns drawn (without
            replacement) for each tree.
    """

    def __init__(self, n_trees, max_depth, min_samples_split, max_features):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []
        # feature_idxs[i] holds the column indices tree i was trained on.
        self.feature_idxs = []

    def build_trees(self, X, y):
        """Fit ``n_trees`` trees on ``X`` (n_samples, n_features) and ``y``.

        Any trees from a previous call are discarded. Randomness comes
        from NumPy's global random state.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        # Start from a clean slate so refitting does not mix old and new trees.
        self.trees = []
        self.feature_idxs = []
        for _ in range(self.n_trees):
            tree = DecisionTree(self.max_depth, self.min_samples_split)
            # Bootstrap: draw n_samples rows with replacement.
            row_idxs = np.random.choice(n_samples, n_samples, replace=True)
            column_idxs = np.random.choice(n_features, self.max_features, replace=False)
            tree.build_tree(X[row_idxs][:, column_idxs], y[row_idxs])
            self.trees.append(tree)
            self.feature_idxs.append(column_idxs)

    def predict_rf(self, X_test):
        """Predict by majority vote over all trees.

        Args:
            X_test: A 1-D feature vector, or a 2-D array with one sample
                per row.

        Returns:
            A single label for 1-D input, or an array of labels for 2-D
            input. Ties go to the smallest label.

        Raises:
            ValueError: If no trees have been built.
        """
        if not self.trees:
            raise ValueError("no trees have been built; call build_trees first")
        X_test = np.asarray(X_test)
        if X_test.ndim == 2:
            return np.array([self.predict_rf(row) for row in X_test])

        # Each tree only understands the columns it was trained on.
        ensemble_preds = [
            tree.predict_tree(X_test[column_idxs])
            for tree, column_idxs in zip(self.trees, self.feature_idxs)
        ]
        labels, counts = np.unique(ensemble_preds, return_counts=True)
        return labels[np.argmax(counts)]

# Sample Random Forest Implementation

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Fetch the iris dataset and pull out the feature matrix and target vector
iris = load_iris()
X, y = iris.data, iris.target

# Build a 100-tree random forest classifier
rf = RandomForestClassifier(n_estimators=100)

# Train it on the whole dataset
rf.fit(X, y)

# Classify three hand-written samples (four feature values each)
X_test = [
    [5.0, 3.5, 1.3, 0.2],
    [6.0, 3.0, 4.8, 1.8],
    [7.3, 2.9, 6.3, 1.8],
]
y_pred = rf.predict(X_test)

print(y_pred)

[0 2 2]


# Sample Decision Tree Implementation (scikit-learn)

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

# Load iris dataset
iris = load_iris()
X = iris.data
print(X.shape)
y = iris.target
# Also show the label vector's shape; the recorded cell output lists it
# right after the feature matrix's shape.
print(y.shape)

# Initialize decision tree classifier with max_depth=3
dt = DecisionTreeClassifier(max_depth=3)

# Fit the model to the data
dt.fit(X, y)

# Make predictions on new data
X_test = [[5.0, 3.5, 1.3, 0.2], [6.0, 3.0, 4.8, 1.8], [7.3, 2.9, 6.3, 1.8]]
y_pred = dt.predict(X_test)

print(y_pred)

(150, 4)
(150,)
[0 2 2]
