# Random Forest From Scratch

Building a Random Forest using Decision Trees:
1. Implement DecisionTree class
2. Bootstrap sampling
3. Random Forest ensemble
4. sklearn comparison

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sys
sys.path.append('..')
from decision_trees.utils import (
    entropy, gini_impurity, information_gain, find_best_split, 
    most_common_label, accuracy_score, confusion_matrix
)
from utils import bootstrap_sample, majority_vote

# Seed NumPy's global random number generator for repeatable runs.
np.random.seed(42)

# Plot appearance: seaborn 'darkgrid' style and a 12x6 default figure size.
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

---
## Part 1: Decision Tree (from previous component)

In [None]:
class Node:
    """One node of a decision tree.

    A node built with a ``value`` is a leaf; a node built with
    ``feature``/``threshold`` and ``left``/``right`` children is a split.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: index of the feature tested and the cut-off it is compared to.
        self.feature, self.threshold = feature, threshold
        # Child nodes reached from this node.
        self.left, self.right = left, right
        # Stored prediction; stays None for nodes that are not leaves.
        self.value = value

    def is_leaf(self):
        """Return True when the node stores a prediction value."""
        has_value = self.value is not None
        return has_value

class DecisionTree:
    """Decision tree classifier grown by recursive binary splits.

    Growth stops at ``max_depth`` (when set), on a node whose labels are all
    identical, on a node with fewer than ``min_samples_split`` samples, or
    when the chosen split would leave a child with fewer than
    ``min_samples_leaf`` samples. ``criterion`` is forwarded unchanged to
    ``find_best_split``.
    """

    def __init__(self, max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, criterion='gini'):
        # Hyperparameters controlling tree growth.
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        # State filled in by fit().
        self.root = None
        self.n_features = None

    def fit(self, X, y):
        """Grow the tree from ``X`` and ``y``; return ``self`` for chaining."""
        self.n_features = X.shape[1]
        self.root = self._build_tree(X, y, depth=0)
        return self

    def _make_leaf(self, y):
        """Build a leaf whose value is ``most_common_label(y)``."""
        return Node(value=most_common_label(y))

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples in ``X``/``y``."""
        n_samples, _ = X.shape
        n_classes = len(np.unique(y))

        # Stopping rules, checked in order: depth cap, pure node, too few samples.
        if self.max_depth is not None and depth >= self.max_depth:
            return self._make_leaf(y)
        if n_classes == 1:
            return self._make_leaf(y)
        if n_samples < self.min_samples_split:
            return self._make_leaf(y)

        split_feature, split_threshold = self._find_best_split(X, y)
        if split_feature is None:
            # No feature produced a positive gain.
            return self._make_leaf(y)

        goes_left = X[:, split_feature] <= split_threshold
        goes_right = ~goes_left

        # Reject the split if either side would be smaller than allowed.
        n_left = np.sum(goes_left)
        n_right = np.sum(goes_right)
        if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
            return self._make_leaf(y)

        left_child = self._build_tree(X[goes_left], y[goes_left], depth + 1)
        right_child = self._build_tree(X[goes_right], y[goes_right], depth + 1)

        return Node(feature=split_feature, threshold=split_threshold,
                    left=left_child, right=right_child)

    def _find_best_split(self, X, y):
        """Return ``(feature, threshold)`` with the highest positive gain.

        Returns ``(None, None)`` when no feature's gain exceeds zero.
        """
        winner = (None, None)
        top_gain = 0

        for candidate in range(self.n_features):
            threshold, gain = find_best_split(X, y, candidate, self.criterion)
            # Strict comparison keeps the first feature on ties.
            if gain > top_gain:
                top_gain = gain
                winner = (candidate, threshold)

        return winner

    def predict(self, X):
        """Return an array with one prediction per row of ``X``."""
        labels = [self._traverse_tree(row, self.root) for row in X]
        return np.array(labels)

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x``; return its value."""
        current = node
        while not current.is_leaf():
            if x[current.feature] <= current.threshold:
                current = current.left
            else:
                current = current.right
        return current.value

# Confirmation message printed after the Node and DecisionTree definitions have run.
print('Decision Tree class ready')

---
## Part 2: Random Forest Class

In [None]:
class RandomForest:
    """Random Forest Classifier built from bootstrap-trained decision trees.

    Each tree is a ``DecisionTree`` fitted on the output of
    ``bootstrap_sample``; predictions from all trees are combined with
    ``majority_vote``. Every tree evaluates all features at each split
    (``DecisionTree`` does no feature subsampling), so the diversity between
    trees comes from the bootstrap samples alone.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to train.
    max_depth : int or None, default=None
        Depth limit passed to every tree.
    min_samples_split : int, default=2
        Minimum node size for splitting, passed to every tree.
    random_state : int or None, default=None
        Base seed. Tree ``i`` is sampled with seed ``random_state + i``;
        ``None`` passes ``None`` through to ``bootstrap_sample``.
    """

    def __init__(self, n_estimators=100, max_depth=None,
                 min_samples_split=2, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.random_state = random_state
        # Fitted trees; rebuilt from scratch on every call to fit().
        self.trees = []

    def _tree_seed(self, tree_index):
        """Return the bootstrap seed for tree ``tree_index`` (None if unseeded)."""
        # Compare against None explicitly: a truthiness test would treat
        # random_state=0 as "no seed" and silently lose reproducibility.
        if self.random_state is None:
            return None
        return self.random_state + tree_index

    def fit(self, X, y):
        """Train ``n_estimators`` trees on bootstrap samples; return ``self``."""
        self.trees = []

        for i in range(self.n_estimators):
            # Bootstrap sample, seeded per tree so each tree sees different data.
            X_sample, y_sample = bootstrap_sample(
                X, y, random_state=self._tree_seed(i)
            )

            # Train one tree on that sample.
            tree = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

        return self

    def predict(self, X):
        """Return ``majority_vote`` over the per-tree predictions for ``X``."""
        tree_predictions = [tree.predict(X) for tree in self.trees]
        return majority_vote(tree_predictions)

# Confirmation message printed after the RandomForest definition has run.
print('Random Forest class ready')

---
## Part 3: Train on Iris

In [None]:
# Load the Iris dataset: feature matrix and class labels
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the rows for testing, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f'Train: {X_train.shape}, Test: {X_test.shape}')

In [None]:
# Our Random Forest: 10 trees, depth capped at 5, fixed seed
rf = RandomForest(n_estimators=10, max_depth=5, random_state=42)
# fit() returns the forest itself, so training and prediction can be chained
y_pred = rf.fit(X_train, y_train).predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Report: banner, then tree count and test accuracy
rule = '=' * 50
print(rule, 'OUR RANDOM FOREST', rule, sep='\n')
print(f'Trees: {rf.n_estimators}')
print(f'Accuracy: {acc*100:.2f}%')

In [None]:
# Reference model: scikit-learn's forest with the same tree count, depth and seed
sklearn_rf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42)
y_pred_sk = sklearn_rf.fit(X_train, y_train).predict(X_test)
acc_sk = accuracy_score(y_test, y_pred_sk)

# Report: banner, then tree count and test accuracy
rule = '=' * 50
print(rule, 'SKLEARN RANDOM FOREST', rule, sep='\n')
print(f'Trees: {sklearn_rf.n_estimators}')
print(f'Accuracy: {acc_sk*100:.2f}%')

# Absolute accuracy gap to our forest (`acc` comes from the previous cell)
print(f'\nDifference: {abs(acc - acc_sk)*100:.2f}%')

---
## Part 4: Effect of Number of Trees

In [None]:
# Forest sizes to compare; depth limit and seed are held fixed across runs
n_trees_range = [1, 5, 10, 25, 50]
accuracies = []

for n in n_trees_range:
    rf_temp = RandomForest(n_estimators=n, max_depth=5, random_state=42)
    pred = rf_temp.fit(X_train, y_train).predict(X_test)
    accuracies.append(accuracy_score(y_test, pred))

# Test accuracy as a function of forest size
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_trees_range, accuracies, 'o-', linewidth=2, markersize=8)
ax.set_xlabel('Number of Trees', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Accuracy vs Number of Trees', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.show()

# np.argmax returns the first maximum, so ties report the smallest forest
print(f'Best accuracy: {max(accuracies)*100:.2f}% with {n_trees_range[np.argmax(accuracies)]} trees')

---
## Summary

### Key Insights:
- **Ensemble averaging** reduces variance
- **Bootstrap sampling** creates diverse training sets
- **More trees** → more stable predictions
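- Test accuracy on Iris is compared against sklearn's `RandomForestClassifier` with the same settings (see the difference printed in Part 3)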

### Key Point:
"Random Forests combine multiple decision trees trained on bootstrap samples. Aggregating predictions through majority voting reduces variance and improves generalization without increasing bias."

---