<b>Random-Forest-Regression - Scratch</b> <br>
<i>Implementing Random Forest Regression using only NumPy, step-by-step. </i>

<b>requirements</b>

In [None]:
# example:- pip install numpy

<b>imports</b>

In [4]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from decision_tree_regression import DecisionTreeRegressor  # Importing your Decision Tree Regressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing


     x0   x1      x2    x3        x4        y
0   800  0.0  0.3048  71.3  0.002663  126.201
1  1000  0.0  0.3048  71.3  0.002663  125.201
2  1250  0.0  0.3048  71.3  0.002663  125.951
3  1600  0.0  0.3048  71.3  0.002663  127.591
4  2000  0.0  0.3048  71.3  0.002663  127.461
The Decision Tree: 
X_0 <= 3150.0 ? 7.132048702017748
 left:X_4 <= 0.0337792 ? 3.590330569067664
  left:X_3 <= 55.5 ? 1.17898999813184
    left:X_4 <= 0.00251435 ? 1.614396721819876
        left:128.9919833333333
        right:125.90953579676673
    right:X_1 <= 15.4 ? 2.2342245360792994
        left:129.39160280373832
        right:123.80422222222222
  right:X_0 <= 1250.0 ? 9.970884020498868
    left:X_4 <= 0.0483159 ? 6.35527515982486
        left:124.38024528301887
        right:118.30039999999998
    right:X_3 <= 39.6 ? 5.036286657241031
        left:113.58091666666667
        right:118.07284615384616
 right:X_4 <= 0.00146332 ? 29.08299210506528
  left:X_0 <= 8000.0 ? 11.886497073996964
    left:X_2 <= 0.0508 ?

<b>(1) DATA PRE-PROCESSING</b>

In [5]:
# Load the raw California housing dataset through scikit-learn
print("Fetching dataset...", flush=True)
housing = fetch_california_housing()
print("Dataset loaded!", flush=True)

# Feature matrix (X) and dependent variable (y)
X = housing.data
y = housing.target

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

# Alternative: a small synthetic single-feature dataset
# (would additionally need `from sklearn import datasets`)
# X, y = datasets.make_regression(n_samples=150, n_features=1, noise=20, random_state=42)
# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=24)

# Still to do:
#   - data pre-processing
#   - normalization (if needed)
#   - data visualization

Fetching dataset...
Dataset loaded!


<b>(2) ML ALGORITHM - SCRATCH</b>

In [6]:
# FUNCTIONS 
# Providing each tree with a random subset of data (Bootstrap Sampling)
def bootstrap_sample(X, y, rng=None):
    """
    Draws a bootstrap sample: as many rows as the input has, picked uniformly
    at random WITH replacement, keeping each feature row paired with its target.

    Parameters:
        X : (array-like) Feature matrix, one row per sample.
        y : (array-like) Targets, one entry (or row) per sample.
        rng : (optional) Object exposing a NumPy-style ``choice`` method, e.g.
              ``np.random.default_rng(0)`` or ``np.random.RandomState(0)``.
              When None, NumPy's global random state is used, so
              ``np.random.seed(...)`` still controls the draw.

    Returns:
        tuple : (X_sample, y_sample) as NumPy arrays with the same number of rows as X.

    Raises:
        ValueError : If X and y do not have the same number of rows.
    """
    # Coerce first so that lists and pandas objects get positional (row) indexing.
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = X.shape[0]  # Number of samples
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y must have the same number of rows, got {n_samples} and {y.shape[0]}"
        )

    sampler = np.random if rng is None else rng
    idxs = sampler.choice(n_samples, size=n_samples, replace=True)  # Sampling with replacement
    return X[idxs], y[idxs]


In [7]:
# RANDOM FOREST REGRESSOR CLASS
class RandomForestRegressor:
    """
    Bagging ensemble of regression trees.

    Each tree is trained on its own bootstrap sample of the training data and
    the forest's prediction is the mean of the individual trees' predictions.
    """

    def __init__(self, n_trees=100, min_samples_split=2, max_depth=2, n_feats=None):
        """
        Initializes the Random Forest Regressor.
        
        Parameters:
            n_trees : (int) Number of decision trees in the forest.
            min_samples_split : (int) Minimum samples required to split a node.
            max_depth : (int) Maximum depth of each tree.
            n_feats : (int) Intended number of features considered per split.
                      It is stored on the instance but not forwarded to the trees
                      by fit(), so it currently has no effect on training.
        """
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        # NOTE(review): never passed to DecisionTreeRegressor below; wire it through
        # once that class is confirmed to accept a feature-subset argument.
        self.n_feats = n_feats
        self.trees = []  # Stores all decision trees in the forest

    def fit(self, X, y):
        """
        Trains the Random Forest Regressor model.

        Any previously trained trees are discarded, then n_trees new trees are
        fitted, each on a fresh bootstrap sample of (X, y).
        
        Parameters:
            X : (np.array) Feature matrix.
            y : (np.array) Target variable.
        """
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeRegressor(min_samples_split=self.min_samples_split, max_depth=self.max_depth)
            X_sample, y_sample = bootstrap_sample(X, y)
            # The tree is handed the targets as a single column.
            tree.fit(X_sample, y_sample.reshape(-1, 1))
            self.trees.append(tree)

    def predict(self, X):
        """
        Predicts values using the trained Random Forest Regressor.
        
        Parameters:
            X : (np.array) Feature matrix.
            
        Returns:
            np.array : Predicted values (average of all trees' outputs), in the
                       same per-sample shape that a single tree returns.

        Raises:
            ValueError : If the forest holds no trained trees (fit() not called,
                         or n_trees is 0).
        """
        if not self.trees:
            # Without this guard the failure surfaces as an opaque NumPy axis error.
            raise ValueError(
                "RandomForestRegressor has no trained trees; call fit() before predict()."
            )

        # Stack per-tree predictions: axis 0 indexes trees, axis 1 indexes samples.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])

        # Regression averages across trees (a classifier would take a majority vote).
        return np.mean(tree_preds, axis=0)


<b>(3) MODEL TRAINING</b>

In [None]:
# IMPLEMENTING RANDOM FOREST - REGRESSOR
# bootstrap_sample() draws from NumPy's global random state, so seed it here to make
# the bootstrap draws (and therefore the trained forest) repeatable on Restart & Run All.
np.random.seed(1234)

print("Initializing Random Forest Regressor...", flush=True)
reg = RandomForestRegressor(n_trees=3, max_depth=4)  # Small forest (3 trees, depth 4) to start with; raise n_trees once this runs end-to-end

print("Training started...", flush=True)
reg.fit(X_train, y_train)
print("Training completed!", flush=True)

<b>(4) PREDICTION</b>

In [None]:
# Run the held-out test features through the trained forest; each prediction is
# the average of the individual trees' outputs.
print("Predicting...", flush=True)
y_pred = reg.predict(X_test)

<b>(5) EVALUATION-VISUALIZATION</b>

In [None]:
# Score the forest on the held-out set: RMSE is the square root of the mean squared error.
print("Calculating RMSE...", flush=True)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse, flush=True)

<b>CONCLUSION</b>
- Random Forest regression built from Decision Trees (DT)