<a href="https://colab.research.google.com/github/rajdeepbanerjee-git/JNCLectures_Intro_to_ML/blob/main/Week11/2025/Writing_RF_using_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from collections import Counter

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

In [2]:
class SimpleRandomForestClassifier:
    """
    A simple implementation of the Random Forest Classifier using sklearn's DecisionTreeClassifier.

    Every tree is trained on its own bootstrap sample of the training data and
    is restricted to ``max_features`` candidate features per split; the forest
    predicts by majority vote over the trees.

    Parameters
    ----------
    n_estimators : int
        Number of decision trees to use in the forest (must be at least 1).
    max_features : int or float or 'sqrt' or 'log2'
        Number of features to consider when looking for the best split.
        Forwarded unchanged to each DecisionTreeClassifier.
    max_depth : int or None
        Maximum depth of the individual decision trees.
        Forwarded unchanged to each DecisionTreeClassifier.
    random_state : int or None
        Seed for reproducibility. With an int, repeated calls to ``fit`` on the
        same data draw the same bootstrap samples and seed the trees the same way.

    Attributes
    ----------
    trees : list
        The fitted trees, in training order. Empty until ``fit`` is called.
    bootstraps : list of tuple
        ``(X_sample, y_sample)`` used to train the tree at the same index.
    """
    def __init__(self, n_estimators=10, max_features='sqrt', max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []
        self.bootstraps = []

    def _bootstrap_sample(self, X, y, random_state=None):
        """
        Generate a bootstrap sample (random sampling with replacement).

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Data to sample rows from.
        y : ndarray of shape (n_samples,)
            Labels, sampled with the same row indices as ``X``.
        random_state : int, numpy RandomState instance or None
            Randomness source handed to ``resample``. When None, falls back to
            ``self.random_state``.

        Returns
        -------
        X_sample, y_sample : the resampled data and labels, each with as many
            rows as ``X``.
        """
        n_samples = X.shape[0]
        if random_state is None:
            random_state = self.random_state
        X_sample, y_sample = resample(
            X, y,
            n_samples=n_samples,
            random_state=random_state
        )
        return X_sample, y_sample

    def fit(self, X, y):
        """
        Train the Random Forest classifier.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.
        y : ndarray of shape (n_samples,)
            Target labels.

        Returns
        -------
        self : SimpleRandomForestClassifier
            The fitted forest, so calls can be chained.

        Raises
        ------
        ValueError
            If ``n_estimators`` is smaller than 1.
        """
        if self.n_estimators < 1:
            raise ValueError(
                f"n_estimators must be at least 1, got {self.n_estimators}."
            )

        # Start from a clean slate so refitting never mixes old and new state.
        self.trees = []
        self.bootstraps = []

        # One local generator shared by all bootstrap draws: its state advances
        # on every draw, so each tree sees a *different* sample while the whole
        # sequence stays reproducible for a fixed seed. Passing the same integer
        # seed each time would make every bootstrap sample identical. A local
        # generator also leaves numpy's global random state untouched.
        rng = np.random.RandomState(self.random_state)

        for i in range(self.n_estimators):
            # Bootstrap sampling
            X_sample, y_sample = self._bootstrap_sample(X, y, random_state=rng)

            # Distinct but deterministic seed per tree, so the random feature
            # subsets differ between trees.
            tree_seed = None if self.random_state is None else self.random_state + i

            # Initialize Decision Tree with feature subsetting
            tree = DecisionTreeClassifier(
                max_features=self.max_features,
                max_depth=self.max_depth,
                random_state=tree_seed
            )
            tree.fit(X_sample, y_sample)

            self.trees.append(tree)
            self.bootstraps.append((X_sample, y_sample))

        return self

    def _most_common_label(self, predictions):
        """
        Return the most common label in a list of predictions.

        Ties go to the label that appears first in ``predictions``.
        """
        return Counter(predictions).most_common(1)[0][0]

    def predict(self, X):
        """
        Predict the class labels for the given data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Input samples.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels.

        Raises
        ------
        ValueError
            If the forest has no trained trees (``fit`` was not called).
        """
        if not self.trees:
            raise ValueError(
                "This SimpleRandomForestClassifier is not fitted yet; "
                "call fit() before predict()."
            )

        # Collect predictions from all trees: shape (n_trees, n_samples)
        all_preds = np.array([tree.predict(X) for tree in self.trees])

        # Majority vote: each row of the transpose holds all trees' votes for one sample
        y_pred = [self._most_common_label(preds) for preds in all_preds.T]
        return np.array(y_pred)

In [5]:
# Walk through the majority vote step by step using Counter.
predictions = [1, 0, 1, 1, 0]
vote_counts = Counter(predictions)
top_entries = vote_counts.most_common(1)
winner_with_count = top_entries[0]
print(vote_counts)  # tally per label: 1 shows up 3 times, 0 shows up 2 times
print(top_entries)  # list holding the single most frequent (label, count) pair
print(winner_with_count)  # the pair itself: (1, 3)
print(winner_with_count[0])  # only the winning label: 1

Counter({1: 3, 0: 2})
[(1, 3)]
(1, 3)
1


In [11]:
# One seed for both the train/test split and the forest keeps the run reproducible
SEED = 42

# Load dataset and hold out 30% of it for evaluation
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Train custom Random Forest
rf = SimpleRandomForestClassifier(n_estimators=5, max_depth=3, random_state=SEED)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Sanity check: exactly one predicted label per test sample
assert y_pred.shape == y_test.shape, "expected one prediction per test sample"

# Evaluate
print(f"Accuracy: {np.round(accuracy_score(y_test, y_pred), 4)}")


Accuracy: 0.9556


#### In the above example, how many features are considered at each node when searching for the best split?