In [1]:
import numpy as np
from sklearn.datasets import load_iris
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

In [2]:
# --- Data: iris feature matrix and class labels
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# --- Model: scikit-learn random forest (100 Gini trees), constructed and trained in one step
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=42,
).fit(X_train, y_train)

# --- Evaluation: predict the held-out samples and score them
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 100.00%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [3]:
# Display the class labels predicted for the test set by the scikit-learn forest
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0])

## Random Forest From Scratch

In [4]:
class Node:
    """A single node of a binary decision tree.

    Internal nodes describe a split (``feature``, ``threshold``) and point to
    their ``left`` and ``right`` children; leaves only store ``value``.
    Every attribute defaults to ``None``.
    """

    def __init__(self, value=None, feature=None, threshold=None, left=None, right=None):
        # Split description (used by internal nodes).
        self.feature, self.threshold = feature, threshold
        # Child subtrees (used by internal nodes).
        self.left, self.right = left, right
        # Stored label (used by leaves).
        self.value = value

class DecisionTree:
    """Classification tree grown with the Gini criterion.

    At every node only ``int(sqrt(n_features))`` randomly chosen features are
    examined (drawn through the global ``np.random`` state). A branch stops
    growing when its samples share one label or when the chosen split would
    send every sample to the same side.
    """

    def __init__(self):
        # Root ``Node`` of the fitted tree; stays ``None`` until ``fit`` runs.
        self.tree = None

    def gini_impurity(self, y):
        """Return the Gini impurity ``sum(p_k * (1 - p_k))`` of the labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return np.sum(probabilities * (1 - probabilities))

    def gini_index(self, X, y, feature, threshold):
        """Return the impurity decrease obtained by splitting on ``feature``.

        Samples with ``X[:, feature] <= threshold`` go left, the rest go right.
        The result is the parent impurity minus the size-weighted impurity of
        the two children (despite the method name, this is a gain: larger is
        better).
        """
        mask = X[:, feature] <= threshold
        n_samples = len(y)
        # count_nonzero runs in C; the builtin sum() would iterate the mask
        # element by element in Python.
        n_left = np.count_nonzero(mask)
        n_right = n_samples - n_left
        weighted_child_impurity = (
            (n_left / n_samples) * self.gini_impurity(y[mask])
            + (n_right / n_samples) * self.gini_impurity(y[~mask])
        )
        return self.gini_impurity(y) - weighted_child_impurity

    def find_split(self, X, y):
        """Return ``(feature, threshold)`` of the best split among a random feature subset.

        Every distinct value of each selected feature is tried as a threshold;
        the first candidate reaching the highest gain wins.
        """
        n_total = X.shape[1]
        n_candidates = int(np.sqrt(n_total))
        selected_features = np.random.choice(n_total, n_candidates, replace=False)

        best_feature, best_threshold, max_gain = None, None, -1
        for feature in selected_features:
            for threshold in np.unique(X[:, feature]):
                gain = self.gini_index(X, y, feature, threshold)
                if gain > max_gain:
                    best_feature, best_threshold, max_gain = feature, threshold, gain
        return best_feature, best_threshold

    def build_tree(self, X, y):
        """Recursively grow the subtree for samples ``X`` / labels ``y`` and return its root.

        Raises:
            ValueError: if ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("Cannot build a decision tree from an empty sample.")

        classes = np.unique(y)
        if len(classes) == 1:
            # Pure node: nothing left to separate.
            return Node(value=classes[0])

        feature, threshold = self.find_split(X, y)

        mask = X[:, feature] <= threshold
        if not mask.any() or mask.all():
            # The best available split leaves one side empty, so fall back to
            # a majority-vote leaf.
            return Node(value=Counter(y).most_common(1)[0][0])

        left = self.build_tree(X[mask], y[mask])
        right = self.build_tree(X[~mask], y[~mask])
        return Node(feature=feature, threshold=threshold, left=left, right=right)

    def prediction(self, X):
        """Return the label stored in the leaf reached by the single sample ``X``."""
        node = self.tree
        # Internal nodes have no value; identity comparison is the correct
        # test for None.
        while node.value is None:
            if X[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Return an array with one predicted label per row of the 2-D array ``X``."""
        return np.apply_along_axis(self.prediction, axis=1, arr=X)

    def fit(self, X, y):
        """Grow the tree on samples ``X`` (2-D) and labels ``y`` (one per row).

        Inputs are converted with ``np.asarray`` so array-likes such as nested
        lists are accepted.

        Raises:
            ValueError: if the data is empty, ``X`` is not 2-D, or ``X`` and
                ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.size == 0 or y.size == 0:
            raise ValueError("Cannot fit a decision tree on empty data.")
        if X.ndim != 2 or y.ndim < 1 or X.shape[0] != y.shape[0]:
            raise ValueError("X must be 2-D with exactly one row per label in y.")
        self.tree = self.build_tree(X, y)

class RandomForest:
    """Bagging ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Each tree is trained on a bootstrap sample (drawn with replacement through
    the global ``np.random`` state) of the training data.
    """

    def __init__(self, n_estimators=100):
        # Fitted trees; rebuilt from scratch by every call to ``fit``.
        self.trees = []
        # Number of trees grown by ``fit``.
        self.n_estimators = n_estimators

    def bootstrap_sample(self, X, y):
        """Return ``(X, y)`` resampled with replacement to the original number of rows."""
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Inputs are converted with ``np.asarray`` so that array-likes such as
        lists support the index-array selection used for bootstrapping.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Start from an empty ensemble: without this reset a second fit()
        # would keep the trees of the previous fit and vote with both sets.
        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self.bootstrap_sample(X, y)
            tree = DecisionTree()
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote label of all trees for every row of ``X``.

        Raises:
            ValueError: if the forest holds no trees (``fit`` not called yet,
                or fitted with ``n_estimators=0``).
        """
        if not self.trees:
            raise ValueError(
                "RandomForest has no trees; call fit() with n_estimators >= 1 first."
            )
        X = np.asarray(X)
        # Shape (n_trees, n_samples); transposing yields one row of votes per sample.
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        predictions = [
            Counter(votes).most_common(1)[0][0] for votes in tree_predictions.T
        ]
        return np.array(predictions)

In [5]:
# Seed NumPy's global generator: the from-scratch forest draws its bootstrap
# samples and per-split feature subsets through np.random, so without a seed
# the metrics reported below change on every run.
np.random.seed(42)

model = RandomForest()
model.fit(X_train, y_train)
pred = model.predict(X_test)

In [6]:
# Score the from-scratch forest on the held-out test set
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1 breakdown
print("Classification Report:")
report = classification_report(y_test, pred)
print(report)

Accuracy: 100.00%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

