In [1]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Toy training set: four samples (rows), two numeric features (columns).
X = np.array([[3, 7], [1, 8], [4, 5], [2, 6]])

# Binary class label for each row of X, in the same order.
y = np.array([1, 0, 1, 0])


def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, same shape as ``y_true``.

    Returns:
        The mean of the element-wise equality of the two inputs, as
        computed by ``np.mean``.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Convert first: `==` on two plain lists yields a single bool rather
    # than an element-wise comparison, which would score 0.0 or 1.0 only.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        # Refuse to broadcast: a (n,) vs (n, 1) comparison would silently
        # produce an (n, n) matrix and a meaningless score.
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

# Reference model: scikit-learn forest with 100 trees and a fixed seed.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X, y)

# Scored on the same rows used for fitting, so this is training accuracy.
predictions = clf.predict(X)
acc = accuracy(y_true=y, y_pred=predictions)

print("Accuracy:", acc)

Accuracy: 1.0


In [9]:
import numpy as np

class Node:
  """One vertex of a binary decision tree.

  An internal node carries a split rule (``feature``, ``threshold``) and its
  ``left`` / ``right`` children; a leaf carries only ``value``.
  """

  def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
    # Split rule: column index and cut-off compared during tree traversal.
    self.feature, self.threshold = feature, threshold
    # Child subtrees.
    self.left, self.right = left, right
    # Stored prediction; keyword-only, left as None on internal nodes.
    self.value = value

  def is_leaf_node(self):
    """Return True when this node has a stored ``value`` (anything but None)."""
    return not (self.value is None)

class DecisionTree:
  """Classification tree grown by greedy, entropy-based binary splits.

  Growth continues until a node is pure (one label) or until no threshold
  can separate its samples, in which case the node becomes a leaf that
  predicts the majority label. There is no depth or sample-count limit.
  """

  def __init__(self):
    # Root Node of the fitted tree; stays None until fit() is called.
    self.root = None

  def fit(self, X, y):
    """Grow the tree from training data.

    Args:
      X: 2-D array-like, one row per sample and one column per feature.
      y: Array-like of class labels, one per row of X.

    Raises:
      ValueError: If X is not 2-D, has no rows, or its row count differs
        from len(y).
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2:
      raise ValueError(f"X must be 2-D (n_samples, n_features), got {X.ndim}-D")
    if X.shape[0] == 0 or X.shape[0] != len(y):
      raise ValueError(
        "X and y must be non-empty with matching lengths, "
        f"got {X.shape[0]} rows and {len(y)} labels"
      )
    self.root = self._grow_tree(X, y)

  def _grow_tree(self, X, y):
    """Recursively build the subtree for samples (X, y); return its root Node."""
    # Stopping criterion: every remaining sample carries the same label.
    if len(np.unique(y)) == 1:
      return Node(value=self._most_common_label(y))

    best_feature, best_threshold = self._best_split(X, y)
    if best_feature is None:
      # No threshold puts samples on both sides (e.g. identical rows with
      # different labels). Splitting further would recurse on the same data
      # forever, so stop with a majority-vote leaf.
      return Node(value=self._most_common_label(y))

    # Create child nodes. Both sides are non-empty because _best_split never
    # returns a degenerate threshold.
    left_idxs, right_idxs = self._split(X[:, best_feature], best_threshold)
    left = self._grow_tree(X[left_idxs, :], y[left_idxs])
    right = self._grow_tree(X[right_idxs, :], y[right_idxs])
    return Node(best_feature, best_threshold, left, right)

  def _best_split(self, X, y):
    """Find the (feature index, threshold) pair with the highest information gain.

    Candidate thresholds are the midpoints between consecutive values of each
    sorted feature column.

    Returns:
      (split_idx, split_threshold), or (None, None) when no candidate
      threshold leaves samples on both sides.
    """
    best_gain = -np.inf
    split_idx, split_threshold = None, None
    for feat_idx in range(X.shape[1]):
      X_column = X[:, feat_idx]
      X_column_sorted = np.sort(X_column)
      # Midpoints between each consecutive pair of sorted values.
      thresholds = (X_column_sorted[:-1] + X_column_sorted[1:]) / 2

      for thr in thresholds:
        gain = self._information_gain(y, X_column, thr)
        # Strict comparison: the first of several equally good splits wins,
        # and degenerate splits (gain == -inf) are never selected.
        if gain > best_gain:
          best_gain = gain
          split_idx = feat_idx
          split_threshold = thr
    return split_idx, split_threshold

  def _information_gain(self, y, X_column, threshold):
    """Return the entropy reduction obtained by splitting at ``threshold``.

    Computed as the parent entropy minus the size-weighted average entropy of
    the two children. Returns ``-inf`` when the threshold sends every sample
    to one side, so that such a split can never be chosen.
    """
    left_idxs, right_idxs = self._split(X_column, threshold)
    n_l, n_r = len(left_idxs), len(right_idxs)
    if n_l == 0 or n_r == 0:
      return -np.inf

    n = len(y)
    e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
    child_entropy = (n_l / n) * e_l + (n_r / n) * e_r
    return self._entropy(y) - child_entropy

  def _split(self, X_column, split_thresh):
    """Return index arrays of samples with value <= and > ``split_thresh``."""
    left_idxs = np.flatnonzero(X_column <= split_thresh)
    right_idxs = np.flatnonzero(X_column > split_thresh)
    return left_idxs, right_idxs

  def _entropy(self, y):
    """Return the Shannon entropy (natural log) of the label array ``y``."""
    _, counts = np.unique(y, return_counts=True)
    # Every count comes from an observed label, so each probability is > 0
    # and log() needs no epsilon guard; a pure node has entropy exactly 0.
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log(probabilities))

  def _most_common_label(self, y):
    """Return the most frequent label in ``y``.

    Ties go to the smallest label: np.unique returns labels sorted and
    np.argmax picks the first maximum.
    """
    unique_labels, counts = np.unique(y, return_counts=True)
    return unique_labels[np.argmax(counts)]

  def predict(self, X):
    """Return an array with the predicted label for each row of ``X``."""
    return np.array([self._traverse_tree(x, self.root) for x in X])

  def _traverse_tree(self, x, node):
    """Follow the split rules from ``node`` down to a leaf for sample ``x``."""
    # Iterative descent: values <= threshold go left, the rest go right.
    while not node.is_leaf_node():
      node = node.left if x[node.feature] <= node.threshold else node.right
    return node.value

class RandomForest:
  """Bagging ensemble of DecisionTree classifiers combined by majority vote.

  Each tree is fitted on a bootstrap sample (rows drawn with replacement)
  of the training data.

  Args:
    n_trees: Number of trees to fit.
    n_features: Stored on the instance but not used by any method of this
      class.
    random_state: Optional seed. When given, bootstrap sampling uses a
      private ``np.random.RandomState`` re-created at the start of every
      ``fit`` call, so repeated fits are reproducible. When None (default),
      sampling draws from NumPy's global ``np.random`` state.
  """

  def __init__(self, n_trees=10, n_features=None, random_state=None):
    self.n_trees = n_trees
    self.n_features = n_features
    self.random_state = random_state
    self.trees = []
    self._rng = self._new_rng()

  def _new_rng(self):
    """Return a fresh seeded RandomState, or None to use the global np.random."""
    if self.random_state is None:
      return None
    return np.random.RandomState(self.random_state)

  def fit(self, X, y):
    """Fit ``n_trees`` trees, each on its own bootstrap sample of (X, y).

    Any previously fitted trees are discarded.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    # Restart the private generator so refitting reproduces the same forest.
    self._rng = self._new_rng()
    self.trees = []

    for _ in range(self.n_trees):
      tree = DecisionTree()
      X_sample, y_sample = self._bootstrap_samples(X, y)
      tree.fit(X_sample, y_sample)
      self.trees.append(tree)

  def _most_common_label(self, y):
    """Return the most frequent label in ``y``.

    Ties go to the smallest label: np.unique returns labels sorted and
    np.argmax picks the first maximum.
    """
    unique_labels, counts = np.unique(y, return_counts=True)
    return unique_labels[np.argmax(counts)]

  def _bootstrap_samples(self, X, y):
    """Draw ``len(X)`` rows with replacement; return the matching (X, y) pair."""
    n_samples = X.shape[0]
    rng = getattr(self, "_rng", None)
    if rng is None:
      # Unseeded: fall back to NumPy's global random state.
      rng = np.random
    idxs = rng.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]

  def predict(self, X):
    """Return the majority-vote label across all trees for each row of ``X``.

    Raises:
      ValueError: If the forest holds no fitted trees.
    """
    if not self.trees:
      raise ValueError(
        "RandomForest has no fitted trees; call fit() with n_trees >= 1 before predict()."
      )
    # Shape (n_trees, n_samples) -> (n_samples, n_trees): one row of votes per sample.
    predictions = np.array([tree.predict(X) for tree in self.trees])
    tree_preds = np.swapaxes(predictions, 0, 1)
    return np.array([self._most_common_label(pred) for pred in tree_preds])

# Toy training set, redefined so this cell runs on its own:
# four samples (rows), two numeric features (columns).
X = np.array([[3, 7], [1, 8], [4, 5], [2, 6]])

# Binary class label for each row of X, in the same order.
y = np.array([1, 0, 1, 0])


def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, same shape as ``y_true``.

    Returns:
        The mean of the element-wise equality of the two inputs, as
        computed by ``np.mean``.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Convert first: `==` on two plain lists yields a single bool rather
    # than an element-wise comparison, which would score 0.0 or 1.0 only.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        # Refuse to broadcast: a (n,) vs (n, 1) comparison would silently
        # produce an (n, n) matrix and a meaningless score.
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

# Seed NumPy's global RNG: RandomForest draws its bootstrap rows from
# np.random, so without this every run of the cell trains a different forest.
np.random.seed(42)

clf = RandomForest()
clf.fit(X, y)

# Scored on the same rows used for fitting, so this is training accuracy.
predictions = clf.predict(X)
acc = accuracy(y, predictions)

print("Accuracy:", acc)

Accuracy: 1.0
