# Task
Write a random forest function from scratch in Python, including implementations for decision trees, bagging, and random subspace, and provide a prediction function and evaluation method.

## Implement a decision tree

### Subtask:
Create a function or class for a basic decision tree that can handle splitting data based on features and making predictions. This will be the building block of the random forest.


**Reasoning**:
Implement a class for a decision tree that includes methods for calculating Gini impurity, finding the best split, building the tree recursively, and making predictions. This addresses all parts of the subtask.



In [2]:
import numpy as np
from collections import Counter

class DecisionTree:
    """Classification tree grown greedily by minimising weighted Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested dicts. Internal nodes
    look like ``{'feature_index', 'threshold', 'left', 'right'}`` and leaves
    look like ``{'leaf': label}``. A sample goes left when
    ``x[feature_index] <= threshold`` and right otherwise.

    Labels may be any values ``np.unique`` can sort (negative integers,
    floats, strings, ...), not only non-negative integers.

    Args:
        max_depth: Maximum depth of the tree. ``None`` means the depth is not
            limited and growth stops only through the other rules.
        min_samples_split: Minimum number of samples a node must contain to
            be considered for splitting.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def _gini_impurity(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``.

        An empty label array is defined to have impurity 0.
        """
        if len(y) == 0:
            return 0.0
        # np.unique (unlike np.bincount) accepts negative, float and string
        # labels, so encodings such as -1/+1 work.
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / len(y)
        return 1.0 - np.sum(proportions**2)

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (first seen wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def _best_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted Gini.

        Args:
            X: 2-D feature array of shape (n_samples, n_features).
            y: 1-D label array aligned with the rows of ``X``.

        Returns:
            ``(feature_index, threshold)``, or ``(None, None)`` when the node
            has fewer than ``min_samples_split`` samples or no threshold
            separates the samples into two non-empty groups.
        """
        m, n = X.shape
        if m < self.min_samples_split:
            return None, None

        best_gini = float('inf')
        best_feature_index = None
        best_threshold = None

        for feature_index in range(n):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                # One boolean mask replaces two np.where scans. Rows failing
                # the comparison (NaN included) count as "right", which is
                # the same routing _predict_single applies.
                goes_left = column <= threshold
                n_left = int(np.count_nonzero(goes_left))
                n_right = m - n_left
                if n_left == 0 or n_right == 0:
                    continue

                gini = (
                    n_left * self._gini_impurity(y[goes_left])
                    + n_right * self._gini_impurity(y[~goes_left])
                ) / m

                # Strict '<' keeps the first best candidate on ties.
                if gini < best_gini:
                    best_gini = gini
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def _build_tree(self, X, y, depth):
        """Recursively grow the subtree for the samples ``(X, y)``.

        A leaf is produced when the depth limit is reached, the node is too
        small to split, the node is pure, or no valid split exists.
        """
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if (
            depth_exhausted
            or len(y) < self.min_samples_split
            or len(np.unique(y)) == 1
        ):
            return {'leaf': self._majority_label(y)}

        feature_index, threshold = self._best_split(X, y)
        if feature_index is None:
            return {'leaf': self._majority_label(y)}

        goes_left = X[:, feature_index] <= threshold
        left_subtree = self._build_tree(X[goes_left], y[goes_left], depth + 1)
        right_subtree = self._build_tree(X[~goes_left], y[~goes_left], depth + 1)

        return {'feature_index': feature_index, 'threshold': threshold,
                'left': left_subtree, 'right': right_subtree}

    def fit(self, X, y):
        """Grow the tree from training data.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of n_samples class labels.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or are empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples "
                f"(got {len(X)} and {len(y)})"
            )
        if len(y) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty dataset")
        self.tree = self._build_tree(X, y, 0)

    def _predict_single(self, x, tree):
        """Walk ``tree`` from its root and return the leaf label for ``x``."""
        node = tree
        while 'leaf' not in node:
            if x[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['leaf']

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            A NumPy array holding one predicted label per row.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return np.array([self._predict_single(x, self.tree) for x in np.asarray(X)])

# Example usage (optional - for testing the class)
# from sklearn.model_selection import train_test_split
# from sklearn.datasets import load_iris
#
# iris = load_iris()
# X, y = iris.data, iris.target
#
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#
# tree = DecisionTree(max_depth=3)
# tree.fit(X_train, y_train)
#
# predictions = tree.predict(X_test)
# print(predictions)
# print(y_test)

## Implement bagging (bootstrap aggregating)

### Subtask:
Create a function to generate multiple bootstrap samples of the training data. Each sample will be used to train a separate decision tree.


**Reasoning**:
Define a `bootstrap_sample` function that draws as many rows as the training set contains, with replacement, keeping each feature row paired with its label.



In [3]:
def bootstrap_sample(X, y, random_state=None):
    """Generates a single bootstrap sample of the training data.

    Draws ``len(X)`` row indices uniformly with replacement and applies the
    same indices to ``X`` and ``y`` so every feature row stays paired with
    its label.

    Args:
        X: The training features (array-like, one row per sample).
        y: The training labels (array-like, same length as ``X``).
        random_state: Optional integer seed or ``numpy.random.Generator``
            that makes the draw reproducible. When ``None`` (the default) the
            global ``np.random`` state is used, so ``np.random.seed`` keeps
            working as before.

    Returns:
        A tuple containing the bootstrap sample of features and labels (X_sample, y_sample).

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of samples.
    """
    # Coerce so plain lists support the fancy indexing used below.
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples "
            f"(got {n_samples} and {len(y)})"
        )

    if random_state is None:
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
    else:
        rng = np.random.default_rng(random_state)
        indices = rng.choice(n_samples, size=n_samples, replace=True)

    return X[indices], y[indices]

# Example usage (optional - for testing the function)
# from sklearn.datasets import load_iris
#
# iris = load_iris()
# X, y = iris.data, iris.target
#
# X_sample, y_sample = bootstrap_sample(X, y)
# print("Original data shape:", X.shape, y.shape)
# print("Bootstrap sample shape:", X_sample.shape, y_sample.shape)

## Implement random subspace

### Subtask:
Modify the decision tree implementation so that each split point considers only a randomly selected subset of the features. This makes the individual trees less correlated with one another, which lowers the variance of the combined forest and helps prevent overfitting.


**Reasoning**:
Modify the `DecisionTree` class to include the `max_features` parameter and update the `_best_split` method to randomly select a subset of features.



In [4]:
import numpy as np
from collections import Counter

class DecisionTree:
    """Gini-based classification tree with per-split random feature subsets.

    The fitted tree is stored in ``self.tree`` as nested dicts. Internal nodes
    look like ``{'feature_index', 'threshold', 'left', 'right'}`` and leaves
    look like ``{'leaf': label}``. A sample goes left when
    ``x[feature_index] <= threshold`` and right otherwise.

    Labels may be any values ``np.unique`` can sort (negative integers,
    floats, strings, ...), not only non-negative integers.

    Args:
        max_depth: Maximum depth of the tree. ``None`` means the depth is not
            limited and growth stops only through the other rules.
        min_samples_split: Minimum number of samples a node must contain to
            be considered for splitting.
        max_features: How many features to examine at each split. ``None``
            uses all of them, an integer is an absolute count, and a float is
            a fraction of the feature count. The resulting count is clamped
            to ``[1, n_features]`` so at least one feature is always
            examined. Any other type falls back to all features.
        random_state: Optional integer seed or ``numpy.random.Generator``
            controlling the feature draws. When ``None`` (the default) the
            global ``np.random`` state is used.
    """

    def __init__(self, max_depth=None, min_samples_split=2, max_features=None,
                 random_state=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.random_state = random_state
        self.tree = None
        self._rng = self._make_rng()

    def _make_rng(self):
        """Return a Generator built from ``random_state``, or None if unset."""
        if self.random_state is None:
            return None
        return np.random.default_rng(self.random_state)

    def _gini_impurity(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``.

        An empty label array is defined to have impurity 0.
        """
        if len(y) == 0:
            return 0.0
        # np.unique (unlike np.bincount) accepts negative, float and string
        # labels, so encodings such as -1/+1 work.
        _, counts = np.unique(y, return_counts=True)
        proportions = counts / len(y)
        return 1.0 - np.sum(proportions**2)

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (first seen wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def _num_features_to_consider(self, n):
        """Translate ``max_features`` into a feature count for ``n`` features."""
        max_features = self.max_features
        if isinstance(max_features, (int, np.integer)):
            requested = int(max_features)
        elif isinstance(max_features, (float, np.floating)):
            requested = int(max_features * n)
        else:
            # None, or an unsupported type: default to all features.
            return n
        # Without the lower bound a small fraction (e.g. 0.1 of 4 features)
        # truncates to 0, no feature is examined and the tree degenerates to
        # a single leaf. Without the upper bound the draw below would fail.
        return min(n, max(1, requested))

    def _best_split(self, X, y):
        """Find the best split among a random subset of the features.

        Args:
            X: 2-D feature array of shape (n_samples, n_features).
            y: 1-D label array aligned with the rows of ``X``.

        Returns:
            ``(feature_index, threshold)`` with the lowest weighted Gini
            impurity among the sampled features, or ``(None, None)`` when the
            node has fewer than ``min_samples_split`` samples or none of the
            sampled features can separate the samples into two non-empty
            groups.
        """
        m, n = X.shape
        if m < self.min_samples_split:
            return None, None

        best_gini = float('inf')
        best_feature_index = None
        best_threshold = None

        # Draw the candidate features without replacement.
        num_features_to_consider = self._num_features_to_consider(n)
        if self._rng is None:
            feature_indices_to_consider = np.random.choice(
                n, num_features_to_consider, replace=False)
        else:
            feature_indices_to_consider = self._rng.choice(
                n, num_features_to_consider, replace=False)

        for feature_index in feature_indices_to_consider:
            column = X[:, feature_index]
            for threshold in np.unique(column):
                # One boolean mask replaces two np.where scans. Rows failing
                # the comparison (NaN included) count as "right", which is
                # the same routing _predict_single applies.
                goes_left = column <= threshold
                n_left = int(np.count_nonzero(goes_left))
                n_right = m - n_left
                if n_left == 0 or n_right == 0:
                    continue

                gini = (
                    n_left * self._gini_impurity(y[goes_left])
                    + n_right * self._gini_impurity(y[~goes_left])
                ) / m

                # Strict '<' keeps the first best candidate on ties.
                if gini < best_gini:
                    best_gini = gini
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def _build_tree(self, X, y, depth):
        """Recursively grow the subtree for the samples ``(X, y)``.

        A leaf is produced when the depth limit is reached, the node is too
        small to split, the node is pure, or no valid split exists among the
        sampled features.
        """
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if (
            depth_exhausted
            or len(y) < self.min_samples_split
            or len(np.unique(y)) == 1
        ):
            return {'leaf': self._majority_label(y)}

        feature_index, threshold = self._best_split(X, y)
        if feature_index is None:
            return {'leaf': self._majority_label(y)}

        goes_left = X[:, feature_index] <= threshold
        left_subtree = self._build_tree(X[goes_left], y[goes_left], depth + 1)
        right_subtree = self._build_tree(X[~goes_left], y[~goes_left], depth + 1)

        return {'feature_index': feature_index, 'threshold': threshold,
                'left': left_subtree, 'right': right_subtree}

    def fit(self, X, y):
        """Grow the tree from training data.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of n_samples class labels.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or are empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples "
                f"(got {len(X)} and {len(y)})"
            )
        if len(y) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty dataset")
        # Re-derive the generator so refitting with an integer seed
        # reproduces the same tree.
        self._rng = self._make_rng()
        self.tree = self._build_tree(X, y, 0)

    def _predict_single(self, x, tree):
        """Walk ``tree`` from its root and return the leaf label for ``x``."""
        node = tree
        while 'leaf' not in node:
            if x[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['leaf']

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            A NumPy array holding one predicted label per row.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return np.array([self._predict_single(x, self.tree) for x in np.asarray(X)])