# Random Forest from Scratch

## Importing Data

## Decision Trees

### Information Gain and Entropy

### Building the Tree and Structure

### Pruning and Overfitting

## Random Forests

### Random Forest as an Ensemble of Decision Trees

### Random Subspace Method for Feature Selection

### Bootstrapping 

### Hyperparameter Tuning

## Evaluation of Models and Cross-validation

## Variations and Improvements

### Feature Importance Estimation with Random Forest

### Handling Missing Data and Outliers

### Extending Random Forest to Handle Imbalanced Datasets

### Multiclass Classification and Regression Problems

## Coursework for Random Forest

In [190]:
from typing import List, Tuple
import math
from collections import Counter


class Node:
    """One node of a binary decision tree.

    A node is a leaf exactly when it has no children. The ``label`` field
    cannot be used for that test: ``Solution._recursive_build`` stores the
    majority label on internal (split) nodes as well as on leaves.
    """

    def __init__(self, split_dim=None, split_point=None, label=None):
        # Index of the feature this node splits on (the tree builder uses -1 for leaves).
        self.split_dim = split_dim
        # Threshold: samples with feature value <= split_point go to the left child.
        self.split_point = split_point
        # Class label associated with this node.
        self.label = label
        # Children are attached after construction by whoever builds the tree.
        self.left = None
        self.right = None

    def is_leaf(self):
        """Return True when the node has neither a left nor a right child."""
        return self.left is None and self.right is None
    
#_______________________________________________________________________________________________

class Solution:
    """Binary decision-tree classifier that splits on maximum information gain.

    Every split sends samples whose feature value is ``<= split_point`` to the
    left child and all other samples to the right child. A subset stops being
    split when it is pure (one distinct label) or when its depth exceeds
    ``max_depth``.
    """

    def __init__(self, max_depth: int = 2):
        """Create an untrained classifier.

        Args:
            max_depth: Deepest level (the root is level 1) at which a node may
                still be split. The default of 2 matches the limit that used
                to be hard-coded in the tree builder.
        """
        self.root = None
        self.max_depth = max_depth

    # ------------------------------------------------------------------
    # Entropy helpers
    # ------------------------------------------------------------------

    def compute_info(self, labels: List[int]) -> float:
        """Return the entropy (in bits) of ``labels``.

        Information is the negative sum, over the classes, of the class
        probability times the base-2 log of that probability. An empty list
        yields 0.0.
        """
        total_samples = len(labels)
        info = 0.0
        # Counter keeps first-seen order, so the summation order is stable.
        for count in Counter(labels).values():
            probability = count / total_samples
            info -= probability * math.log2(probability)
        return info

    def _partition_labels(self, data: List[List[float]], labels: List[int],
                          split_dim: int, split_point: float) -> Tuple[List[int], List[int]]:
        """Split ``labels`` into (left, right) by testing ``data[i][split_dim] <= split_point``."""
        labels_left, labels_right = [], []
        for i, row in enumerate(data):
            if row[split_dim] <= split_point:
                labels_left.append(labels[i])
            else:
                labels_right.append(labels[i])
        return labels_left, labels_right

    def split_info(self, data: List[List[float]], label: List[int], split_dim: int, split_point: float) -> float:
        """Return the weighted entropy remaining after a split.

        Samples whose value in dimension ``split_dim`` is less than or equal
        to ``split_point`` form the left subset; the rest form the right
        subset. The entropy of each subset is weighted by the fraction of
        samples it received.

        Returns:
            The weighted sum of the two subset entropies, or 0.0 when ``data``
            is empty (there is nothing left to be uncertain about).
        """
        total_samples = len(data)
        if total_samples == 0:
            return 0.0

        label_left, label_right = self._partition_labels(data, label, split_dim, split_point)
        p_left = len(label_left) / total_samples
        p_right = len(label_right) / total_samples

        info_left = self.compute_info(label_left)
        info_right = self.compute_info(label_right)
        return p_left * info_left + p_right * info_right

    # ------------------------------------------------------------------
    # Tree construction
    # ------------------------------------------------------------------

    def fit(self, train_data: List[List[float]], train_label: List[int]) -> None:
        """Build the tree from the training set and store it in ``self.root``.

        Raises:
            ValueError: If the training set is empty or the number of labels
                differs from the number of samples.
        """
        if len(train_data) == 0:
            raise ValueError("train_data must contain at least one sample")
        if len(train_data) != len(train_label):
            raise ValueError("train_data and train_label must have the same length")
        self.root = self._recursive_build(train_data, train_label)

    def _recursive_build(self, data: List[List[float]], labels: List[int], depth: int = 1) -> "Node":
        """Recursively grow the subtree for ``data``/``labels`` at level ``depth``.

        Returns a leaf (``split_dim=-1``, ``split_point=-1.0``) carrying the
        majority label when the subset is pure, the depth limit is exceeded,
        or no candidate split exists. Otherwise returns a split node that also
        carries the subset's majority label.
        """
        label_counts = Counter(labels)
        # Majority label of this subset; ties are broken by the smallest label.
        majority_label = min(label_counts, key=lambda lbl: (-label_counts[lbl], lbl))

        # Stopping criteria: pure subset, or deeper than the allowed depth.
        if len(label_counts) == 1 or depth > self.max_depth:
            return Node(label=majority_label, split_dim=-1, split_point=-1.0)

        best_info_gain = float('-inf')
        best_split_dim = -1
        best_split_point = -1

        # Try every candidate threshold of every feature. The strict '>' keeps
        # the first best candidate, i.e. lowest feature index, then lowest
        # threshold.
        for split_dim in range(len(data[0])):
            for split_point in self._calculate_split_points(data, split_dim):
                info_gain = self._calculate_info_gain(data, labels, split_dim, split_point)
                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_split_dim = split_dim
                    best_split_point = split_point

        # No candidate at all (e.g. rows without features): fall back to a leaf
        # instead of indexing with the -1 placeholder.
        if best_split_dim == -1:
            return Node(label=majority_label, split_dim=-1, split_point=-1.0)

        node = Node(split_dim=best_split_dim, split_point=best_split_point, label=majority_label)

        data_left, labels_left, data_right, labels_right = self._split_data(
            data, labels, best_split_dim, best_split_point)
        # Only recurse when the split actually separates the samples; otherwise
        # the node stays childless and is treated as a leaf during traversal.
        if data_left and data_right:
            node.left = self._recursive_build(data_left, labels_left, depth=depth + 1)
            node.right = self._recursive_build(data_right, labels_right, depth=depth + 1)

        return node

    def _split_data(self, data: List[List[float]], labels: List[int], split_dim: int, split_point: float) -> Tuple[List[List[float]], List[int], List[List[float]], List[int]]:
        """Partition samples and labels by ``data[i][split_dim] <= split_point``.

        Returns:
            ``(data_left, labels_left, data_right, labels_right)``.
        """
        data_left, labels_left, data_right, labels_right = [], [], [], []

        for i, row in enumerate(data):
            if row[split_dim] <= split_point:
                data_left.append(row)
                labels_left.append(labels[i])
            else:
                data_right.append(row)
                labels_right.append(labels[i])

        return data_left, labels_left, data_right, labels_right

    def _calculate_info_gain(self, data: List[List[float]], labels: List[int], split_dim: int, split_point: float) -> float:
        """Return the entropy of ``labels`` minus the weighted entropy after the split."""
        return self.compute_info(labels) - self.split_info(data, labels, split_dim, split_point)

    def _calculate_split_points(self, data: List[List[float]], split_dim: int) -> List[float]:
        """Return the midpoints between consecutive sorted values of one feature.

        Duplicate values are kept on purpose: they yield a midpoint equal to
        the value itself, which takes part in the first-best tie-breaking of
        the tree builder.
        """
        attribute_values = sorted(data_point[split_dim] for data_point in data)
        return [(low + high) / 2 for low, high in zip(attribute_values, attribute_values[1:])]

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------

    def classify(self, train_data: List[List[float]], train_label: List[int], test_data: List[List[float]]) -> List[int]:
        """Fit the tree on the training set and return one predicted label per test sample."""
        self.fit(train_data, train_label)
        return [self._traverse_tree(data_point, self.root) for data_point in test_data]

    def _traverse_tree(self, data_point: List[float], node: "Node") -> int:
        """Walk from ``node`` down to a childless node and return its label."""
        while not (node.left is None and node.right is None):
            if data_point[node.split_dim] <= node.split_point:
                node = node.left
            else:
                node = node.right
        return node.label

In [191]:
# Split Info Test

def parse_input_file(file_path):
    """Parse a split-info test case.

    The first line of the file holds the split dimension (int) and the second
    line the split point (float). Every following non-blank line is one sample
    written as ``<label> <index>:<value> <index>:<value> ...``; only the part
    after each ``:`` is kept as the attribute value.

    Returns:
        ``(data, labels, split_dim, split_point)``.
    """
    with open(file_path, 'r') as file:
        split_dim = int(file.readline().strip())
        split_point = float(file.readline().strip())
        data = []
        labels = []
        for line in file:
            parts = line.split()
            if not parts:
                # Blank (or whitespace-only) lines carry no sample; skip them
                # instead of failing on parts[0].
                continue
            labels.append(int(parts[0]))
            data.append([float(attr.split(':')[1]) for attr in parts[1:]])
    return data, labels, split_dim, split_point

# Location of the sample split-info test case (machine-specific absolute path).
input_file = r"C:\Users\rogerree\Downloads\01mSTLanT0igEUUS8kDmDQ_2834ea673e8949dea706a3da771f23f1_PA-DT-Handout\dt_handout\sample_test_cases\split_info\input01.txt"
# Load the samples, their labels, and the split to evaluate.
data, labels, split_dim, split_point = parse_input_file(input_file)

solution = Solution()


# Call the split_info method with the parsed data and show the weighted entropy it returns
result = solution.split_info(data, labels, split_dim, split_point)
print(result)

1.2452987659940624


In [192]:
# Tree Structure Test

# Location of the sample tree-structure test case (machine-specific absolute path).
filename = r"C:\Users\rogerree\Downloads\01mSTLanT0igEUUS8kDmDQ_2834ea673e8949dea706a3da771f23f1_PA-DT-Handout\dt_handout\sample_test_cases\tree_structure\input01.txt"

# Read the input file and extract dataset and labels
def read_input_file(file_path):
    """Read a labelled dataset from a text file.

    Every non-blank line is one sample written as
    ``<label> <index>:<value> <index>:<value> ...``; the label is parsed as an
    int and the part after each ``:`` as a float attribute value.

    Returns:
        ``(dataset, labels)`` as two parallel lists.
    """
    dataset = []
    labels = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) lines carry no sample; skip them
                # instead of failing on fields[0].
                continue
            labels.append(int(fields[0]))
            dataset.append([float(entry.split(':')[1]) for entry in fields[1:]])
    return dataset, labels

def preorder_traversal(node):
    """Serialize the subtree under ``node`` in preorder (node, left, right).

    Each node is rendered as ``{split_dim: ..., split_point: ..., label: ...}``.
    When the node has a child, the rendering of each of its two children is
    wrapped in an extra pair of braces and appended. ``None`` renders as "".
    """
    if node is None:
        return ""
    pieces = ["{" + f"split_dim: {node.split_dim}, split_point: {node.split_point}, label: {node.label}" + "}"]
    if node.left or node.right:
        # Both child slots are emitted, so a missing child shows up as "{}".
        for child in (node.left, node.right):
            pieces.append("{" + preorder_traversal(child) + "}")
    return "".join(pieces)

def inorder_traversal(node):
    """Serialize the subtree under ``node`` in inorder (left, node, right).

    Each node is rendered as ``{split_dim: ..., split_point: ..., label: ...}``.
    When the node has a child, the brace-wrapped rendering of the left child
    is placed before it and that of the right child after it. ``None``
    renders as "".
    """
    if node is None:
        return ""
    own_text = "{" + f"split_dim: {node.split_dim}, split_point: {node.split_point}, label: {node.label}" + "}"
    if not (node.left or node.right):
        # Childless node: nothing surrounds its own rendering.
        return own_text
    left_text = "{" + inorder_traversal(node.left) + "}"
    right_text = "{" + inorder_traversal(node.right) + "}"
    return left_text + own_text + right_text

# File path of the input file (the `filename` assigned earlier in this cell)
input_file = filename

# Read the input file to extract dataset and labels
dataset, labels = read_input_file(input_file)

# Create an instance of the Solution class
solution = Solution()

# Build the decision tree using the fit function
solution.fit(dataset, labels)

# Perform preorder traversal on the decision tree and print its serialization
preorder_result = preorder_traversal(solution.root)
print(preorder_result)

# Empty line separating the two traversal printouts
print()

# Perform inorder traversal on the decision tree and print its serialization
inorder_result = inorder_traversal(solution.root)
print(inorder_result)

{split_dim: 1, split_point: 2.076767332959929, label: 3}{{split_dim: 2, split_point: 2.0413842935532105, label: 2}{{split_dim: -1, split_point: -1.0, label: 2}}{{split_dim: -1, split_point: -1.0, label: 2}}}{{split_dim: 2, split_point: 3.041219385179334, label: 3}{{split_dim: -1, split_point: -1.0, label: 3}}{{split_dim: -1, split_point: -1.0, label: 2}}}

{{{split_dim: -1, split_point: -1.0, label: 2}}{split_dim: 2, split_point: 2.0413842935532105, label: 2}{{split_dim: -1, split_point: -1.0, label: 2}}}{split_dim: 1, split_point: 2.076767332959929, label: 3}{{{split_dim: -1, split_point: -1.0, label: 3}}{split_dim: 2, split_point: 3.041219385179334, label: 3}{{split_dim: -1, split_point: -1.0, label: 2}}}


In [193]:
# Classification Test

def parse_classification_data(file_path):
    """Parse a classification test case into training and test sets.

    Every non-blank line is one sample written as
    ``<label> <index>:<value> <index>:<value> ...``. A label of ``-1`` marks
    an unlabelled test sample; any other label marks a training sample.

    Returns:
        ``(train_data, train_label, test_data)``.
    """
    train_data = []
    train_label = []
    test_data = []

    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank (or whitespace-only) lines carry no sample; skip them
                # instead of failing on parts[0].
                continue
            label = int(parts[0])
            attributes = [float(attr.split(':')[1]) for attr in parts[1:]]

            if label != -1:
                train_data.append(attributes)
                train_label.append(label)
            else:
                test_data.append(attributes)

    return train_data, train_label, test_data

# Location of the sample classification test case (machine-specific absolute path).
input_file = r"C:\Users\rogerree\Downloads\01mSTLanT0igEUUS8kDmDQ_2834ea673e8949dea706a3da771f23f1_PA-DT-Handout\dt_handout\sample_test_cases\classification\input01.txt"
train_data, train_label, test_data = parse_classification_data(input_file)

# Create the classifier before using it, so this cell does not depend on a
# `solution` object left over from a previously executed cell.
solution = Solution()

# Call the classify method with the extracted data: it fits the tree on the
# training samples and predicts one label per test sample.
result = solution.classify(train_data, train_label, test_data)
print(result)

[2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 2, 3, 2, 2, 2, 2, 3, 3, 3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 3, 2, 3, 3, 3, 2, 3, 3, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 3, 2, 2, 3, 2, 2, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 3, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 2, 2, 3, 2, 3, 2, 2, 2, 3, 2, 3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 3, 2, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 3]


In [1]:
# Prints a fixed string. NOTE(review): looks like a leftover scratch cell unrelated to the tree code — confirm it can be removed.
print("hello")

hello


In [2]:
# Prints a fixed string. NOTE(review): looks like a leftover scratch cell unrelated to the tree code — confirm it can be removed.
print("yo")

yo
