# Decision Tree from Scratch - Classification

A bare-bones implementation of a decision tree classifier in the style of ID3/CART: binary threshold splits chosen by information gain, measured with Gini impurity or entropy.

**Key Concepts:**
- Information Gain / Gini Impurity
- Recursive tree building
- Split selection
- Prediction via tree traversal


In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## Tree Node Class

In [None]:
class Node:
    """
    A single node of the decision tree.

    A node plays one of two roles:
    - Internal node: carries a split (feature index + threshold) and
      references to its left/right children
    - Leaf node: carries a value (class label)
    """
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split definition: feature index and the threshold compared against it
        self.feature, self.threshold = feature, threshold
        # Child nodes
        self.left, self.right = left, right
        # Class label; stays None unless passed by keyword (leaf nodes)
        self.value = value

    def is_leaf_node(self):
        """Return True when this node holds a value, i.e. it is a leaf."""
        return not (self.value is None)

## Decision Tree Classifier

In [None]:
class DecisionTreeClassifier:
    """
    Decision Tree Classifier from scratch.

    The tree is binary: at each internal node, samples whose value for the
    chosen feature is ``<= threshold`` go to the left child and the rest go
    to the right child. Splits are chosen to maximize the impurity
    reduction (information gain) under the selected criterion.

    Parameters:
    -----------
    max_depth : int, default=10
        Maximum depth of the tree
    min_samples_split : int, default=2
        Minimum samples required to split a node
    criterion : str, default='gini'
        Split criterion: 'gini' or 'entropy'

    Raises:
    -------
    ValueError
        If ``criterion`` is not 'gini' or 'entropy'.
    """

    def __init__(self, max_depth=10, min_samples_split=2, criterion='gini'):
        # Reject typos such as 'Gini' up front instead of silently
        # falling back to entropy in _impurity.
        if criterion not in ('gini', 'entropy'):
            raise ValueError(
                f"criterion must be 'gini' or 'entropy', got {criterion!r}"
            )
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.root = None

    def fit(self, X, y):
        """
        Build decision tree from training data.

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training features; converted with ``np.asarray``.
        y : array-like of shape (n_samples,)
            Class labels; converted with ``np.asarray``.

        Returns:
        --------
        self

        Raises:
        -------
        ValueError
            If X is not 2-D, y is not 1-D, their lengths differ, or there
            are no samples.
        """
        # Accept lists / DataFrames / Series: the tree code relies on
        # ndarray-style indexing (X[:, j], boolean masks).
        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")

        self.root = self._grow_tree(X, y)
        return self

    def _grow_tree(self, X, y, depth=0):
        """
        Recursively grow the decision tree.

        Returns a leaf holding the majority label when a stopping criterion
        is met or when no split separates the samples; otherwise returns an
        internal node whose children are grown on the two sides of the
        best split.
        """
        n_samples = X.shape[0]
        n_classes = len(np.unique(y))

        # Stopping criteria
        if (depth >= self.max_depth or
            n_classes == 1 or
            n_samples < self.min_samples_split):
            return Node(value=self._most_common_label(y))

        # Find best split
        best_feature, best_threshold = self._best_split(X, y)

        # No feature has two distinct values (e.g. duplicate rows with
        # different labels): nothing can be separated, so stop here.
        if best_feature is None:
            return Node(value=self._most_common_label(y))

        # Split data
        left_idxs = X[:, best_feature] <= best_threshold
        right_idxs = X[:, best_feature] > best_threshold

        # Defensive guard: never recurse into an empty subset, which would
        # make _most_common_label fail on an empty label array.
        if not left_idxs.any() or not right_idxs.any():
            return Node(value=self._most_common_label(y))

        # Recursively grow children
        left = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)

        return Node(best_feature, best_threshold, left, right)

    def _best_split(self, X, y):
        """
        Find the best feature and threshold to split on.

        Returns:
        --------
        (best_feature, best_threshold)
            Feature index and threshold with the highest information gain,
            or ``(None, None)`` when no feature has at least two distinct
            values (so no threshold can separate the samples).
        """
        best_gain = float('-inf')
        best_feature = None
        best_threshold = None

        n_features = X.shape[1]

        for feature_idx in range(n_features):
            X_column = X[:, feature_idx]
            # np.unique returns sorted values. The largest value is dropped
            # because "<= max" sends every sample left and leaves the right
            # side empty, which is not a real split.
            thresholds = np.unique(X_column)[:-1]

            for threshold in thresholds:
                # Calculate information gain
                gain = self._information_gain(y, X_column, threshold)

                # Strict ">" keeps the first candidate among ties
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, y, X_column, threshold):
        """
        Calculate information gain from a split.

        The gain is the parent impurity minus the size-weighted average of
        the children's impurities. Returns 0 when the split leaves either
        side empty.
        """
        # Split
        left_idxs = X_column <= threshold
        right_idxs = X_column > threshold
        y_left, y_right = y[left_idxs], y[right_idxs]

        if len(y_left) == 0 or len(y_right) == 0:
            return 0

        # Parent impurity
        parent_impurity = self._impurity(y)

        # Weighted average of children impurity
        n = len(y)
        n_left, n_right = len(y_left), len(y_right)
        impurity_left = self._impurity(y_left)
        impurity_right = self._impurity(y_right)
        child_impurity = (n_left / n) * impurity_left + (n_right / n) * impurity_right

        # Information gain
        return parent_impurity - child_impurity

    def _impurity(self, y):
        """
        Calculate impurity (Gini or Entropy).

        Uses Gini when ``self.criterion == 'gini'`` and entropy otherwise.
        """
        if self.criterion == 'gini':
            return self._gini(y)
        return self._entropy(y)

    def _gini(self, y):
        """
        Calculate Gini impurity.
        Gini = 1 - Σ(p_i^2)
        """
        counter = Counter(y)
        n = len(y)
        gini = 1.0
        for count in counter.values():
            p = count / n
            gini -= p ** 2
        return gini

    def _entropy(self, y):
        """
        Calculate entropy.
        Entropy = -Σ(p_i * log2(p_i))
        """
        counter = Counter(y)
        n = len(y)
        entropy = 0.0
        for count in counter.values():
            p = count / n
            if p > 0:
                entropy -= p * np.log2(p)
        return entropy

    def _most_common_label(self, y):
        """
        Return the most common label in y.

        y must be non-empty; _grow_tree never calls this on an empty subset.
        """
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify; converted with ``np.asarray`` so that
            iteration yields one row per sample.

        Returns:
        --------
        np.ndarray with one predicted label per row of X.
        """
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """
        Traverse tree to predict single sample.

        Follows the left child when ``x[node.feature] <= node.threshold``
        and the right child otherwise, until a leaf is reached.
        """
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def print_tree(self, node=None, depth=0):
        """
        Print tree structure.

        Starts at the root when ``node`` is None; each level is indented by
        two spaces per unit of ``depth``.
        """
        if node is None:
            node = self.root

        if node.is_leaf_node():
            print("  " * depth + f"Leaf: {node.value}")
        else:
            print("  " * depth + f"Feature {node.feature} <= {node.threshold:.2f}")
            print("  " * depth + "Left:")
            self.print_tree(node.left, depth + 1)
            print("  " * depth + "Right:")
            self.print_tree(node.right, depth + 1)

## Example 1: Iris Dataset

In [None]:
# Fetch the Iris data: feature matrix X and class labels y
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Dataset summary, one item per output line
print(
    "Iris Dataset",
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    f"Features: {X.shape[1]}",
    f"Classes: {len(np.unique(y))}",
    sep="\n",
)

In [None]:
# Fit a depth-limited tree scored with Gini impurity (fit returns the estimator)
dt_gini = DecisionTreeClassifier(criterion='gini', max_depth=5).fit(X_train, y_train)

# Evaluate on the held-out test samples
y_pred_gini = dt_gini.predict(X_test)
accuracy_gini = accuracy_score(y_true=y_test, y_pred=y_pred_gini)

print(f"\nDecision Tree (Gini) Accuracy: {accuracy_gini:.4f}")
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred_gini,
        target_names=iris.target_names,
    )
)

In [None]:
# Fit a depth-limited tree scored with entropy (fit returns the estimator)
dt_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=5).fit(X_train, y_train)

# Evaluate on the held-out test samples
y_pred_entropy = dt_entropy.predict(X_test)
accuracy_entropy = accuracy_score(y_true=y_test, y_pred=y_pred_entropy)

print(f"\nDecision Tree (Entropy) Accuracy: {accuracy_entropy:.4f}")
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        y_pred_entropy,
        target_names=iris.target_names,
    )
)

In [None]:
# A shallow tree keeps the printed structure short enough to read
dt_small = DecisionTreeClassifier(criterion='gini', max_depth=3).fit(X_train, y_train)

print("\nTree Structure (max_depth=3):")
dt_small.print_tree()

## Example 2: Synthetic Dataset

In [None]:
# Build a 3-class synthetic problem: 500 samples, 10 features
# (8 informative, 2 redundant), seeded for reproducibility
X_syn, y_syn = make_classification(
    n_samples=500, n_features=10,
    n_informative=8, n_redundant=2,
    n_classes=3, random_state=42,
)

# Hold out 20% of the samples for testing
X_train_syn, X_test_syn, y_train_syn, y_test_syn = train_test_split(
    X_syn,
    y_syn,
    test_size=0.2,
    random_state=42,
)

# Dataset summary, one item per output line
print(
    "Synthetic Dataset",
    f"Training samples: {len(X_train_syn)}",
    f"Test samples: {len(X_test_syn)}",
    f"Features: {X_syn.shape[1]}",
    f"Classes: {len(np.unique(y_syn))}",
    sep="\n",
)

In [None]:
# Fit on the synthetic training split (fit returns the estimator)
dt_syn = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_split=5,
).fit(X_train_syn, y_train_syn)

# Evaluate on the held-out synthetic test samples
y_pred_syn = dt_syn.predict(X_test_syn)
accuracy_syn = accuracy_score(y_true=y_test_syn, y_pred=y_pred_syn)

print(f"\nDecision Tree Accuracy: {accuracy_syn:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_syn, y_pred_syn))

## Key Concepts Summary

**Gini Impurity:**
$$Gini = 1 - \sum_{i=1}^{C} p_i^2$$

where $p_i$ is the proportion of samples in the node that belong to class $i$, and $C$ is the number of classes

**Entropy:**
$$Entropy = -\sum_{i=1}^{C} p_i \log_2(p_i)$$

**Information Gain:**
$$IG = Impurity_{parent} - \sum_{children} \frac{N_{child}}{N_{parent}} \times Impurity_{child}$$

**Algorithm:**
1. For each feature and threshold, calculate information gain
2. Choose split that maximizes information gain
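3. Recursively split each child until a stopping criterion is met (maximum depth reached, node contains a single class, or too few samples to split)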
4. Assign majority class at leaf nodes
