# Decision Tree from Scratch - Regression

A barebones implementation of a decision tree regressor using the CART algorithm.

**Key Concepts:**
- Mean Squared Error (MSE) / Mean Absolute Error (MAE)
- Recursive tree building
- Split selection
- Prediction via tree traversal


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes, make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Tree Node Class

In [None]:
class Node:
    """
    A single node of the regression tree.

    A node plays one of two roles:
    - Internal node: stores the split (feature index + threshold) and the
      two subtrees reached when the test passes (left) or fails (right).
    - Leaf node: stores only ``value``, the prediction for samples that
      end up here.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split description (meaningful for internal nodes only).
        self.feature, self.threshold = feature, threshold
        # Child subtrees (meaningful for internal nodes only).
        self.left, self.right = left, right
        # Prediction; keyword-only so it cannot be confused with a child.
        self.value = value

    def is_leaf_node(self):
        """Return True when this node carries a prediction (``value`` is set)."""
        has_prediction = self.value is not None
        return has_prediction

## Decision Tree Regressor

In [None]:
class DecisionTreeRegressor:
    """
    Decision Tree Regressor from scratch (binary splits of the form
    ``feature <= threshold``).

    A node becomes a leaf when the depth limit is reached, when it holds
    fewer than ``min_samples_split`` samples, when all of its targets are
    identical, or when no threshold separates its samples into two
    non-empty groups.

    Parameters:
    -----------
    max_depth : int, default=10
        Maximum depth of the tree
    min_samples_split : int, default=2
        Minimum samples required to split a node
    criterion : str, default='mse'
        Split criterion: 'mse' (mean squared error) or 'mae' (mean absolute error)
    """

    def __init__(self, max_depth=10, min_samples_split=2, criterion='mse'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.root = None

    def fit(self, X, y):
        """
        Build decision tree from training data.

        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with n_samples entries along its first axis

        Returns:
        --------
        self

        Raises:
        -------
        ValueError
            If ``criterion`` is not 'mse' or 'mae', if X is not 2-D, if X
            and y disagree on the number of samples, or if there are no
            samples at all.
        """
        # Any string other than 'mse' would otherwise silently select MAE.
        if self.criterion not in ('mse', 'mae'):
            raise ValueError(
                f"criterion must be 'mse' or 'mae', got {self.criterion!r}"
            )

        # Accept lists / DataFrames / Series; the tree indexes with X[:, j].
        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim < 1 or X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")

        self.root = self._grow_tree(X, y)
        return self

    def _grow_tree(self, X, y, depth=0):
        """
        Recursively grow the decision tree and return the subtree's root Node.
        """
        n_samples = X.shape[0]

        # Stopping criteria
        if (depth >= self.max_depth or
                n_samples < self.min_samples_split or
                self._all_same_values(y)):
            return Node(value=self._leaf_value(y))

        # Find best split; (None, None) means no feature can separate the samples
        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            return Node(value=self._leaf_value(y))

        # Split data
        left_idxs = X[:, best_feature] <= best_threshold
        right_idxs = X[:, best_feature] > best_threshold

        # Never recurse into an empty child: its mean/median would be NaN
        if not left_idxs.any() or not right_idxs.any():
            return Node(value=self._leaf_value(y))

        # Recursively grow children
        left = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)

        return Node(best_feature, best_threshold, left, right)

    def _best_split(self, X, y):
        """
        Find the best feature and threshold to split on.

        Returns (feature_index, threshold), or (None, None) when every
        feature holds a single distinct value so no split is possible.
        """
        best_gain = -float('inf')
        best_feature = None
        best_threshold = None

        # The parent error is the same for every candidate; compute it once.
        parent_error = self._calculate_error(y)

        for feature_idx in range(X.shape[1]):
            X_column = X[:, feature_idx]

            # np.unique returns sorted values. The largest one would send
            # every sample to the left child, so it is not a real split.
            thresholds = np.unique(X_column)[:-1]

            for threshold in thresholds:
                gain = self._variance_reduction(y, X_column, threshold, parent_error)

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold

        return best_feature, best_threshold

    def _variance_reduction(self, y, X_column, threshold, parent_error=None):
        """
        Calculate variance reduction (information gain) from a split.

        ``parent_error`` may be passed in to avoid recomputing it for every
        threshold; when omitted it is computed from ``y``. Returns 0 when
        the split leaves one side empty.
        """
        # Split
        y_left = y[X_column <= threshold]
        y_right = y[X_column > threshold]

        n_left, n_right = len(y_left), len(y_right)
        if n_left == 0 or n_right == 0:
            return 0

        if parent_error is None:
            parent_error = self._calculate_error(y)

        # Weighted average of children error
        n = len(y)
        child_error = ((n_left / n) * self._calculate_error(y_left) +
                       (n_right / n) * self._calculate_error(y_right))

        # Variance reduction (information gain)
        return parent_error - child_error

    def _calculate_error(self, y):
        """
        Calculate error (MSE or MAE) of the targets in a node.
        """
        if self.criterion == 'mse':
            return self._mse(y)
        return self._mae(y)

    def _mse(self, y):
        """
        Calculate Mean Squared Error around the mean.
        MSE = (1/n) * Σ(y_i - ȳ)^2

        Returns 0 for an empty array.
        """
        if len(y) == 0:
            return 0
        mean = np.mean(y)
        return np.mean((y - mean) ** 2)

    def _mae(self, y):
        """
        Calculate Mean Absolute Error around the median.
        MAE = (1/n) * Σ|y_i - median(y)|

        Returns 0 for an empty array.
        """
        if len(y) == 0:
            return 0
        median = np.median(y)
        return np.mean(np.abs(y - median))

    def _leaf_value(self, y):
        """
        Calculate the value to predict at a leaf node.
        - For MSE: mean
        - For MAE: median
        """
        if self.criterion == 'mse':
            return np.mean(y)
        return np.median(y)

    def _all_same_values(self, y):
        """
        Check if all values in y are the same.
        """
        return len(np.unique(y)) == 1

    def predict(self, X):
        """
        Predict target values for samples in X.

        X is an array-like of shape (n_samples, n_features); the result is
        a NumPy array with one prediction per row.
        """
        # Iterating a DataFrame yields column labels, so coerce to an array first.
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """
        Walk from ``node`` down to a leaf for a single sample and return
        that leaf's value.
        """
        # Iterative descent: same path as the recursive form, no call-stack growth.
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def print_tree(self, node=None, depth=0):
        """
        Print tree structure, indenting two spaces per level.

        Starts from the root when ``node`` is not given.
        """
        if node is None:
            node = self.root

        indent = "  " * depth
        if node.is_leaf_node():
            print(indent + f"Leaf: {node.value:.2f}")
        else:
            print(indent + f"Feature {node.feature} <= {node.threshold:.2f}")
            print(indent + "Left:")
            self.print_tree(node.left, depth + 1)
            print(indent + "Right:")
            self.print_tree(node.right, depth + 1)

## Example 1: Diabetes Dataset

In [None]:
# Fetch the scikit-learn diabetes data and pull out features and targets
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Summarize the dataset, one fact per output line
print(
    "Diabetes Dataset",
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    f"Features: {X.shape[1]}",
    f"Target range: [{y.min():.2f}, {y.max():.2f}]",
    sep="\n",
)

In [None]:
# Fit a depth-5 tree that scores splits with squared error (fit returns the model)
dt_mse = DecisionTreeRegressor(max_depth=5, criterion='mse').fit(X_train, y_train)

# Score the held-out rows
y_pred_mse = dt_mse.predict(X_test)

# Regression metrics on the test set
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_mse)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_mse)
r2 = r2_score(y_true=y_test, y_pred=y_pred_mse)

print(
    f"\nDecision Tree (MSE) Results:",
    f"  MSE:  {mse:.2f}",
    f"  RMSE: {rmse:.2f}",
    f"  MAE:  {mae:.2f}",
    f"  R²:   {r2:.4f}",
    sep="\n",
)

In [None]:
# Fit a depth-5 tree that scores splits with absolute error (fit returns the model)
dt_mae = DecisionTreeRegressor(max_depth=5, criterion='mae').fit(X_train, y_train)

# Score the held-out rows
y_pred_mae = dt_mae.predict(X_test)

# Regression metrics on the test set
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_mae)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_mae)
r2 = r2_score(y_true=y_test, y_pred=y_pred_mae)

print(
    f"\nDecision Tree (MAE) Results:",
    f"  MSE:  {mse:.2f}",
    f"  RMSE: {rmse:.2f}",
    f"  MAE:  {mae:.2f}",
    f"  R²:   {r2:.4f}",
    sep="\n",
)

In [None]:
# A shallow (depth-3) tree keeps the printed structure short enough to read
dt_small = DecisionTreeRegressor(max_depth=3, criterion='mse').fit(X_train, y_train)

print("\nTree Structure (max_depth=3):")
# With no arguments, print_tree starts at the root
dt_small.print_tree()

## Example 2: Synthetic Dataset

In [None]:
# Build a noisy 10-feature regression problem (8 informative features), seeded for reproducibility
X_syn, y_syn = make_regression(n_samples=500, n_features=10, n_informative=8, noise=10, random_state=42)

# Hold out 20% of the rows for testing
X_train_syn, X_test_syn, y_train_syn, y_test_syn = train_test_split(X_syn, y_syn, test_size=0.2, random_state=42)

# Summarize the dataset, one fact per output line
print(
    "Synthetic Dataset",
    f"Training samples: {len(X_train_syn)}",
    f"Test samples: {len(X_test_syn)}",
    f"Features: {X_syn.shape[1]}",
    f"Target range: [{y_syn.min():.2f}, {y_syn.max():.2f}]",
    sep="\n",
)

In [None]:
# Fit on the synthetic training split, then score the held-out rows
dt_syn = DecisionTreeRegressor(max_depth=10, min_samples_split=5, criterion='mse').fit(X_train_syn, y_train_syn)
y_pred_syn = dt_syn.predict(X_test_syn)

# Regression metrics on the synthetic test set
mse = mean_squared_error(y_true=y_test_syn, y_pred=y_pred_syn)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test_syn, y_pred=y_pred_syn)
r2 = r2_score(y_true=y_test_syn, y_pred=y_pred_syn)

print(
    f"\nDecision Tree Results:",
    f"  MSE:  {mse:.2f}",
    f"  RMSE: {rmse:.2f}",
    f"  MAE:  {mae:.2f}",
    f"  R²:   {r2:.4f}",
    sep="\n",
)

## Compare Different Max Depths

In [None]:
# Sweep max_depth on the diabetes split and record test-set R² and RMSE for each setting
depths = [2, 3, 5, 10, 15]
results = []

for depth in depths:
    dt = DecisionTreeRegressor(max_depth=depth, criterion='mse').fit(X_train, y_train)
    y_pred = dt.predict(X_test)

    r2 = r2_score(y_true=y_test, y_pred=y_pred)
    rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

    # One table row per depth
    results.append({'max_depth': depth, 'R²': r2, 'RMSE': rmse})

results_df = pd.DataFrame(results)
print("\nPerformance vs Max Depth:", results_df.to_string(index=False), sep="\n")

## Key Concepts Summary

**Mean Squared Error (MSE):**
$$MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \bar{y})^2$$

where $\bar{y}$ is the mean of the target values in the node (so a node's MSE equals the variance of its targets)

**Mean Absolute Error (MAE):**
$$MAE = \frac{1}{n} \sum_{i=1}^{n} |y_i - \text{median}(y)|$$

**Variance Reduction (Information Gain for Regression):**
$$VR = Error_{parent} - \sum_{children} \frac{N_{child}}{N_{parent}} \times Error_{child}$$

**Algorithm:**
1. For each feature and threshold, calculate variance reduction
2. Choose split that maximizes variance reduction
3. Recursively split until stopping criteria
4. Assign mean (MSE) or median (MAE) at leaf nodes

**Key Differences from Classification Trees:**
- Leaf nodes predict continuous values (mean/median) instead of class labels
- Split criterion uses MSE/MAE instead of Gini/Entropy
- Evaluation uses regression metrics (R², RMSE, MAE) instead of accuracy
