# Simple Isolation Forest Implementation

This notebook contains a bare-bones implementation of Isolation Forest for anomaly detection.

## Key Concepts

1. **Random splits**: At every node, a tree picks one feature at random and a random split value between that feature's minimum and maximum
2. **Path length**: Anomalies are isolated faster (shorter paths)
3. **Ensemble**: Average across many trees for robustness
4. **Scoring**: Shorter average path = higher anomaly score

**The intuition**: Outliers are rare and different, so random splits isolate them quickly near the root of the tree. Normal points take longer to isolate.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## 1. Isolation Tree Class

A single isolation tree that recursively splits data using random features and random thresholds.

In [None]:
class IsolationTree:
    """One node of a randomly grown isolation tree.

    Every instance is a node. An internal node stores the feature index and
    threshold it splits on plus its two children; a leaf keeps
    ``split_feature``, ``split_value``, ``left`` and ``right`` as ``None``.
    """

    def __init__(self, max_depth):
        """Create an unfitted node.

        Args:
            max_depth: Depth at which ``fit`` stops splitting.
        """
        self.max_depth = max_depth
        self.split_feature = None
        self.split_value = None
        self.left = None
        self.right = None
        self.size = 0

    def fit(self, X, depth=0):
        """Grow the subtree rooted at this node from the rows of ``X``.

        Args:
            X: 2-D NumPy array, one row per sample and one column per feature.
            depth: Depth of this node; callers normally leave it at 0.

        Returns:
            This node, so calls can be chained.
        """
        # Clear any earlier fit so a reused node cannot keep stale children.
        self.split_feature = None
        self.split_value = None
        self.left = None
        self.right = None
        self.size = len(X)

        # Stop if max depth reached or at most one sample is left
        if depth >= self.max_depth or len(X) <= 1:
            return self

        # Randomly pick a candidate feature. It is kept in a local until the
        # split is known to be possible: path_length() treats a non-None
        # split_feature as "internal node", so a leaf must never carry one.
        n_features = X.shape[1]
        feature = np.random.randint(0, n_features)

        col_data = X[:, feature]
        min_val, max_val = col_data.min(), col_data.max()

        # Stop if all values are the same: this column cannot separate the rows
        if min_val == max_val:
            return self

        # Commit the split: random threshold between min and max
        self.split_feature = feature
        self.split_value = np.random.uniform(min_val, max_val)

        # Rows strictly below the threshold go left, all others go right
        left_mask = col_data < self.split_value

        # Recursively build left and right subtrees
        self.left = IsolationTree(self.max_depth).fit(X[left_mask], depth + 1)
        self.right = IsolationTree(self.max_depth).fit(X[~left_mask], depth + 1)

        return self

    def path_length(self, x, depth=0):
        """Return how many splits ``x`` passes through before reaching a leaf.

        Args:
            x: 1-D sample indexable by feature index.
            depth: Number of splits already traversed; used by the recursion.

        Returns:
            The depth of the leaf that ``x`` falls into.
        """
        # If leaf node, return current depth
        if self.split_feature is None:
            return depth

        # Go left or right using the same rule as fit()
        if x[self.split_feature] < self.split_value:
            return self.left.path_length(x, depth + 1)
        return self.right.path_length(x, depth + 1)

## 2. Isolation Forest Class

An ensemble of isolation trees that combines multiple trees to detect anomalies.

In [None]:
class IsolationForest:
    """Ensemble of isolation trees that scores points by average path length."""

    def __init__(self, n_trees=100, max_samples=256):
        """Configure the forest; no trees are built until ``fit`` is called.

        Args:
            n_trees: Number of trees that ``fit`` builds.
            max_samples: Upper bound on the rows drawn for each tree. It also
                sets the tree depth limit and the score normalisation through
                ``log2(max_samples)``, so it should be at least 2.
        """
        self.n_trees = n_trees
        self.max_samples = max_samples
        self.trees = []

    def fit(self, X):
        """Build ``n_trees`` trees, each on a random subsample of ``X``.

        Calling ``fit`` again replaces the previously built trees.

        Args:
            X: 2-D array-like, one row per sample.

        Returns:
            This forest, so calls can be chained.
        """
        X = np.asarray(X)
        n_samples = len(X)
        max_depth = int(np.ceil(np.log2(self.max_samples)))
        # Identical for every tree, so compute it once outside the loop
        sample_size = min(self.max_samples, n_samples)

        # Collect into a fresh list: refitting must not pile new trees on top
        # of old ones, because anomaly_score() averages over all stored trees.
        trees = []
        for _ in range(self.n_trees):
            # Sample random subset without replacement
            sample_idx = np.random.choice(n_samples, sample_size, replace=False)
            trees.append(IsolationTree(max_depth).fit(X[sample_idx]))
        self.trees = trees

        return self

    def anomaly_score(self, X):
        """Score each row of ``X``; higher means more anomalous.

        Args:
            X: 2-D array-like of samples to score.

        Returns:
            1-D NumPy array with one score per row.

        Raises:
            RuntimeError: If no trees have been built yet.
        """
        if not self.trees:
            raise RuntimeError(
                "IsolationForest has no trees; call fit() before anomaly_score()"
            )

        X = np.asarray(X)
        # Divide by the trees actually stored so the result is a true mean
        n_built = len(self.trees)

        # Average path length across all trees
        avg_path_lengths = np.zeros(len(X))
        for x_idx, x in enumerate(X):
            path_sum = sum(tree.path_length(x) for tree in self.trees)
            avg_path_lengths[x_idx] = path_sum / n_built

        # Normalize: shorter paths = higher anomaly score.
        # Simplified scoring: a point isolated at the root scores 1.0. Because
        # fit() caps tree depth at ceil(log2(max_samples)), the lowest
        # possible score is about 0.5 rather than 0.
        max_path = np.log2(self.max_samples)
        scores = 2 ** (-avg_path_lengths / max_path)

        return scores

## 3. Generate Sample Data

Create a dataset with normal points and some anomalies.

In [None]:
# Seed NumPy's global generator so every run draws the same points
np.random.seed(42)

# Inliers: 300 two-dimensional standard-normal draws (centred on the origin)
X_normal = np.random.standard_normal((300, 2))

# Outliers: 20 points drawn uniformly between -4 and 4 on both axes
X_anomalies = np.random.uniform(low=-4, high=4, size=(20, 2))

# Stack row-wise: inliers first, outliers last
X = np.concatenate((X_normal, X_anomalies), axis=0)

print(f"Total samples: {len(X)}")
print(f"Normal samples: {len(X_normal)}")
print(f"Anomaly samples: {len(X_anomalies)}")

## 4. Visualize the Data

In [None]:
# Draw both groups on one figure so the outliers can be compared with the cluster
plt.figure(figsize=(10, 6))

# Each array has two columns, so transposing and unpacking yields (x, y)
plt.scatter(*X_normal.T, s=30, alpha=0.6, label='Normal')
plt.scatter(
    *X_anomalies.T,
    marker='x', color='red', s=100, linewidths=2, alpha=0.8,
    label='Anomalies',
)

plt.title('Dataset: Normal Points vs Anomalies')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 5. Train Isolation Forest

In [None]:
# Construct the forest and fit it in one expression (fit() returns the forest)
iso_forest = IsolationForest(n_trees=100, max_samples=256).fit(X)

print(f"Model trained with {iso_forest.n_trees} trees")

## 6. Calculate Anomaly Scores

In [None]:
# Get anomaly scores for all points
scores = iso_forest.anomaly_score(X)

# X was stacked as [X_normal, X_anomalies], so the first len(X_normal) scores
# belong to the normal points. Deriving the boundary from the data keeps this
# cell correct if the dataset sizes are changed.
n_normal = len(X_normal)
normal_scores = scores[:n_normal]
anomaly_scores = scores[n_normal:]

print(f"Normal points - Average score: {normal_scores.mean():.3f}")
print(f"Normal points - Score range: [{normal_scores.min():.3f}, {normal_scores.max():.3f}]")
print()
print(f"Anomaly points - Average score: {anomaly_scores.mean():.3f}")
print(f"Anomaly points - Score range: [{anomaly_scores.min():.3f}, {anomaly_scores.max():.3f}]")

## 7. Visualize Anomaly Scores

In [None]:
# One wide figure holding two side-by-side panels
plt.figure(figsize=(14, 5))

# Left panel: overlaid score histograms for the two groups
plt.subplot(1, 2, 1)
for group_scores, group_label, group_color in (
    (normal_scores, 'Normal', 'blue'),
    (anomaly_scores, 'Anomalies', 'red'),
):
    plt.hist(group_scores, bins=30, alpha=0.6, label=group_label, color=group_color)
plt.title('Distribution of Anomaly Scores')
plt.xlabel('Anomaly Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: every point, coloured by its anomaly score
plt.subplot(1, 2, 2)
scatter = plt.scatter(
    *X.T,
    c=scores, cmap='coolwarm',
    s=50, alpha=0.7, edgecolors='black', linewidth=0.5,
)
plt.colorbar(scatter, label='Anomaly Score')
plt.title('Points Colored by Anomaly Score')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Identify Top Anomalies

In [None]:
# argsort is ascending, so the last 20 positions hold the highest scores
top_anomaly_indices = np.argsort(scores)[-20:]

print("Top 20 anomalies (highest scores):")
# Flip so the listing starts with the most anomalous point
for i, idx in enumerate(np.flip(top_anomaly_indices), start=1):
    print(f"{i:2d}. Index {idx:3d}: Score = {scores[idx]:.3f}, Point = {X[idx]}")

## 9. Test on New Data

In [None]:
# Hand-picked probes: two near the cluster centre, two far away from it
test_points = np.array([
    [0.0, 0.0],      # at the centre, expected normal
    [5.0, 5.0],      # far from the centre, expected anomalous
    [0.5, -0.5],     # close to the centre, expected normal
    [-4.0, 3.0],     # far from the centre, expected anomalous
])

# Run the probes through the fitted forest
test_scores = iso_forest.anomaly_score(test_points)

print("Test point predictions:")
for i, (point, score) in enumerate(zip(test_points, test_scores), start=1):
    # Scores above the 0.6 cut-off are reported as anomalies
    if score > 0.6:
        label = "ANOMALY"
    else:
        label = "NORMAL"
    print(f"Point {i}: {point} -> Score: {score:.3f} ({label})")

## Summary

This simple implementation demonstrates the core concepts of Isolation Forest:

- **Isolation Trees** use random splits to partition the data
- **Path Length** measures how quickly a point gets isolated
- **Anomalies** have shorter path lengths (easier to isolate)
- **Ensemble** of trees provides robust scoring

The algorithm works well because anomalies are rare and different from normal points, so random partitions isolate them quickly!