# Autoencoder for Anomaly Detection - Simple Tutorial

**Core Idea**: Train an autoencoder on normal data. It learns to compress and reconstruct normal patterns well, but struggles with anomalies, giving them high reconstruction errors.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
import torch
import torch.nn as nn
import torch.optim as optim

## Step 1: Create Synthetic Data
We'll create normal data and a separate, smaller set of anomalies drawn from a very different distribution.

In [None]:
# Fix NumPy's global RNG so the synthetic dataset is the same on every run
np.random.seed(42)

# Normal samples: 1000 points with a tight spread (std 0.5) around the
# ascending per-feature means 1..10
normal_means = np.arange(1, 11)
normal_data = np.random.randn(1000, 10) * 0.5 + normal_means

# Anomalous samples: 50 points with a wide spread (std 3) around the
# reversed per-feature means 10..1
anomaly_means = np.arange(10, 0, -1)
anomalies = np.random.randn(50, 10) * 3 + anomaly_means

# Standardize: the scaling statistics come from the normal data only and the
# same transform is then applied to the anomalies
scaler = StandardScaler().fit(normal_data)
normal_data = scaler.transform(normal_data)
anomalies = scaler.transform(anomalies)

print(f"Normal data shape: {normal_data.shape}")
print(f"Anomalies shape: {anomalies.shape}")

## Step 2: Define Simple Autoencoder

**Architecture**:
- **Encoder**: 10 → 5 → 2 (compress to 2D bottleneck)
- **Decoder**: 2 → 5 → 10 (reconstruct back to original)

In [None]:
class SimpleAutoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    With the default arguments the layer widths are 10 -> 5 -> 2 -> 5 -> 10.

    Args:
        input_dim: Number of features per sample (also the output width).
        hidden_dim: Width of the hidden layer in both encoder and decoder.
        bottleneck_dim: Width of the compressed representation.
    """

    def __init__(self, input_dim: int = 10, hidden_dim: int = 5, bottleneck_dim: int = 2):
        super().__init__()

        # Encoder: compress data down to the bottleneck
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, bottleneck_dim),  # Bottleneck representation
        )

        # Decoder: mirror of the encoder, reconstructs the original width
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        )

    def forward(self, x):
        """Encode ``x`` and return the decoder's reconstruction of it."""
        return self.decoder(self.encoder(x))


# Seed PyTorch right before building the model so the random weight
# initialization (and therefore the whole run) is repeatable.
torch.manual_seed(42)

model = SimpleAutoencoder()
print(model)

## Step 3: Train on Normal Data Only

**Key**: We only train on normal data so the model learns normal patterns

In [None]:
# Convert to a float32 PyTorch tensor (torch.as_tensor replaces the legacy
# torch.FloatTensor constructor)
X_train = torch.as_tensor(normal_data, dtype=torch.float32)

# Training setup
criterion = nn.MSELoss()  # Reconstruction loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 50
losses = []

# Put the model in training mode explicitly, in case it was switched to eval
# mode earlier (e.g. when this cell is re-run after scoring).
model.train()

for epoch in range(epochs):
    # Forward pass on the full batch: reconstruct the inputs and score them
    reconstructed = model(X_train)
    loss = criterion(reconstructed, X_train)

    # Backward pass: clear old gradients, backpropagate, update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Read the scalar loss once and reuse it for the history and the log line
    loss_value = loss.item()
    losses.append(loss_value)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss_value:.4f}")

# Plot training loss
plt.figure(figsize=(8, 4))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Reconstruction Loss')
plt.title('Training Progress')
plt.grid(True)
plt.show()

## Step 4: Calculate Reconstruction Errors

**Anomaly Score** = How badly the autoencoder reconstructs a sample

In [None]:
def get_reconstruction_error(data, model):
    """Calculate the mean squared reconstruction error for each sample.

    Args:
        data: Array-like or tensor of shape (n_samples, n_features).
        model: Module whose forward pass returns a reconstruction with the
            same shape as its input.

    Returns:
        NumPy array of shape (n_samples,) holding one error per sample.
    """
    # Build the input on the device the model lives on (default device when
    # the model has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # Remember the current mode so that scoring does not silently leave a
    # model that is still being trained in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            data_tensor = torch.as_tensor(data, dtype=torch.float32, device=device)
            reconstructed = model(data_tensor)
            # MSE for each sample (along features)
            errors = torch.mean((data_tensor - reconstructed) ** 2, dim=1)
    finally:
        model.train(was_training)
    # .cpu() is a no-op on CPU and required before .numpy() on other devices
    return errors.cpu().numpy()

# Score both datasets with the model, normal samples first
normal_errors, anomaly_errors = (
    get_reconstruction_error(samples, model) for samples in (normal_data, anomalies)
)

# Report the average and the worst-case error of each group
print(f"Normal data - Mean error: {normal_errors.mean():.4f}, Max: {normal_errors.max():.4f}")
print(f"Anomalies - Mean error: {anomaly_errors.mean():.4f}, Max: {anomaly_errors.max():.4f}")

## Step 5: Visualize Results

Anomalies should have **higher reconstruction errors** than normal data

In [None]:
# One figure with two side-by-side panels
fig, (hist_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: overlaid histograms of the two error distributions
hist_ax.hist(normal_errors, bins=30, alpha=0.7, label='Normal', color='blue')
hist_ax.hist(anomaly_errors, bins=30, alpha=0.7, label='Anomaly', color='red')
hist_ax.set_xlabel('Reconstruction Error')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Reconstruction Errors')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Right panel: error of every sample; anomalies are placed after the normal
# samples on the x-axis so the two groups do not overlap
n_normal = len(normal_errors)
n_anomaly = len(anomaly_errors)
scatter_ax.scatter(range(n_normal), normal_errors, alpha=0.5, label='Normal', s=20)
scatter_ax.scatter(range(n_normal, n_normal + n_anomaly),
                   anomaly_errors, alpha=0.5, label='Anomaly', s=20, color='red')
scatter_ax.set_xlabel('Sample Index')
scatter_ax.set_ylabel('Reconstruction Error')
scatter_ax.set_title('Reconstruction Errors by Sample')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## Step 6: Set Threshold and Detect Anomalies

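Use a threshold based on the reconstruction errors of the normal data (e.g., the 95th percentile). Because the threshold is a percentile of those same errors, about 5% of the normal samples will be flagged as false positives by construction.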

In [None]:
# Errors above this percentile of the normal-data errors count as anomalous
THRESHOLD_PERCENTILE = 95
threshold = np.percentile(normal_errors, THRESHOLD_PERCENTILE)
print(f"Anomaly threshold: {threshold:.4f}")

# Classify
def detect_anomalies(errors, threshold):
    """Flag every sample whose reconstruction error exceeds ``threshold``.

    Args:
        errors: Per-sample reconstruction errors; any array-like (NumPy
            array, list, tuple, or a single number).
        threshold: Cut-off value; errors strictly greater than it are flagged.

    Returns:
        Boolean NumPy array with the shape of ``errors`` (a NumPy bool for a
        single number), True where the sample is considered anomalous.
    """
    # np.asarray makes the comparison element-wise for plain sequences too;
    # a bare `list > float` would raise TypeError.
    return np.asarray(errors) > threshold

from sklearn.metrics import classification_report, confusion_matrix

# Score normal samples and anomalies together, in that order
all_data = np.concatenate([normal_data, anomalies], axis=0)
all_errors = get_reconstruction_error(all_data, model)
predictions = detect_anomalies(all_errors, threshold)

# Ground truth in the same order (0 = normal, 1 = anomaly)
true_labels = np.repeat([0, 1], [len(normal_data), len(anomalies)])

# Per-class precision, recall and F1
print("\nClassification Report:")
report = classification_report(true_labels, predictions, target_names=['Normal', 'Anomaly'])
print(report)

# Confusion matrix: first index is the true class, second the predicted class
print("\nConfusion Matrix:")
cm = confusion_matrix(true_labels, predictions)
print(cm)
print(f"\nTrue Negatives: {cm[0,0]}, False Positives: {cm[0,1]}")
print(f"False Negatives: {cm[1,0]}, True Positives: {cm[1,1]}")

## Summary

**How it works**:
1. Train autoencoder on **normal data only**
2. Model learns to compress and reconstruct normal patterns
3. Normal data → low reconstruction error
4. Anomalies → high reconstruction error (model hasn't seen this pattern)
5. Set threshold on reconstruction error to flag anomalies

**When to use**:
- Abundant normal data, few/no anomaly examples
- High-dimensional data
- Unlabeled anomaly types

**Limitations**:
- Threshold selection can be tricky
- May not catch subtle anomalies
- Requires enough normal data to learn patterns