# 🔍 Notebook 04: Anomaly Detection

**Solar Swarm Intelligence - IEEE PES Energy Utopia Challenge**

Detect faults and anomalies in solar panel systems:
- Isolation Forest for outlier detection
- Autoencoder neural network
- Real-time fault identification
- Performance degradation detection

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import torch
import torch.nn as nn
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

print(" Libraries loaded")

## 1. Load and Prepare Data

In [None]:
# Read the synthetic 90-day community dataset from disk
df = pd.read_csv('../data/processed/synthetic/community_90days.csv')

# Parse the timestamp column once and reuse the parsed series for the hour column
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['timestamp'] = parsed_timestamps
df['hour'] = parsed_timestamps.dt.hour

print(f"Dataset shape: {df.shape}")
# Last expression of the cell: the notebook renders the first rows
df.head()

In [None]:
# Engineer ratio features that relate production to the weather columns
production_series = df['production_kwh']
temperature_series = df['temperature_c']
clear_sky_series = 100 - df['cloud_cover_pct']

df['production_per_temp'] = production_series / (temperature_series + 1)
df['production_cloud_ratio'] = production_series / (clear_sky_series + 1)
df['efficiency'] = production_series / (temperature_series * 0.1 + 1)

# Model inputs: five raw measurements followed by the three engineered ratios.
# The order matters because later cells build feature vectors positionally.
feature_cols = [
    'production_kwh',
    'temperature_c',
    'cloud_cover_pct',
    'humidity_pct',
    'wind_speed_kmh',
    'production_per_temp',
    'production_cloud_ratio',
    'efficiency',
]

# Keep only rows whose hour lies in 6..18 (both ends inclusive); copy so that
# later edits do not touch df
is_daytime = (df['hour'] >= 6) & (df['hour'] <= 18)
daytime_df = df[is_daytime].copy()

print(f"Daytime data shape: {daytime_df.shape}")
print(f"Features: {feature_cols}")

## 2. Inject Synthetic Anomalies

In [None]:
# Pick 5% of the daytime rows (without replacement) to turn into synthetic faults
np.random.seed(42)
n_anomalies = int(len(daytime_df) * 0.05)
anomaly_indices = np.random.choice(daytime_df.index, size=n_anomalies, replace=False)

# Ground-truth labels: 1 for the rows selected above, 0 for everything else
daytime_df['is_anomaly'] = 0
daytime_df.loc[anomaly_indices, 'is_anomaly'] = 1

# Corrupt the production reading of every selected row with a randomly chosen fault
fault_kinds = ['low_production', 'zero_production', 'erratic']
for row_label in anomaly_indices:
    fault_kind = np.random.choice(fault_kinds)

    if fault_kind == 'zero_production':
        daytime_df.loc[row_label, 'production_kwh'] = 0
        continue

    if fault_kind == 'low_production':
        scale = 0.3  # 70% reduction
    else:
        # erratic: keep a random 10-40% of the original production
        scale = np.random.uniform(0.1, 0.4)
    daytime_df.loc[row_label, 'production_kwh'] *= scale

# The ratio features depend on production, so rebuild them from the corrupted values
corrupted_production = daytime_df['production_kwh']
daytime_temperature = daytime_df['temperature_c']
daytime_df['production_per_temp'] = corrupted_production / (daytime_temperature + 1)
daytime_df['production_cloud_ratio'] = corrupted_production / (100 - daytime_df['cloud_cover_pct'] + 1)
daytime_df['efficiency'] = corrupted_production / (daytime_temperature * 0.1 + 1)

# Report the resulting class balance
n_total = len(daytime_df)
n_normal = (daytime_df['is_anomaly'] == 0).sum()
n_flagged = (daytime_df['is_anomaly'] == 1).sum()
print(f"Total samples: {n_total}")
print(f"Normal samples: {n_normal}")
print(f"Anomalies: {n_flagged}")
print(f"Anomaly rate: {n_flagged / n_total * 100:.2f}%")

## 3. Isolation Forest

In [None]:
# Ground-truth labels and raw feature matrix for the daytime rows
y_true = daytime_df['is_anomaly'].values
X = daytime_df[feature_cols].values

# Standardise the features; the fitted scaler is reused by later cells
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Fit the Isolation Forest; contamination matches the 5% injected anomaly rate
print(" Training Isolation Forest...\n")
iso_forest = IsolationForest(
    n_estimators=100,
    max_samples=256,
    contamination=0.05,
    random_state=42,
)
iso_forest.fit(X_scaled)
print(" Training complete!")

# predict() yields -1 for anomalies and 1 for normal points; map that to 1 / 0
iso_predictions = iso_forest.predict(X_scaled)
iso_predictions_binary = (iso_predictions < 0).astype(int)

# Keep the per-sample score and the binary prediction next to the data
iso_scores = iso_forest.score_samples(X_scaled)
daytime_df['iso_anomaly_score'] = iso_scores
daytime_df['iso_prediction'] = iso_predictions_binary

In [None]:
# Score the Isolation Forest predictions against the injected-anomaly labels
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

iso_accuracy = accuracy_score(y_true, iso_predictions_binary)
iso_precision = precision_score(y_true, iso_predictions_binary)
iso_recall = recall_score(y_true, iso_predictions_binary)
iso_f1 = f1_score(y_true, iso_predictions_binary)

rule = "=" * 50
print("📊 ISOLATION FOREST PERFORMANCE")
print(rule)
print(f"Accuracy:  {iso_accuracy:.4f}")
print(f"Precision: {iso_precision:.4f}")
print(f"Recall:    {iso_recall:.4f}")
print(f"F1-Score:  {iso_f1:.4f}")
print(rule)

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_true, iso_predictions_binary)
heatmap_options = dict(annot=True, fmt='d', cmap='Blues', cbar=False)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, **heatmap_options)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Isolation Forest - Confusion Matrix', fontsize=14, fontweight='bold')
plt.show()

## 4. Autoencoder Implementation

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a 4-unit bottleneck.

    The encoder maps ``input_dim -> 16 -> 8 -> 4`` and the decoder mirrors it
    back (``4 -> 8 -> 16 -> input_dim``), with ReLU between the linear layers
    and no activation on either final layer.
    """

    def __init__(self, input_dim=8):
        super().__init__()

        # Encoder layers are created first so parameter initialisation order
        # (and the ``encoder.N`` / ``decoder.N`` state-dict keys) stay stable.
        encoder_layers = [
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(4, 8),
            nn.ReLU(),
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` after passing it through the bottleneck."""
        return self.decoder(self.encoder(x))


print(" Autoencoder model defined")

In [None]:
# Train on normal data only, so that anomalous rows later stand out through a
# larger reconstruction error
normal_data = daytime_df[daytime_df['is_anomaly'] == 0][feature_cols].values
normal_scaled = scaler.transform(normal_data)

# Convert to tensors: training uses the normal rows, scoring uses every daytime row
X_train_ae = torch.FloatTensor(normal_scaled)
X_test_ae = torch.FloatTensor(X_scaled)

# Initialize model, reconstruction loss and optimizer
autoencoder = Autoencoder(input_dim=len(feature_cols))
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.001)

# Training
print(" Training Autoencoder...\n")
epochs = 100
batch_size = 64
n_train = len(X_train_ae)

for epoch in range(epochs):
    autoencoder.train()
    epoch_loss = 0.0

    # Mini-batch training over consecutive slices of the training tensor
    for start in range(0, n_train, batch_size):
        batch = X_train_ae[start:start + batch_size]

        optimizer.zero_grad()
        outputs = autoencoder(batch)
        loss = criterion(outputs, batch)
        loss.backward()
        optimizer.step()

        # criterion returns the mean over the batch, so weight it by the batch
        # size; dividing the total by n_train below then gives the true mean
        # loss per sample (the short final batch is weighted correctly too).
        epoch_loss += loss.item() * len(batch)

    if (epoch + 1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/n_train:.6f}")

print("\n Autoencoder training complete!")

In [None]:
# Per-sample reconstruction error: mean squared difference across the features
autoencoder.eval()
with torch.no_grad():
    reconstructed = autoencoder(X_test_ae)
    squared_diff = (X_test_ae - reconstructed) ** 2
    reconstruction_errors = squared_diff.mean(dim=1).numpy()

daytime_df['ae_reconstruction_error'] = reconstruction_errors

# Threshold = 95th percentile of the errors observed on rows labelled normal
normal_mask = y_true == 0
normal_errors = reconstruction_errors[normal_mask]
threshold = np.percentile(normal_errors, 95)

print(f"Anomaly threshold: {threshold:.6f}")

# Anything reconstructed worse than the threshold is flagged as an anomaly (1)
exceeds_threshold = reconstruction_errors > threshold
ae_predictions = exceeds_threshold.astype(int)
daytime_df['ae_prediction'] = ae_predictions

# Compare the flags with the injected-anomaly labels
ae_accuracy = accuracy_score(y_true, ae_predictions)
ae_precision = precision_score(y_true, ae_predictions)
ae_recall = recall_score(y_true, ae_predictions)
ae_f1 = f1_score(y_true, ae_predictions)

rule = "=" * 50
print("\n AUTOENCODER PERFORMANCE")
print(rule)
print(f"Accuracy:  {ae_accuracy:.4f}")
print(f"Precision: {ae_precision:.4f}")
print(f"Recall:    {ae_recall:.4f}")
print(f"F1-Score:  {ae_f1:.4f}")
print(rule)

In [None]:
# Autoencoder confusion matrix rendered as an annotated heatmap
cm_ae = confusion_matrix(y_true, ae_predictions)
heatmap_options = dict(annot=True, fmt='d', cmap='Greens', cbar=False)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_ae, **heatmap_options)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Autoencoder - Confusion Matrix', fontsize=14, fontweight='bold')
plt.show()

## 5. Visualize Anomalies

In [None]:
# Plot the per-sample output of both detectors for one house, coloured by the
# injected ground-truth label
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# Sample one house for visualization (at most the first 500 daytime rows)
house_sample = daytime_df[daytime_df['house_id'] == 0].iloc[:500]
sample_positions = range(len(house_sample))

# Reversed red-yellow-green map with a pinned 0..1 range: normal rows (0) are
# drawn green and injected anomalies (1) red. Pinning vmin/vmax keeps that
# mapping stable even when the sample happens to contain only one class.
label_style = dict(c=house_sample['is_anomaly'], cmap='RdYlGn_r',
                   vmin=0, vmax=1, s=30, alpha=0.6)

# Isolation Forest scores (output of score_samples)
axes[0].scatter(sample_positions, house_sample['iso_anomaly_score'], **label_style)
axes[0].set_title('Isolation Forest Anomaly Scores (House 0)', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Anomaly Score')
axes[0].grid(True, alpha=0.3)

# Autoencoder reconstruction errors with the decision threshold overlaid
axes[1].scatter(sample_positions, house_sample['ae_reconstruction_error'], **label_style)
axes[1].axhline(y=threshold, color='red', linestyle='--', label=f'Threshold: {threshold:.4f}')
axes[1].set_title('Autoencoder Reconstruction Error (House 0)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Sample Index')
axes[1].set_ylabel('Reconstruction Error')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Real-Time Anomaly Detection Example

In [None]:
# Simulate real-time detection
def detect_anomaly_realtime(production, temperature, cloud_cover, humidity, wind_speed):
    """Score a single reading with both trained detectors.

    The reading is expanded with the same three ratio features used for
    training, scaled with the notebook-level ``scaler``, and then scored by
    ``iso_forest`` and ``autoencoder`` (compared against ``threshold``).

    Args:
        production: Production value of the reading.
        temperature: Temperature value of the reading.
        cloud_cover: Cloud-cover value of the reading.
        humidity: Humidity value of the reading.
        wind_speed: Wind-speed value of the reading.

    Returns:
        A dict with:
        ``is_anomaly`` (bool) - True only when both models flag the reading;
        ``iso_score`` (float) - Isolation Forest ``score_samples`` value;
        ``ae_error`` (float) - autoencoder mean squared reconstruction error;
        ``confidence`` (str) - ``'HIGH'`` when both models flag the reading,
        ``'MEDIUM'`` when exactly one does, ``'LOW'`` when neither does.

    Raises:
        ZeroDivisionError: For plain Python numbers when a ratio denominator
            is zero, i.e. ``temperature`` is -1 or -10, or ``cloud_cover`` is 101.
    """

    # Calculate derived features (same formulas as the training features)
    prod_per_temp = production / (temperature + 1)
    prod_cloud_ratio = production / (100 - cloud_cover + 1)
    efficiency = production / (temperature * 0.1 + 1)

    # Single-row feature matrix, in the same column order as the training features
    features = np.array([[production, temperature, cloud_cover, humidity, wind_speed,
                          prod_per_temp, prod_cloud_ratio, efficiency]])

    # Scale
    features_scaled = scaler.transform(features)

    # Isolation Forest prediction (-1 marks an anomaly)
    iso_pred = iso_forest.predict(features_scaled)[0]
    iso_score = iso_forest.score_samples(features_scaled)[0]

    # Autoencoder prediction: reconstruction error above the threshold marks an anomaly
    with torch.no_grad():
        features_tensor = torch.FloatTensor(features_scaled)
        reconstructed = autoencoder(features_tensor)
        ae_error = torch.mean((features_tensor - reconstructed) ** 2).item()

    # Compute each model's verdict once, as built-in bools, so the returned
    # values are always plain Python types rather than a mix of bool and numpy.bool_.
    iso_flag = bool(iso_pred == -1)
    ae_flag = bool(ae_error > threshold)

    # Ensemble decision (both models agree)
    is_anomaly = iso_flag and ae_flag

    if is_anomaly:
        confidence = 'HIGH'
    elif iso_flag or ae_flag:
        confidence = 'MEDIUM'
    else:
        confidence = 'LOW'

    return {
        'is_anomaly': is_anomaly,
        'iso_score': float(iso_score),
        'ae_error': ae_error,
        'confidence': confidence
    }

# Run the detector on two hand-picked readings and print its verdicts
def _print_detection_summary(result):
    """Print the verdict, confidence and both model scores of one detection result."""
    print(f"  Anomaly: {result['is_anomaly']}, Confidence: {result['confidence']}")
    print(f"  ISO Score: {result['iso_score']:.4f}, AE Error: {result['ae_error']:.6f}\n")


banner = "=" * 70
print("🔍 REAL-TIME ANOMALY DETECTION EXAMPLES\n")
print(banner)

# Normal case
result1 = detect_anomaly_realtime(
    production=3.5, temperature=25, cloud_cover=20, humidity=60, wind_speed=5
)
print("Test 1 - Normal Operation:")
print("  Production: 3.5 kWh, Temp: 25°C, Cloud: 20%")
_print_detection_summary(result1)

# Anomaly case
result2 = detect_anomaly_realtime(
    production=0.5, temperature=28, cloud_cover=15, humidity=55, wind_speed=6
)
print("Test 2 - Low Production (Potential Fault):")
print("  Production: 0.5 kWh, Temp: 28°C, Cloud: 15%")
_print_detection_summary(result2)

print(banner)

## 7. Save Models

In [None]:
import os
import pickle

# Create the output directory up front; without it every write below fails
# with FileNotFoundError on a fresh checkout.
os.makedirs('../models', exist_ok=True)

# Save Isolation Forest
with open('../models/isolation_forest.pkl', 'wb') as f:
    pickle.dump(iso_forest, f)
print(" Isolation Forest saved")

# Save Autoencoder (weights only; the Autoencoder class is needed to reload them)
torch.save(autoencoder.state_dict(), '../models/autoencoder.pth')
print(" Autoencoder saved")

# Save scaler (must be applied to new readings before either model is used)
with open('../models/anomaly_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print(" Scaler saved")

# Save threshold (reconstruction-error cut-off used by the autoencoder detector)
with open('../models/ae_threshold.pkl', 'wb') as f:
    pickle.dump(threshold, f)
print(" Threshold saved")

print("\n All anomaly detection models saved!")

## 8. Summary

**Anomaly Detection Complete! ✅**

**Implemented Methods:**
1. **Isolation Forest**: Efficient outlier detection
2. **Autoencoder**: Deep learning reconstruction-based detection

**Key Capabilities:**
- Real-time fault detection
- Performance degradation identification
- Panel malfunction alerts
- Ensemble approach (an anomaly is reported only when both models agree) for fewer false alarms

**Applications:**
- Predictive maintenance
- System health monitoring
- Early fault warning
- Quality assurance

**Next Steps:**
- Notebook 05: Multi-agent swarm simulation integrating all models