# BGP Anomaly Traffic Generation - Systematic Algorithm Comparison

## Purpose
This notebook implements **synthetic BGP anomaly traffic generation** using multiple algorithms, systematically comparing their performance for generating realistic anomaly packets.

## Data Source
Uses anomaly samples that the reinforcement pipeline labeled with at least medium confidence (three or more detection methods agree):
- `medium_confidence`: 3 methods agree (7,820 samples)
- `high_confidence`: 4 methods agree (7,077 samples)  
- `very_high_confidence`: 5-6 methods agree (6,542 samples)

## Algorithms Implemented
1. **Enhanced Copula Generator** - Statistical approach with KDE marginals
2. **SMOTE** - Synthetic Minority Over-sampling
3. **CTGAN** - Conditional Tabular GAN
4. **TimeGAN** - Temporal sequence generation
5. **VAE** - Variational Autoencoder
6. **GMM** - Gaussian Mixture Model
7. **KDE Sampling** - Kernel Density Estimation
8. **Bootstrap Resampling** - Statistical resampling
9. **TVAE** - Tabular Variational Autoencoder
10. **Hybrid SMOTE-Copula** - Combined approach
11. **DoppelGANger** - Bidirectional LSTM GAN
12. **Diffusion Model** - Denoising diffusion approach

## Anomaly Types
- `dos_attack` - Denial of Service attacks
- `prefix_hijacking` - BGP prefix hijacking
- `path_manipulation` - AS path manipulation

---
## 1. Setup and Imports

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import warnings
import os
import json
from datetime import datetime
from collections import defaultdict
# NOTE: this silences every warning category for the rest of the kernel
# session, including warnings emitted by the libraries imported below.
warnings.filterwarnings('ignore')

# Create output directories
# The paths are relative, so they resolve against the kernel's working
# directory. PLOTS_DIR and DATA_DIR are sub-folders of OUTPUT_DIR, so creating
# them also creates OUTPUT_DIR; exist_ok=True makes the cell safe to re-run.
OUTPUT_DIR = '../results/anomaly_generation'
PLOTS_DIR = f'{OUTPUT_DIR}/plots'
DATA_DIR = f'{OUTPUT_DIR}/generated_data'
os.makedirs(PLOTS_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# 'seaborn-v0_8-whitegrid' is a matplotlib style-sheet name; plt.style.use
# raises if the installed matplotlib does not ship a style with this name.
plt.style.use('seaborn-v0_8-whitegrid')

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

# Statistical methods
from scipy import stats
from scipy.stats import ks_2samp, wasserstein_distance
from scipy.spatial.distance import cdist

# Oversampling
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE

# Density models (Gaussian mixture and kernel density estimation)
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KernelDensity

# Deep Learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Single device handle shared by the neural generators defined later in the
# notebook; falls back to the CPU when CUDA is not available.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"PyTorch device: {DEVICE}")

# Set seeds for reproducibility
# Seeds NumPy's global RNG and PyTorch's RNG. Generators that re-seed in their
# constructors reset this state again when they are instantiated.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

print(f"Output directory: {os.path.abspath(OUTPUT_DIR)}")
print("All imports successful!")

---
## 2. Load Reinforced Anomaly Data

In [None]:
# Load reinforced anomaly dataset
# The primary location can be overridden without editing the notebook by
# setting the BGP_ANOMALY_DATA_PATH environment variable; otherwise the
# original default path is used.
DATA_PATH = os.environ.get(
    'BGP_ANOMALY_DATA_PATH',
    '/home/smotaali/BGP_Traffic_Generation/RIPE/RIPE_INCIDENTS/all_incidents_anomalies_reinforced_v2.csv'
)

# Try alternative paths
alt_paths = [
    'all_incidents_anomalies_reinforced_v2.csv',
    '../RIPE/RIPE_INCIDENTS/all_incidents_anomalies_reinforced_v2.csv',
    '/home/user/BGP_Traffic_Generation/dataset/all_incidents_anomalies_reinforced_v2.csv'
]

# Candidates are tried in order and the first file that exists wins. Only a
# missing file is tolerated; any other read/parse error propagates so that a
# corrupt file is not silently skipped.
candidate_paths = [DATA_PATH] + alt_paths

df = None
for path in candidate_paths:
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        continue
    print(f"Loaded data from: {path}")
    break

if df is None:
    # List every location that was tried so the failure is actionable.
    searched = "\n  ".join(candidate_paths)
    raise FileNotFoundError(
        f"Could not find reinforced anomaly dataset! Searched:\n  {searched}"
    )

print(f"\nDataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

In [None]:
# Display reinforcement columns
# Both tables share one layout: a heading, a 60-character rule, then one line
# per category with its row count and its share of the full dataset.
conf_counts = df['reinforced_confidence_label'].value_counts()
label_counts = df['label'].value_counts()
n_rows_total = len(df)

for heading, tally in (
    ("\nREINFORCED CONFIDENCE DISTRIBUTION:", conf_counts),
    ("\nANOMALY TYPE DISTRIBUTION:", label_counts),
):
    print(heading)
    print("="*60)
    for category, count in tally.items():
        pct = count / n_rows_total * 100
        print(f"  {category}: {count:,} ({pct:.1f}%)")

In [None]:
# Filter for high-confidence samples only
# Rows are kept when their reinforced label is one of the accepted levels; the
# result is an independent copy so later edits do not touch `df`.
HIGH_CONFIDENCE_LABELS = ['medium_confidence', 'high_confidence', 'very_high_confidence']

_accepted_rows = df['reinforced_confidence_label'].isin(HIGH_CONFIDENCE_LABELS)
df_high_conf = df[_accepted_rows].copy()

print(f"\nHIGH-CONFIDENCE SAMPLES FOR TRAINING:")
print("="*60)
print(f"Total high-confidence samples: {len(df_high_conf):,}")

# Per-level row counts, reported in the order of HIGH_CONFIDENCE_LABELS.
print(f"\nBy confidence level:")
_confidence_column = df_high_conf['reinforced_confidence_label']
for conf in HIGH_CONFIDENCE_LABELS:
    count = int((_confidence_column == conf).sum())
    print(f"  {conf}: {count:,}")

# Per-type row counts, reported in order of first appearance in the data.
print(f"\nBy anomaly type:")
_label_column = df_high_conf['label']
for label in _label_column.unique():
    count = int((_label_column == label).sum())
    print(f"  {label}: {count:,}")

In [None]:
# Define feature columns (BGP metrics only)
# Everything that is not listed here is treated as a model feature.
METADATA_COLS = ['label', 'Incident', 'window_start', 'window_end', 
                 'method_agreement_count', 'reinforced_confidence_label',
                 'ensemble_score', 'classifier_confidence', 'incident_coherence',
                 'score_mahalanobis', 'flag_mahalanobis', 'score_ocsvm', 'flag_ocsvm',
                 'score_statistical', 'flag_statistical', 'score_lof', 'flag_lof',
                 'score_isolation_forest', 'flag_isolation_forest',
                 'score_elliptic_envelope', 'flag_elliptic_envelope']

FEATURE_COLS = [col for col in df.columns if col not in METADATA_COLS]
print(f"Feature columns ({len(FEATURE_COLS)}):")
print(FEATURE_COLS)

# Surface non-numeric feature columns early: FEATURE_COLS is left unchanged,
# but such columns should be encoded or added to METADATA_COLS.
_non_numeric_features = [col for col in FEATURE_COLS
                         if not pd.api.types.is_numeric_dtype(df[col].dtype)]
if _non_numeric_features:
    print(f"WARNING: non-numeric feature columns found: {_non_numeric_features}")


def _is_integer_valued(series):
    """Return True when every present value of `series` is a whole number.

    Integer dtypes of any width qualify directly. Other numeric columns
    qualify when all non-missing values are finite and equal to their rounded
    value. Missing values are ignored rather than cast, because casting a
    column that contains NaN/inf to int raises. Non-numeric columns and
    columns with no present values return False.
    """
    if pd.api.types.is_integer_dtype(series.dtype):
        return True
    if not pd.api.types.is_numeric_dtype(series.dtype):
        return False
    present = series.dropna().to_numpy(dtype=float)
    if present.size == 0 or not np.isfinite(present).all():
        return False
    return bool((present == np.round(present)).all())


# Identify integer vs continuous features
INTEGER_FEATURES = [col for col in FEATURE_COLS if _is_integer_valued(df[col])]
CONTINUOUS_FEATURES = [col for col in FEATURE_COLS if col not in INTEGER_FEATURES]

print(f"\nInteger features ({len(INTEGER_FEATURES)}): {INTEGER_FEATURES}")
print(f"Continuous features ({len(CONTINUOUS_FEATURES)}): {CONTINUOUS_FEATURES}")

In [None]:
# Prepare data per anomaly type
# Types are listed in order of first appearance among the filtered rows.
ANOMALY_TYPES = df_high_conf['label'].unique().tolist()
print(f"\nAnomaly types: {ANOMALY_TYPES}")

# Store data per type
# Each entry holds the feature matrix ('X'), its row count ('n_samples') and
# an independent copy of the matching rows with all columns ('df').
data_by_type = {}
for atype in ANOMALY_TYPES:
    rows_of_type = df_high_conf[df_high_conf['label'] == atype]
    X = rows_of_type[FEATURE_COLS].values
    data_by_type[atype] = dict(
        X=X,
        n_samples=len(X),
        df=rows_of_type.copy(),
    )
    print(f"  {atype}: {len(X):,} samples")

---
## 3. Evaluation Framework

In [None]:
class GenerationEvaluator:
    """Comprehensive evaluation framework for synthetic data quality.

    Each call to `evaluate` compares a real and a synthetic feature matrix
    column by column and stores the metrics under the method name, so that
    `get_summary` can rank all evaluated methods by composite score.

    Parameters
    ----------
    feature_names : list of str
        Names of the feature columns, in matrix column order.
    integer_features : list of str, optional
        Names of integer-valued features. Stored on the instance; the metrics
        below do not use it.
    """

    # Column order of the summary table; also used when nothing was evaluated.
    SUMMARY_COLUMNS = ['Method', 'KS Mean', 'KS Max', 'Wasserstein', 'MAE Mean',
                       'Corr MAE', 'Corr Structure', "Cohen's d", 'Composite Score']

    def __init__(self, feature_names, integer_features=None):
        self.feature_names = feature_names
        self.integer_features = integer_features or []
        self.results = {}

    def evaluate(self, real_data, synthetic_data, method_name):
        """Run all evaluation metrics.

        Parameters
        ----------
        real_data, synthetic_data : array-like, shape (n_rows, n_features)
            Matrices whose columns follow `self.feature_names`.
        method_name : str
            Key under which the results are stored in `self.results`.

        Returns
        -------
        dict
            Per-feature KS and Wasserstein scores, mean/std errors,
            correlation-preservation metrics, Cohen's d and a 0-100
            composite score (higher is better).
        """
        real_data = np.asarray(real_data, dtype=float)
        synthetic_data = np.asarray(synthetic_data, dtype=float)
        n_features = len(self.feature_names)

        results = {
            'method': method_name,
            'n_real': len(real_data),
            'n_synthetic': len(synthetic_data)
        }

        # 1. Distribution metrics (KS statistic)
        ks_scores = [
            ks_2samp(real_data[:, i], synthetic_data[:, i]).statistic
            for i in range(n_features)
        ]
        results['ks_mean'] = np.mean(ks_scores)
        results['ks_max'] = np.max(ks_scores)
        results['ks_scores'] = ks_scores

        # 2. Wasserstein distance
        wd_scores = [
            wasserstein_distance(real_data[:, i], synthetic_data[:, i])
            for i in range(n_features)
        ]
        results['wasserstein_mean'] = np.mean(wd_scores)
        results['wasserstein_scores'] = wd_scores

        # 3. Mean/Std comparison
        real_mean = real_data.mean(axis=0)
        synth_mean = synthetic_data.mean(axis=0)
        real_std = real_data.std(axis=0)
        synth_std = synthetic_data.std(axis=0)

        results['mae_mean'] = np.mean(np.abs(real_mean - synth_mean))
        results['mae_std'] = np.mean(np.abs(real_std - synth_std))

        # 4. Correlation structure preservation
        # np.corrcoef returns a scalar for a single feature; atleast_2d keeps
        # the matrix code path valid. Constant columns yield NaN, mapped to 0.
        with np.errstate(invalid='ignore', divide='ignore'):
            real_corr = np.atleast_2d(np.corrcoef(real_data.T))
            synth_corr = np.atleast_2d(np.corrcoef(synthetic_data.T))
        real_corr = np.nan_to_num(real_corr, nan=0.0)
        synth_corr = np.nan_to_num(synth_corr, nan=0.0)

        corr_diff = np.abs(real_corr - synth_corr)
        results['corr_mae'] = np.mean(corr_diff)
        results['corr_max'] = np.max(corr_diff)

        # Correlation structure similarity: correlation between the two sets
        # of pairwise (upper-triangle) correlations. It is undefined with
        # fewer than two feature pairs, in which case NaN is reported.
        mask = np.triu(np.ones_like(real_corr, dtype=bool), k=1)
        if mask.sum() >= 2:
            with np.errstate(invalid='ignore', divide='ignore'):
                results['corr_structure'] = np.corrcoef(
                    real_corr[mask].flatten(),
                    synth_corr[mask].flatten()
                )[0, 1]
        else:
            results['corr_structure'] = np.nan

        # 5. Cohen's d effect size (epsilon guards constant features)
        pooled_std = np.sqrt((real_std**2 + synth_std**2) / 2)
        cohens_d = np.abs(real_mean - synth_mean) / (pooled_std + 1e-10)
        results['cohens_d_mean'] = np.mean(cohens_d)
        results['cohens_d_max'] = np.max(cohens_d)

        # 6. Composite score (0-100, higher is better)
        ks_score = max(0, 100 - results['ks_mean'] * 200)  # KS < 0.1 is good
        # An undefined structure score contributes a neutral 50.
        corr_score = max(0, results['corr_structure'] * 100) if not np.isnan(results['corr_structure']) else 50
        effect_score = max(0, 100 - results['cohens_d_mean'] * 100)  # d < 0.2 is negligible
        wd_norm = results['wasserstein_mean'] / (np.mean(real_std) + 1e-10)
        wd_score = max(0, 100 - wd_norm * 50)

        results['composite_score'] = (
            0.30 * ks_score +
            0.25 * corr_score +
            0.25 * effect_score +
            0.20 * wd_score
        )

        self.results[method_name] = results
        return results

    def get_summary(self):
        """Get summary DataFrame of all evaluated methods.

        Returns a frame with `SUMMARY_COLUMNS`, sorted by composite score in
        descending order. When no method has been evaluated yet, an empty
        frame with the same columns is returned instead of raising.
        """
        summary = [
            {
                'Method': method,
                'KS Mean': res['ks_mean'],
                'KS Max': res['ks_max'],
                'Wasserstein': res['wasserstein_mean'],
                'MAE Mean': res['mae_mean'],
                'Corr MAE': res['corr_mae'],
                'Corr Structure': res['corr_structure'],
                'Cohen\'s d': res['cohens_d_mean'],
                'Composite Score': res['composite_score']
            }
            for method, res in self.results.items()
        ]
        if not summary:
            return pd.DataFrame(columns=self.SUMMARY_COLUMNS)
        return pd.DataFrame(summary).sort_values('Composite Score', ascending=False)

print("Evaluation framework defined.")

In [None]:
def post_process_synthetic(synthetic_data, feature_names, integer_features, real_data,
                           upper_percentile=99.5, upper_margin=1.5):
    """Apply post-processing constraints to synthetic data.

    Steps, applied to a float copy of `synthetic_data`:
      1. values are clipped to ``[0, upper]`` per column, where ``upper`` is
         the `upper_percentile` of the real column times `upper_margin`;
      2. integer features are rounded to whole numbers.

    Clipping happens before rounding, and the bound of an integer feature is
    floored, so integer features stay whole numbers even when the
    percentile-based bound is fractional.

    Parameters
    ----------
    synthetic_data : array-like, shape (n_rows, n_features)
        Generated samples; the input is not modified.
    feature_names : sequence of str
        Column names, in column order.
    integer_features : collection of str or None
        Names of the columns that must hold whole numbers.
    real_data : array-like, shape (n_real, n_features)
        Real samples used to derive the per-column upper bounds.
    upper_percentile : float, default 99.5
        Percentile of the real data used as the base of the upper bound.
    upper_margin : float, default 1.5
        Multiplier applied to that percentile.

    Returns
    -------
    numpy.ndarray of float, same shape as `synthetic_data`.
    """
    synthetic = np.array(synthetic_data, dtype=float)  # always a fresh copy
    real = np.asarray(real_data, dtype=float)
    n_cols = synthetic.shape[1]

    # Mark integer-valued columns; names beyond the matrix width are ignored.
    int_names = set(integer_features) if integer_features is not None else set()
    int_mask = np.zeros(n_cols, dtype=bool)
    for i, feat in enumerate(list(feature_names)[:n_cols]):
        int_mask[i] = feat in int_names

    # Per-column upper bound from the real data; never below zero so that the
    # bound stays compatible with the non-negativity constraint.
    upper = np.percentile(real, upper_percentile, axis=0) * upper_margin
    upper = np.maximum(upper, 0)
    upper[int_mask] = np.floor(upper[int_mask])

    # 1. Non-negativity and realistic upper bounds
    synthetic = np.clip(synthetic, 0, upper)

    # 2. Round integer features (cannot exceed the floored bound)
    synthetic[:, int_mask] = np.round(synthetic[:, int_mask])

    return synthetic

print("Post-processing function defined.")

---
## 4. Generation Algorithms

### 4.1 Enhanced Copula Generator

In [None]:
class EnhancedCopulaGenerator:
    """Copula-based generator with KDE marginals.

    A Gaussian copula captures the dependence between features (a blend of
    Pearson and Spearman correlation). Each feature gets its own marginal:
    an empirical distribution for discrete/sparse columns, a Gaussian KDE for
    the rest. Samples are produced by drawing correlated uniforms from the
    copula and pushing every column through the inverse of its marginal, so
    the generated columns inherit the fitted dependence structure.

    Note: the constructor seeds NumPy's global RNG with `random_state`.
    """

    def __init__(self, random_state=42):
        self.random_state = random_state
        np.random.seed(random_state)
        self.fitted = False

    @staticmethod
    def _hybrid_correlation(X):
        """Return a positive-definite 0.3*Pearson + 0.7*Spearman correlation matrix.

        Spearman is computed as the Pearson correlation of per-column ranks,
        which gives a full matrix for any number of features. NaN entries
        (constant columns) are treated as zero correlation.
        """
        with np.errstate(invalid='ignore', divide='ignore'):
            pearson_corr = np.atleast_2d(np.corrcoef(X.T))
            ranks = stats.rankdata(X, axis=0)
            spearman_corr = np.atleast_2d(np.corrcoef(ranks.T))

        corr = 0.3 * pearson_corr + 0.7 * spearman_corr
        corr = np.nan_to_num(corr, nan=0.0)
        np.fill_diagonal(corr, 1.0)

        # Ensure positive definiteness by flooring the eigenvalues ...
        eigenvalues, eigenvectors = np.linalg.eigh(corr)
        eigenvalues = np.maximum(eigenvalues, 1e-6)
        corr = eigenvectors @ np.diag(eigenvalues) @ eigenvectors.T

        # ... then rescale to unit diagonal so the copula's latent normals
        # have unit variance and norm.cdf maps them to exact uniforms.
        scale = np.sqrt(np.diag(corr))
        corr = corr / np.outer(scale, scale)
        return (corr + corr.T) / 2.0

    @staticmethod
    def _quantile_lookup(sorted_values, u):
        """Inverse empirical CDF: map uniforms in [0, 1] to observed values."""
        n = len(sorted_values)
        idx = np.minimum((u * n).astype(int), n - 1)
        return sorted_values[idx]

    def fit(self, X, feature_names):
        """Fit copula model to data.

        Parameters
        ----------
        X : array-like, shape (n_rows, n_features)
        feature_names : list of str

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype=float)
        self.n_features = X.shape[1]
        self.feature_names = feature_names
        self.X_train = X.copy()

        # Dependence structure and its Cholesky factor for sampling
        self.corr_matrix = self._hybrid_correlation(self.X_train)
        self.cholesky = np.linalg.cholesky(self.corr_matrix)

        # Fit marginals (empirical or KDE for each feature)
        self.marginals = []
        self.marginal_types = []

        for i in range(self.n_features):
            col_data = self.X_train[:, i]
            unique_vals = len(np.unique(col_data))
            zero_ratio = (col_data == 0).mean()

            if unique_vals < 20 or zero_ratio > 0.5:
                # Use empirical CDF for discrete/sparse features
                self.marginals.append({
                    'type': 'empirical',
                    'values': col_data,
                    'sorted_values': np.sort(col_data),
                    'zero_ratio': zero_ratio
                })
                self.marginal_types.append('empirical')
            else:
                # Use KDE for continuous features.
                # Log-transform heavy-tailed columns; only valid for
                # non-negative data because log1p needs values > -1.
                if col_data.min() >= 0 and col_data.max() > 10 * col_data.mean():
                    kde_data = np.log1p(col_data)
                    log_transform = True
                else:
                    kde_data = col_data
                    log_transform = False

                # Silverman-style rule-of-thumb bandwidth with a small floor
                bandwidth = 1.06 * np.std(kde_data) * len(kde_data) ** (-1/5)
                bandwidth = max(bandwidth, 0.01)

                kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
                kde.fit(kde_data.reshape(-1, 1))

                self.marginals.append({
                    'type': 'kde',
                    'model': kde,
                    'log_transform': log_transform,
                    'min_val': col_data.min(),
                    'max_val': col_data.max(),
                    # Kept for copula-driven sampling in `generate`
                    'sorted_data': np.sort(kde_data),
                    'bandwidth': bandwidth
                })
                self.marginal_types.append('kde')

        self.fitted = True
        return self

    def generate(self, n_samples):
        """Generate synthetic samples.

        Returns an array of shape (n_samples, n_features).

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.fitted:
            raise ValueError("Model not fitted.")

        # Generate correlated uniform samples via Gaussian copula
        Z = np.random.randn(n_samples, self.n_features)
        Z_corr = Z @ self.cholesky.T
        U = stats.norm.cdf(Z_corr)  # Transform to uniform [0,1]

        # Transform to marginal distributions, column by column, using the
        # copula uniforms so that cross-feature dependence is carried over.
        synthetic = np.zeros((n_samples, self.n_features))

        for i, marginal in enumerate(self.marginals):
            if marginal['type'] == 'empirical':
                # Inverse empirical CDF: only observed values are produced
                synthetic[:, i] = self._quantile_lookup(marginal['sorted_values'], U[:, i])
            else:
                # A Gaussian KDE draw is "pick a training point, add kernel
                # noise"; the copula uniform picks the point by quantile.
                centers = self._quantile_lookup(marginal['sorted_data'], U[:, i])
                kde_samples = centers + marginal['bandwidth'] * np.random.randn(n_samples)

                if marginal['log_transform']:
                    kde_samples = np.expm1(kde_samples)

                # Clip to observed range (10% headroom above the maximum)
                upper = marginal['max_val'] + 0.1 * abs(marginal['max_val'])
                kde_samples = np.clip(kde_samples, marginal['min_val'], upper)
                synthetic[:, i] = kde_samples

        return synthetic

print("Enhanced Copula Generator defined.")

### 4.2 SMOTE-based Generator

In [None]:
class SMOTEGenerator:
    """SMOTE-based synthetic data generator.

    New rows are created by picking a random training row and interpolating
    towards one of its k nearest neighbours.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed of the generator's private RNG, which makes the output
        reproducible per instance without touching NumPy's global RNG.
    k_neighbors : int, default 5
        Number of neighbours considered for each base row.
    """

    def __init__(self, random_state=42, k_neighbors=5):
        self.random_state = random_state
        self.k_neighbors = k_neighbors
        self._rng = np.random.RandomState(random_state)

    def fit(self, X, feature_names):
        """Store the training rows; returns self."""
        self.X_train = X.copy()
        self.feature_names = feature_names
        self.n_features = X.shape[1]
        return self

    def generate(self, n_samples):
        """Generate synthetic samples using SMOTE interpolation.

        Returns an array of shape (n_samples, n_features). With a single
        training row there is nothing to interpolate towards, so that row is
        repeated.

        Raises
        ------
        ValueError
            If the generator was fitted on an empty matrix.
        """
        X = self.X_train
        n_train = len(X)

        if n_samples <= 0:
            return np.empty((0, self.n_features))
        if n_train == 0:
            raise ValueError("SMOTEGenerator was fitted on an empty dataset.")
        if n_train == 1:
            return np.repeat(np.asarray(X, dtype=float), n_samples, axis=0)

        from sklearn.neighbors import NearestNeighbors

        rng = self._rng
        k = max(1, min(self.k_neighbors, n_train - 1))

        # Fit nearest neighbors (k + 1 because each row is its own neighbour)
        neighbor_index = NearestNeighbors(n_neighbors=k + 1)
        neighbor_index.fit(X)

        # Draw all base rows at once and resolve their neighbours in a single
        # batched query instead of one query per synthetic row.
        base_idx = rng.randint(n_train, size=n_samples)
        base = X[base_idx]
        neighbors = neighbor_index.kneighbors(base, return_distance=False)

        # Column 0 is the query row itself, so pick among columns 1..k
        pick = rng.randint(1, k + 1, size=n_samples)
        partner = X[neighbors[np.arange(n_samples), pick]]

        # Interpolate a random fraction of the way towards the partner
        alpha = rng.random_sample((n_samples, 1))
        return base + alpha * (partner - base)

print("SMOTE Generator defined.")

### 4.3 Gaussian Mixture Model Generator

In [None]:
class GMMGenerator:
    """Generator that samples from a full-covariance Gaussian mixture.

    Parameters
    ----------
    n_components : int, default 10
        Upper limit on the number of mixture components.
    random_state : int, default 42
        Passed to the underlying mixture model.
    """

    def __init__(self, n_components=10, random_state=42):
        self.n_components, self.random_state = n_components, random_state

    def fit(self, X, feature_names):
        """Fit the mixture to `X` and return self."""
        self.feature_names = feature_names
        self.n_features = X.shape[1]
        self.X_train = X.copy()

        # Allow at most one component per ten rows, but never fewer than two.
        rows_based_cap = len(X) // 10
        effective_components = max(min(self.n_components, rows_based_cap), 2)

        mixture_options = dict(
            n_components=effective_components,
            covariance_type='full',
            random_state=self.random_state,
            max_iter=200,
        )
        self.gmm = GaussianMixture(**mixture_options)
        self.gmm.fit(X)
        return self

    def generate(self, n_samples):
        """Draw `n_samples` rows from the fitted mixture (component labels are dropped)."""
        drawn = self.gmm.sample(n_samples)
        return drawn[0]

print("GMM Generator defined.")

### 4.4 KDE Sampling Generator

In [None]:
class KDEGenerator:
    """Multivariate KDE-based generator.

    Parameters
    ----------
    bandwidth : {'scott', 'silverman'} or float, default 'scott'
        Rule of thumb, or an explicit kernel bandwidth. The rules of thumb
        yield a unit-free factor, so in that case the features are
        standardised before fitting and samples are mapped back afterwards.
        An explicit number is applied to the raw features unchanged.
    random_state : int, default 42
        Used to seed NumPy's global RNG in the constructor.

    Raises
    ------
    ValueError
        If `bandwidth` is an unknown rule name.
    """

    _RULES = ('scott', 'silverman')

    def __init__(self, bandwidth='scott', random_state=42):
        if isinstance(bandwidth, str) and bandwidth not in self._RULES:
            raise ValueError(
                f"Unknown bandwidth rule {bandwidth!r}; expected one of {self._RULES} or a number."
            )
        self.bandwidth = bandwidth
        self.random_state = random_state
        np.random.seed(random_state)

    def fit(self, X, feature_names):
        """Fit a Gaussian KDE to `X` and return self."""
        self.feature_names = feature_names
        self.n_features = X.shape[1]
        self.X_train = X.copy()

        X_float = np.asarray(X, dtype=float)
        n = len(X_float)
        d = X_float.shape[1]

        if isinstance(self.bandwidth, str):
            # The kernel uses one bandwidth for every dimension, so features
            # on different scales must be standardised first. Constant
            # columns keep a unit scale to avoid division by zero.
            self._center = X_float.mean(axis=0)
            spread = X_float.std(axis=0)
            self._scale = np.where(spread > 0, spread, 1.0)
            if self.bandwidth == 'scott':
                bw = n ** (-1 / (d + 4))
            else:  # 'silverman'
                bw = (n * (d + 2) / 4) ** (-1 / (d + 4))
        else:
            # Explicit bandwidth: interpreted in raw feature units.
            self._center = np.zeros(d)
            self._scale = np.ones(d)
            bw = self.bandwidth

        self.kde = KernelDensity(bandwidth=bw, kernel='gaussian')
        self.kde.fit((X_float - self._center) / self._scale)
        return self

    def generate(self, n_samples):
        """Generate samples from KDE, returned in the original feature units."""
        standardized = self.kde.sample(n_samples)
        return standardized * self._scale + self._center

print("KDE Generator defined.")

### 4.5 Bootstrap Resampling Generator

In [None]:
class BootstrapGenerator:
    """Resample training rows with replacement and perturb them with Gaussian noise.

    Parameters
    ----------
    noise_level : float, default 0.05
        Noise standard deviation as a fraction of each feature's own
        standard deviation.
    random_state : int, default 42
        Used to seed NumPy's global RNG in the constructor.
    """

    def __init__(self, noise_level=0.05, random_state=42):
        self.noise_level, self.random_state = noise_level, random_state
        np.random.seed(random_state)

    def fit(self, X, feature_names):
        """Remember the training rows and their per-feature spread; returns self."""
        self.feature_names = feature_names
        self.X_train = X.copy()
        self.std = X.std(axis=0)
        return self

    def generate(self, n_samples):
        """Return `n_samples` noisy bootstrap rows."""
        # Row picks come first, then the noise draw, both from the global RNG.
        n_train = len(self.X_train)
        picked_rows = np.random.choice(n_train, size=n_samples, replace=True)
        resampled = self.X_train[picked_rows].copy()

        # Per-feature Gaussian jitter scaled by the feature's std and the noise level.
        jitter = np.random.standard_normal(resampled.shape) * self.std * self.noise_level
        return resampled + jitter

print("Bootstrap Generator defined.")

### 4.6 Variational Autoencoder (VAE) Generator

In [None]:
class VAEGenerator:
    """Variational Autoencoder for tabular data generation.

    Features are min-max scaled to [0, 1], an MLP encoder/decoder pair is
    trained with a reconstruction + KL objective, and new rows are produced
    by decoding standard-normal latent vectors and undoing the scaling.

    Parameters
    ----------
    latent_dim : int, default 16
    hidden_dim : int, default 64
    epochs : int, default 200
    batch_size : int, default 64
    lr : float, default 0.001
    random_state : int, default 42
        Used to seed PyTorch's RNG in the constructor.
    """

    def __init__(self, latent_dim=16, hidden_dim=64, epochs=200, 
                 batch_size=64, lr=0.001, random_state=42):
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.random_state = random_state
        torch.manual_seed(random_state)

    def fit(self, X, feature_names):
        """Train the VAE on `X` and return self.

        Raises
        ------
        ValueError
            If `X` has fewer than two rows (batch normalisation cannot be
            trained on a single row).
        """
        self.feature_names = feature_names
        self.n_features = X.shape[1]
        self.X_train = X.copy()

        if len(X) < 2:
            raise ValueError("VAEGenerator needs at least 2 training samples.")

        # Normalize
        self.scaler = MinMaxScaler()
        X_scaled = self.scaler.fit_transform(X)

        # Build VAE
        self.encoder = nn.Sequential(
            nn.Linear(self.n_features, self.hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(self.hidden_dim),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
        ).to(DEVICE)

        self.fc_mu = nn.Linear(self.hidden_dim, self.latent_dim).to(DEVICE)
        self.fc_var = nn.Linear(self.hidden_dim, self.latent_dim).to(DEVICE)

        # Sigmoid output matches the [0, 1] range of the scaled features
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_dim, self.hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(self.hidden_dim),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.n_features),
            nn.Sigmoid()
        ).to(DEVICE)

        # Training
        params = list(self.encoder.parameters()) + list(self.fc_mu.parameters()) + \
                 list(self.fc_var.parameters()) + list(self.decoder.parameters())
        optimizer = optim.Adam(params, lr=self.lr)

        dataset = TensorDataset(torch.FloatTensor(X_scaled))
        # BatchNorm1d raises in training mode on a batch with a single row.
        # Use batches of at least two rows and drop a trailing one-row batch;
        # with shuffling a different row is left out each epoch.
        effective_batch = max(2, self.batch_size)
        loader = DataLoader(
            dataset,
            batch_size=effective_batch,
            shuffle=True,
            drop_last=(len(dataset) % effective_batch == 1)
        )

        self.encoder.train()
        self.decoder.train()

        for epoch in range(self.epochs):
            total_loss = 0
            for batch in loader:
                x = batch[0].to(DEVICE)

                # Encode
                h = self.encoder(x)
                mu = self.fc_mu(h)
                log_var = self.fc_var(h)

                # Reparameterize
                std = torch.exp(0.5 * log_var)
                eps = torch.randn_like(std)
                z = mu + eps * std

                # Decode
                x_recon = self.decoder(z)

                # Loss: summed reconstruction error plus down-weighted KL term
                recon_loss = nn.functional.mse_loss(x_recon, x, reduction='sum')
                kl_loss = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
                loss = recon_loss + 0.1 * kl_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            if (epoch + 1) % 50 == 0:
                print(f"  VAE Epoch {epoch+1}/{self.epochs}, Loss: {total_loss/len(X_scaled):.4f}")

        return self

    def generate(self, n_samples):
        """Generate samples from VAE.

        Returns an array of shape (n_samples, n_features) in original units.
        """
        # eval() makes BatchNorm use its running statistics
        self.decoder.eval()

        with torch.no_grad():
            z = torch.randn(n_samples, self.latent_dim).to(DEVICE)
            synthetic = self.decoder(z).cpu().numpy()

        # Inverse transform
        synthetic = self.scaler.inverse_transform(synthetic)
        return synthetic

print("VAE Generator defined.")

### 4.7 GAN Generator

In [None]:
class GANGenerator:
    """Basic GAN for tabular data generation.

    Features are min-max scaled to [0, 1]. An MLP generator with batch
    normalisation and an MLP discriminator with dropout are trained with
    binary cross-entropy and smoothed labels (0.9 for real, 0.1 for fake).

    Parameters
    ----------
    latent_dim : int, default 32
    hidden_dim : int, default 128
    epochs : int, default 300
    batch_size : int, default 64
    lr_g, lr_d : float
        Learning rates of the generator and the discriminator.
    random_state : int, default 42
        Used to seed PyTorch's RNG in the constructor.
    """

    def __init__(self, latent_dim=32, hidden_dim=128, epochs=300,
                 batch_size=64, lr_g=0.0002, lr_d=0.0001, random_state=42):
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr_g = lr_g
        self.lr_d = lr_d
        self.random_state = random_state
        torch.manual_seed(random_state)

    def fit(self, X, feature_names):
        """Train the GAN on `X` and return self.

        Raises
        ------
        ValueError
            If `X` has fewer than two rows (the generator's batch
            normalisation cannot be trained on a single row).
        """
        self.feature_names = feature_names
        self.n_features = X.shape[1]
        self.X_train = X.copy()

        if len(X) < 2:
            raise ValueError("GANGenerator needs at least 2 training samples.")

        # Normalize
        self.scaler = MinMaxScaler()
        X_scaled = self.scaler.fit_transform(X)

        # Generator (sigmoid output matches the [0, 1] scaled features)
        self.generator = nn.Sequential(
            nn.Linear(self.latent_dim, self.hidden_dim),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(self.hidden_dim),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.LeakyReLU(0.2),
            nn.BatchNorm1d(self.hidden_dim),
            nn.Linear(self.hidden_dim, self.n_features),
            nn.Sigmoid()
        ).to(DEVICE)

        # Discriminator
        self.discriminator = nn.Sequential(
            nn.Linear(self.n_features, self.hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(self.hidden_dim, self.hidden_dim // 2),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(self.hidden_dim // 2, 1),
            nn.Sigmoid()
        ).to(DEVICE)

        # Optimizers
        opt_g = optim.Adam(self.generator.parameters(), lr=self.lr_g, betas=(0.5, 0.999))
        opt_d = optim.Adam(self.discriminator.parameters(), lr=self.lr_d, betas=(0.5, 0.999))

        criterion = nn.BCELoss()

        dataset = TensorDataset(torch.FloatTensor(X_scaled))
        # The fake batch mirrors the size of the real batch, and BatchNorm1d
        # raises in training mode on a single row. Use batches of at least
        # two rows and drop a trailing one-row batch.
        effective_batch = max(2, self.batch_size)
        loader = DataLoader(
            dataset,
            batch_size=effective_batch,
            shuffle=True,
            drop_last=(len(dataset) % effective_batch == 1)
        )

        for epoch in range(self.epochs):
            g_loss_total = 0
            d_loss_total = 0

            for batch in loader:
                real = batch[0].to(DEVICE)
                n_rows = real.size(0)

                # Labels with smoothing
                real_labels = torch.ones(n_rows, 1).to(DEVICE) * 0.9
                fake_labels = torch.zeros(n_rows, 1).to(DEVICE) + 0.1

                # Train Discriminator
                self.discriminator.zero_grad()

                # Real samples
                real_pred = self.discriminator(real)
                d_loss_real = criterion(real_pred, real_labels)

                # Fake samples (detached: no generator gradients in this step)
                z = torch.randn(n_rows, self.latent_dim).to(DEVICE)
                fake = self.generator(z)
                fake_pred = self.discriminator(fake.detach())
                d_loss_fake = criterion(fake_pred, fake_labels)

                d_loss = d_loss_real + d_loss_fake
                d_loss.backward()
                opt_d.step()

                # Train Generator (targets are un-smoothed ones)
                self.generator.zero_grad()

                z = torch.randn(n_rows, self.latent_dim).to(DEVICE)
                fake = self.generator(z)
                fake_pred = self.discriminator(fake)
                g_loss = criterion(fake_pred, torch.ones(n_rows, 1).to(DEVICE))

                g_loss.backward()
                opt_g.step()

                g_loss_total += g_loss.item()
                d_loss_total += d_loss.item()

            if (epoch + 1) % 100 == 0:
                print(f"  GAN Epoch {epoch+1}/{self.epochs}, G_loss: {g_loss_total/len(loader):.4f}, D_loss: {d_loss_total/len(loader):.4f}")

        return self

    def generate(self, n_samples):
        """Generate samples from GAN.

        Returns an array of shape (n_samples, n_features) in original units.
        """
        # eval() makes BatchNorm use its running statistics
        self.generator.eval()

        with torch.no_grad():
            z = torch.randn(n_samples, self.latent_dim).to(DEVICE)
            synthetic = self.generator(z).cpu().numpy()

        synthetic = self.scaler.inverse_transform(synthetic)
        return synthetic

print("GAN Generator defined.")

### 4.8 WGAN-GP Generator

In [None]:
class WGANGPGenerator:
    """Wasserstein GAN with gradient penalty for tabular rows.

    Features are min-max scaled to [0, 1]. The critic is updated `n_critic`
    times per generator update, and its loss carries a gradient penalty
    weighted by `lambda_gp`.

    Parameters
    ----------
    latent_dim : int, default 32
    hidden_dim : int, default 128
    epochs : int, default 300
    batch_size : int, default 64
    lr : float, default 0.0001
        Learning rate shared by the generator and the critic.
    n_critic : int, default 5
    lambda_gp : float, default 10
    random_state : int, default 42
        Used to seed PyTorch's RNG in the constructor.
    """

    def __init__(self, latent_dim=32, hidden_dim=128, epochs=300,
                 batch_size=64, lr=0.0001, n_critic=5, lambda_gp=10, random_state=42):
        self.latent_dim, self.hidden_dim = latent_dim, hidden_dim
        self.epochs, self.batch_size = epochs, batch_size
        self.lr = lr
        self.n_critic, self.lambda_gp = n_critic, lambda_gp
        self.random_state = random_state
        torch.manual_seed(random_state)

    def _sample_latent(self, n_rows):
        """Draw `n_rows` standard-normal latent vectors and move them to DEVICE."""
        return torch.randn(n_rows, self.latent_dim).to(DEVICE)

    def gradient_penalty(self, real, fake):
        """Return the mean squared deviation of the critic's gradient norm from 1.

        The gradient is taken at random per-row blends of `real` and `fake`.
        """
        n_rows = real.size(0)
        mix = torch.rand(n_rows, 1).to(DEVICE)
        blended = (mix * real + (1 - mix) * fake).requires_grad_(True)

        scores = self.critic(blended)

        (grads,) = torch.autograd.grad(
            outputs=scores,
            inputs=blended,
            grad_outputs=torch.ones_like(scores),
            create_graph=True,
            retain_graph=True
        )

        row_norms = grads.view(n_rows, -1).norm(2, dim=1)
        return ((row_norms - 1) ** 2).mean()

    def fit(self, X, feature_names):
        """Train generator and critic on `X`; returns self."""
        self.feature_names = feature_names
        self.n_features = X.shape[1]
        self.X_train = X.copy()

        self.scaler = MinMaxScaler()
        X_scaled = self.scaler.fit_transform(X)

        # Generator is built before the critic so that weight initialisation
        # consumes the RNG in a fixed order.
        generator_layers = [
            nn.Linear(self.latent_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.n_features),
            nn.Sigmoid(),
        ]
        self.generator = nn.Sequential(*generator_layers).to(DEVICE)

        # Critic outputs an unbounded score (no sigmoid for WGAN)
        half_hidden = self.hidden_dim // 2
        critic_layers = [
            nn.Linear(self.n_features, self.hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(self.hidden_dim, half_hidden),
            nn.LeakyReLU(0.2),
            nn.Linear(half_hidden, 1),
        ]
        self.critic = nn.Sequential(*critic_layers).to(DEVICE)

        adam_options = dict(lr=self.lr, betas=(0.0, 0.9))
        opt_g = optim.Adam(self.generator.parameters(), **adam_options)
        opt_c = optim.Adam(self.critic.parameters(), **adam_options)

        loader = DataLoader(
            TensorDataset(torch.FloatTensor(X_scaled)),
            batch_size=self.batch_size,
            shuffle=True,
        )

        for epoch in range(self.epochs):
            for (real_rows,) in loader:
                real = real_rows.to(DEVICE)
                n_rows = real.size(0)

                # Critic updates: several steps on the same real batch, each
                # against a freshly generated (detached) fake batch.
                for _ in range(self.n_critic):
                    self.critic.zero_grad()

                    fake = self.generator(self._sample_latent(n_rows)).detach()

                    c_real = self.critic(real).mean()
                    c_fake = self.critic(fake).mean()
                    gp = self.gradient_penalty(real, fake)

                    c_loss = c_fake - c_real + self.lambda_gp * gp
                    c_loss.backward()
                    opt_c.step()

                # Generator update: maximise the critic's score on fakes
                self.generator.zero_grad()
                fake = self.generator(self._sample_latent(n_rows))
                g_loss = -self.critic(fake).mean()
                g_loss.backward()
                opt_g.step()

            if (epoch + 1) % 100 == 0:
                print(f"  WGAN-GP Epoch {epoch+1}/{self.epochs}")

        return self

    def generate(self, n_samples):
        """Return `n_samples` generated rows in original feature units."""
        self.generator.eval()
        with torch.no_grad():
            scaled = self.generator(self._sample_latent(n_samples)).cpu().numpy()
        return self.scaler.inverse_transform(scaled)

print("WGAN-GP Generator defined.")

### 4.9 Diffusion Model Generator

In [None]:
class DiffusionGenerator:
    """Simple DDPM-style diffusion model for tabular data.

    A small MLP is trained to predict the Gaussian noise that was mixed into a
    standardised row at a random timestep. Sampling starts from pure noise and
    applies the ancestral (reverse) update for every timestep.

    Args:
        n_steps: number of diffusion timesteps.
        hidden_dim: width of the two hidden layers of the denoiser.
        epochs: training epochs.
        batch_size: mini-batch size.
        lr: Adam learning rate.
        random_state: seed passed to ``torch.manual_seed``.
    """

    def __init__(self, n_steps=100, hidden_dim=128, epochs=200,
                 batch_size=64, lr=0.001, random_state=42):
        self.n_steps = n_steps
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.random_state = random_state
        torch.manual_seed(random_state)

        # Linear noise schedule: beta_t, alpha_t = 1 - beta_t, and the running
        # product alpha_bar_t = prod_{s<=t} alpha_s.
        self.betas = torch.linspace(0.0001, 0.02, n_steps)
        self.alphas = 1 - self.betas
        self.alpha_cumprod = torch.cumprod(self.alphas, dim=0)

    def fit(self, X, feature_names):
        """Train the denoising network on ``X``.

        Args:
            X: 2-D array of real samples (rows x features); it is standardised
                with a ``StandardScaler`` fitted here.
            feature_names: column names, stored for reference only.

        Returns:
            self
        """
        self.feature_names = feature_names
        self.n_features = X.shape[1]
        self.X_train = X.copy()

        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X)

        # Denoising network: input is the noisy row plus one scalar time feature.
        self.model = nn.Sequential(
            nn.Linear(self.n_features + 1, self.hidden_dim),  # +1 for time embedding
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.n_features)
        ).to(DEVICE)

        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        dataset = TensorDataset(torch.FloatTensor(X_scaled))
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        # Indexed with device-resident timesteps below, so it must live there too.
        self.alpha_cumprod = self.alpha_cumprod.to(DEVICE)

        for epoch in range(self.epochs):
            total_loss = 0
            for batch in loader:
                x0 = batch[0].to(DEVICE)
                batch_size = x0.size(0)

                # One random timestep per row
                t = torch.randint(0, self.n_steps, (batch_size,)).to(DEVICE)

                # Forward process: x_t = sqrt(abar_t) * x0 + sqrt(1 - abar_t) * eps
                noise = torch.randn_like(x0)
                alpha_t = self.alpha_cumprod[t].unsqueeze(1)
                x_noisy = torch.sqrt(alpha_t) * x0 + torch.sqrt(1 - alpha_t) * noise

                # Time embedding: timestep scaled into [0, 1)
                t_emb = (t.float() / self.n_steps).unsqueeze(1)

                # Predict the injected noise
                x_input = torch.cat([x_noisy, t_emb], dim=1)
                noise_pred = self.model(x_input)

                loss = nn.functional.mse_loss(noise_pred, noise)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            if (epoch + 1) % 50 == 0:
                print(f"  Diffusion Epoch {epoch+1}/{self.epochs}, Loss: {total_loss/len(loader):.4f}")

        return self

    def generate(self, n_samples):
        """Draw ``n_samples`` rows by running the reverse diffusion chain.

        Each step applies the DDPM ancestral update
        ``x_{t-1} = (x_t - beta_t / sqrt(1 - abar_t) * eps_hat) / sqrt(alpha_t) + sigma_t * z``
        where ``sigma_t**2 = (1 - abar_{t-1}) / (1 - abar_t) * beta_t`` and no
        noise is added on the final step (t == 0).

        Args:
            n_samples: number of rows to generate.

        Returns:
            Generated rows mapped back to the original feature scale through
            ``self.scaler.inverse_transform``.
        """
        self.model.eval()

        # Follow the denoiser's own device rather than a notebook global, and
        # bring the whole schedule there (fit() only moves alpha_cumprod).
        device = next(self.model.parameters()).device
        betas = self.betas.to(device)
        alphas = self.alphas.to(device)
        alpha_bar = self.alpha_cumprod.to(device)

        with torch.no_grad():
            # Start from pure noise
            x = torch.randn(n_samples, self.n_features).to(device)

            for t in reversed(range(self.n_steps)):
                t_batch = torch.full((n_samples,), t, dtype=torch.long).to(device)
                # Same time feature as used during training
                t_emb = (t_batch.float() / self.n_steps).unsqueeze(1)

                noise_pred = self.model(torch.cat([x, t_emb], dim=1))

                beta_t = betas[t]
                alpha_bar_t = alpha_bar[t]

                # Posterior mean of x_{t-1} given x_t and the predicted noise.
                # The per-step alpha_t (not the cumulative product) rescales x.
                x = (x - beta_t / (1 - alpha_bar_t).sqrt() * noise_pred) / alphas[t].sqrt()

                if t > 0:
                    # Posterior standard deviation; skipped on the last step
                    alpha_bar_prev = alpha_bar[t - 1]
                    sigma = ((1 - alpha_bar_prev) / (1 - alpha_bar_t) * beta_t).sqrt()
                    x = x + sigma * torch.randn_like(x)

            synthetic = x.cpu().numpy()

        return self.scaler.inverse_transform(synthetic)

print("Diffusion Generator defined.")

### 4.10 Hybrid SMOTE-Copula Generator

In [None]:
class HybridSMOTECopulaGenerator:
    """Generator that mixes the output of a SMOTE and a copula generator.

    Args:
        smote_ratio: fraction of the requested rows taken from the SMOTE
            generator; the remainder comes from the copula generator.
        random_state: seed forwarded to both sub-generators.
    """

    def __init__(self, smote_ratio=0.5, random_state=42):
        self.smote_ratio = smote_ratio
        self.random_state = random_state

    def fit(self, X, feature_names):
        """Fit both sub-generators on the same data and return ``self``."""
        self.feature_names = feature_names
        self.smote_gen = SMOTEGenerator(random_state=self.random_state)
        self.copula_gen = EnhancedCopulaGenerator(random_state=self.random_state)

        # SMOTE is fitted first, then the copula.
        for sub_generator in (self.smote_gen, self.copula_gen):
            sub_generator.fit(X, feature_names)
        return self

    def generate(self, n_samples):
        """Return ``n_samples`` rows: the SMOTE rows stacked above the copula rows."""
        smote_count = int(n_samples * self.smote_ratio)  # truncates toward zero
        parts = [
            self.smote_gen.generate(smote_count),
            self.copula_gen.generate(n_samples - smote_count),
        ]
        return np.vstack(parts)

print("Hybrid SMOTE-Copula Generator defined.")

### 4.11 CTGAN-style Generator

In [None]:
class CTGANGenerator:
    """CTGAN-style tabular generator: per-column mode normalisation plus a WGAN.

    Every column is squashed into (-1, 1) with ``tanh``. Columns with more than
    10 distinct values are first normalised by the mean/std of the Gaussian
    mixture component each row is assigned to; other columns use a plain
    mean/std. A residual generator and a weight-clipped critic are then
    trained with the Wasserstein loss.

    Args:
        latent_dim: size of the generator's noise input.
        hidden_dim: hidden width of generator and critic.
        epochs: training epochs.
        batch_size: maximum mini-batch size (capped at the dataset size).
        random_state: seed for torch, the mixture models, and mode sampling.
    """

    def __init__(self, latent_dim=64, hidden_dim=256, epochs=300,
                 batch_size=500, random_state=42):
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.random_state = random_state
        torch.manual_seed(random_state)
        # Private RNG used by generate() to pick a mixture mode per row.
        self._rng = np.random.RandomState(random_state)

    def fit(self, X, feature_names):
        """Fit the column transformers and train the GAN on ``X``.

        Args:
            X: 2-D numpy array of real samples (rows x features).
            feature_names: column names, stored for reference only.

        Returns:
            self

        Raises:
            ValueError: if ``X`` has fewer than 2 rows (BatchNorm cannot be
                trained on a single-row batch).
        """
        self.feature_names = feature_names
        self.n_features = X.shape[1]
        self.X_train = X.copy()

        if len(X) < 2:
            raise ValueError("CTGANGenerator.fit needs at least 2 samples")

        X_transformed = self._fit_transformers(X)
        self._build_networks()

        opt_g = optim.Adam(self.generator.parameters(), lr=2e-4, betas=(0.5, 0.9))
        opt_d = optim.Adam(self.discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9))

        dataset = TensorDataset(torch.FloatTensor(X_transformed))
        # drop_last=True discards the short trailing batch. Cap the batch size
        # at the dataset size, otherwise a dataset smaller than batch_size
        # yields zero batches and the networks are silently never trained.
        effective_batch = min(self.batch_size, len(dataset))
        loader = DataLoader(dataset, batch_size=effective_batch, shuffle=True, drop_last=True)

        for epoch in range(self.epochs):
            for batch in loader:
                real = batch[0].to(DEVICE)
                batch_size = real.size(0)

                # Train D (critic): maximise D(real) - D(fake)
                self.discriminator.zero_grad()
                z = torch.randn(batch_size, self.latent_dim).to(DEVICE)
                fake = self.generator(z)

                d_real = self.discriminator(real)
                d_fake = self.discriminator(fake.detach())

                d_loss = -torch.mean(d_real) + torch.mean(d_fake)
                d_loss.backward()
                opt_d.step()

                # Weight clipping keeps the critic's weights bounded
                for p in self.discriminator.parameters():
                    p.data.clamp_(-0.01, 0.01)

                # Train G: maximise D(G(z))
                self.generator.zero_grad()
                z = torch.randn(batch_size, self.latent_dim).to(DEVICE)
                fake = self.generator(z)
                g_loss = -torch.mean(self.discriminator(fake))
                g_loss.backward()
                opt_g.step()

            if (epoch + 1) % 100 == 0:
                print(f"  CTGAN Epoch {epoch+1}/{self.epochs}")

        return self

    def _fit_transformers(self, X):
        """Fit one normaliser per column and return the tanh-squashed matrix."""
        self.transformers = []
        columns = []

        for i in range(self.n_features):
            col = X[:, i]
            n_unique = len(np.unique(col))
            # One mixture mode per ~10 distinct values, between 1 and 5
            n_modes = max(min(5, n_unique // 10 + 1), 1)

            if n_unique > 10:
                gmm = GaussianMixture(n_components=n_modes, random_state=self.random_state)
                gmm.fit(col.reshape(-1, 1))

                # Normalise every value by the component it is assigned to
                modes = gmm.predict(col.reshape(-1, 1))
                self.transformers.append({
                    'type': 'gmm',
                    'model': gmm,
                    # Empirical share of rows per mode, reused when inverting
                    'mode_probs': np.bincount(modes, minlength=n_modes) / len(modes),
                })
                means = gmm.means_.flatten()[modes]
                stds = np.sqrt(gmm.covariances_.flatten())[modes] + 1e-6
                normalized = np.tanh((col - means) / (4 * stds))
            else:
                # Simple normalisation for low-cardinality columns
                mean, std = col.mean(), col.std() + 1e-6
                self.transformers.append({'type': 'simple', 'mean': mean, 'std': std})
                normalized = np.tanh((col - mean) / std / 4)

            columns.append(normalized)

        return np.column_stack(columns)

    def _build_networks(self):
        """Create the residual generator and the critic on DEVICE."""

        class ResidualBlock(nn.Module):
            """Two linear+BatchNorm layers with a skip connection."""

            def __init__(self, dim):
                super().__init__()
                self.fc = nn.Sequential(
                    nn.Linear(dim, dim),
                    nn.BatchNorm1d(dim),
                    nn.ReLU(),
                    nn.Linear(dim, dim),
                    nn.BatchNorm1d(dim)
                )
                self.relu = nn.ReLU()

            def forward(self, x):
                return self.relu(x + self.fc(x))

        self.generator = nn.Sequential(
            nn.Linear(self.latent_dim, self.hidden_dim),
            nn.BatchNorm1d(self.hidden_dim),
            nn.ReLU(),
            ResidualBlock(self.hidden_dim),
            ResidualBlock(self.hidden_dim),
            nn.Linear(self.hidden_dim, self.n_features),
            nn.Tanh()  # matches the (-1, 1) range of the transformed data
        ).to(DEVICE)

        self.discriminator = nn.Sequential(
            nn.Linear(self.n_features, self.hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.5),
            nn.Linear(self.hidden_dim, self.hidden_dim // 2),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.5),
            nn.Linear(self.hidden_dim // 2, 1)
        ).to(DEVICE)

    def _inverse_transform(self, synthetic):
        """Map generator output in (-1, 1) back to the original column scales."""
        rng = getattr(self, '_rng', np.random)
        result = np.zeros_like(synthetic)

        for i, trans in enumerate(self.transformers):
            # Clip away from +/-1 so arctanh stays finite
            col = np.arctanh(np.clip(synthetic[:, i], -0.99, 0.99))

            if trans['type'] == 'gmm':
                gmm = trans['model']
                means = gmm.means_.flatten()
                stds = np.sqrt(gmm.covariances_.flatten()) + 1e-6
                # The forward transform used a different component per row, so
                # always inverting with component 0 collapses multimodal
                # columns onto one mode. The generator emits no mode indicator,
                # so draw a mode per row from the training mode frequencies
                # (independently per column - an approximation).
                probs = np.asarray(trans.get('mode_probs', gmm.weights_), dtype=float)
                probs = probs / probs.sum()
                modes = rng.choice(len(means), size=len(col), p=probs)
                result[:, i] = col * 4 * stds[modes] + means[modes]
            else:
                result[:, i] = col * 4 * trans['std'] + trans['mean']

        return result

    def generate(self, n_samples):
        """Generate ``n_samples`` rows in the original feature scale.

        Args:
            n_samples: number of rows to generate.

        Returns:
            numpy array of shape ``(n_samples, n_features)``.
        """
        self.generator.eval()
        # Follow the generator's own device rather than a notebook global
        device = next(self.generator.parameters()).device

        with torch.no_grad():
            z = torch.randn(n_samples, self.latent_dim).to(device)
            synthetic = self.generator(z).cpu().numpy()

        return self._inverse_transform(synthetic)

print("CTGAN Generator defined.")

### 4.12 Autoencoder-based Generator

In [None]:
class AutoencoderGenerator:
    """Denoising autoencoder that generates by jittering stored latent codes.

    The autoencoder is trained to reconstruct clean rows from noise-corrupted
    inputs. New rows are produced by resampling the latent codes of the
    training rows, adding Gaussian jitter, and decoding.

    Args:
        latent_dim: size of the latent code.
        hidden_dim: width of the hidden layer in encoder and decoder.
        epochs: training epochs.
        batch_size: mini-batch size.
        noise_factor: std of the Gaussian noise added to inputs during training.
        random_state: seed passed to ``torch.manual_seed``.
        latent_noise: std of the Gaussian jitter added to the resampled latent
            codes in ``generate`` (previously hard-coded to 0.2).
    """

    def __init__(self, latent_dim=16, hidden_dim=64, epochs=200,
                 batch_size=64, noise_factor=0.3, random_state=42,
                 latent_noise=0.2):
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.noise_factor = noise_factor
        self.random_state = random_state
        self.latent_noise = latent_noise
        torch.manual_seed(random_state)

    def fit(self, X, feature_names):
        """Train the autoencoder on ``X`` and cache the training latent codes.

        Args:
            X: 2-D array of real samples; scaled to [0, 1] with a MinMaxScaler.
            feature_names: column names, stored for reference only.

        Returns:
            self
        """
        self.feature_names = feature_names
        self.n_features = X.shape[1]
        self.X_train = X.copy()

        self.scaler = MinMaxScaler()
        X_scaled = self.scaler.fit_transform(X)

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(self.n_features, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.latent_dim)
        ).to(DEVICE)

        # Decoder; the sigmoid keeps outputs inside the scaler's [0, 1] range
        self.decoder = nn.Sequential(
            nn.Linear(self.latent_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.n_features),
            nn.Sigmoid()
        ).to(DEVICE)

        params = list(self.encoder.parameters()) + list(self.decoder.parameters())
        optimizer = optim.Adam(params, lr=0.001)

        dataset = TensorDataset(torch.FloatTensor(X_scaled))
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            total_loss = 0
            for batch in loader:
                x = batch[0].to(DEVICE)

                # Corrupt the input, staying inside the scaled [0, 1] range
                noise = torch.randn_like(x) * self.noise_factor
                x_noisy = torch.clamp(x + noise, 0, 1)

                # Forward
                z = self.encoder(x_noisy)
                x_recon = self.decoder(z)

                # Reconstruct the clean row, not the corrupted one
                loss = nn.functional.mse_loss(x_recon, x)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            if (epoch + 1) % 50 == 0:
                print(f"  AE Epoch {epoch+1}/{self.epochs}, Loss: {total_loss/len(loader):.4f}")

        # Store latent representations of the (clean) training rows
        self.encoder.eval()
        with torch.no_grad():
            X_tensor = torch.FloatTensor(X_scaled).to(DEVICE)
            self.latent_codes = self.encoder(X_tensor).cpu().numpy()

        return self

    def generate(self, n_samples):
        """Generate ``n_samples`` rows by decoding jittered training latents.

        Uses numpy's global random state for the resampling and the jitter.

        Args:
            n_samples: number of rows to generate.

        Returns:
            Decoded rows mapped back through ``self.scaler.inverse_transform``.
        """
        self.decoder.eval()
        # Follow the decoder's own device rather than a notebook global
        device = next(self.decoder.parameters()).device

        # Resample stored latent codes with replacement and add Gaussian jitter
        indices = np.random.choice(len(self.latent_codes), n_samples, replace=True)
        latent_samples = self.latent_codes[indices]
        latent_samples = latent_samples + np.random.randn(*latent_samples.shape) * self.latent_noise

        with torch.no_grad():
            z = torch.FloatTensor(latent_samples).to(device)
            synthetic = self.decoder(z).cpu().numpy()

        return self.scaler.inverse_transform(synthetic)

print("Autoencoder Generator defined.")

---
## 5. Run Systematic Comparison

In [None]:
# Registry of every generator to evaluate: display name -> zero-argument factory.
# Factories (rather than instances) let the comparison loop build a fresh,
# unfitted generator for each anomaly type. Dict insertion order is the order
# in which the generators are run.
GENERATORS = {
    # Statistical / resampling approaches
    'Copula': lambda: EnhancedCopulaGenerator(random_state=RANDOM_STATE),
    'SMOTE': lambda: SMOTEGenerator(random_state=RANDOM_STATE, k_neighbors=5),
    'GMM': lambda: GMMGenerator(n_components=10, random_state=RANDOM_STATE),
    'KDE': lambda: KDEGenerator(bandwidth='scott', random_state=RANDOM_STATE),
    'Bootstrap': lambda: BootstrapGenerator(noise_level=0.05, random_state=RANDOM_STATE),
    # Neural approaches (epochs here override the class defaults)
    'VAE': lambda: VAEGenerator(latent_dim=16, hidden_dim=64, epochs=150, random_state=RANDOM_STATE),
    'GAN': lambda: GANGenerator(latent_dim=32, hidden_dim=128, epochs=200, random_state=RANDOM_STATE),
    'WGAN-GP': lambda: WGANGPGenerator(latent_dim=32, hidden_dim=128, epochs=200, random_state=RANDOM_STATE),
    'Diffusion': lambda: DiffusionGenerator(n_steps=50, hidden_dim=128, epochs=150, random_state=RANDOM_STATE),
    'Hybrid-SMOTE-Copula': lambda: HybridSMOTECopulaGenerator(smote_ratio=0.5, random_state=RANDOM_STATE),
    'CTGAN': lambda: CTGANGenerator(latent_dim=64, hidden_dim=256, epochs=200, random_state=RANDOM_STATE),
    'Autoencoder': lambda: AutoencoderGenerator(latent_dim=16, hidden_dim=64, epochs=150, random_state=RANDOM_STATE),
}

# Echo the registry so the run log records exactly what was compared.
print(f"Total generators to evaluate: {len(GENERATORS)}")
print(f"Generators: {list(GENERATORS.keys())}")

In [None]:
# Number of synthetic samples to generate per anomaly type
N_SYNTHETIC = 5000

# Store all results
all_results = {}        # anomaly type -> summary table from the evaluator
generated_data = {}     # anomaly type -> {generator name -> synthetic array}
generation_errors = {}  # anomaly type -> {generator name -> "ExcType: message"}

for atype in ANOMALY_TYPES:
    print(f"\n{'='*80}")
    print(f"PROCESSING ANOMALY TYPE: {atype.upper()}")
    print(f"{'='*80}")

    X_real = data_by_type[atype]['X']
    n_real = len(X_real)

    print(f"Real samples: {n_real:,}")
    print(f"Synthetic samples to generate: {N_SYNTHETIC:,}")

    # Fresh evaluator per anomaly type so its summary only covers this type
    evaluator = GenerationEvaluator(FEATURE_COLS, INTEGER_FEATURES)
    generated_data[atype] = {}
    generation_errors[atype] = {}

    for gen_name, gen_factory in GENERATORS.items():
        print(f"\n--- {gen_name} ---")
        try:
            # Create a new, unfitted generator
            generator = gen_factory()

            # Fit
            print(f"  Fitting on {n_real:,} samples...")
            generator.fit(X_real, FEATURE_COLS)

            # Generate
            print(f"  Generating {N_SYNTHETIC:,} synthetic samples...")
            X_synthetic = generator.generate(N_SYNTHETIC)

            # Post-process
            X_synthetic = post_process_synthetic(X_synthetic, FEATURE_COLS, INTEGER_FEATURES, X_real)

            # Evaluate
            results = evaluator.evaluate(X_real, X_synthetic, gen_name)

            print(f"  KS Mean: {results['ks_mean']:.4f}")
            print(f"  Composite Score: {results['composite_score']:.2f}/100")

            # Store generated data only after a successful evaluation
            generated_data[atype][gen_name] = X_synthetic

        except Exception as e:
            # Best effort: one failing generator must not abort the comparison.
            # Keep the exception type - str(e) alone is often empty or cryptic.
            generation_errors[atype][gen_name] = f"{type(e).__name__}: {e}"
            print(f"  ERROR: {type(e).__name__}: {e}")

    # Store results
    all_results[atype] = evaluator.get_summary()

    print(f"\n{'-'*60}")
    print(f"RESULTS FOR {atype.upper()}:")
    print(all_results[atype].to_string())

    if generation_errors[atype]:
        print(f"FAILED GENERATORS: {', '.join(generation_errors[atype])}")

---
## 6. Results Visualization

In [None]:
# Composite score comparison across all anomaly types (one panel per type)
fig, axes = plt.subplots(1, len(ANOMALY_TYPES), figsize=(6*len(ANOMALY_TYPES), 8),
                         squeeze=False)
# squeeze=False always returns a 2-D array, so a single panel needs no special case
axes = axes.ravel()

for idx, atype in enumerate(ANOMALY_TYPES):
    ax = axes[idx]
    df_results = all_results[atype]

    # Map the 0-100 score onto the red -> green colormap
    colors = plt.cm.RdYlGn(df_results['Composite Score'] / 100)

    bars = ax.barh(range(len(df_results)), df_results['Composite Score'], color=colors)
    ax.set_yticks(range(len(df_results)))
    ax.set_yticklabels(df_results['Method'])
    ax.set_xlabel('Composite Score (0-100)')
    ax.set_title(f'{atype}\nGenerator Performance', fontsize=12, fontweight='bold')
    ax.set_xlim(0, 100)
    ax.axvline(80, color='green', linestyle='--', alpha=0.5, label='Good (80)')
    ax.axvline(60, color='orange', linestyle='--', alpha=0.5, label='Acceptable (60)')
    # The threshold labels above are only rendered when a legend is drawn
    ax.legend(loc='lower right', fontsize=8)

    # Print the numeric score just past the end of each bar
    for bar, score in zip(bars, df_results['Composite Score']):
        ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
                f'{score:.1f}', va='center', fontsize=9)

plt.tight_layout()
plt.savefig(f'{PLOTS_DIR}/01_composite_score_comparison.png', dpi=150, bbox_inches='tight')
print(f"Plot saved: {PLOTS_DIR}/01_composite_score_comparison.png")
plt.show()

In [None]:
# KS statistic comparison (one panel per anomaly type, lower is better)
fig, axes = plt.subplots(1, len(ANOMALY_TYPES), figsize=(6*len(ANOMALY_TYPES), 8),
                         squeeze=False)
# squeeze=False always returns a 2-D array, so a single panel needs no special case
axes = axes.ravel()

for idx, atype in enumerate(ANOMALY_TYPES):
    ax = axes[idx]
    df_results = all_results[atype].sort_values('KS Mean')

    # Normalise to [0, 1] for the colormap; avoid 0/0 when every KS value is zero
    ks_peak = df_results['KS Mean'].max()
    if ks_peak > 0:
        ks_scaled = df_results['KS Mean'] / ks_peak
    else:
        ks_scaled = df_results['KS Mean'] * 0.0
    colors = plt.cm.RdYlGn_r(ks_scaled)

    bars = ax.barh(range(len(df_results)), df_results['KS Mean'], color=colors)
    ax.set_yticks(range(len(df_results)))
    ax.set_yticklabels(df_results['Method'])
    ax.set_xlabel('KS Statistic (lower is better)')
    ax.set_title(f'{atype}\nDistribution Match (KS)', fontsize=12, fontweight='bold')
    ax.axvline(0.1, color='green', linestyle='--', alpha=0.5, label='Good (<0.1)')
    ax.axvline(0.2, color='orange', linestyle='--', alpha=0.5, label='Acceptable (<0.2)')
    # The threshold labels above are only rendered when a legend is drawn
    ax.legend(loc='lower right', fontsize=8)

plt.tight_layout()
plt.savefig(f'{PLOTS_DIR}/02_ks_statistic_comparison.png', dpi=150, bbox_inches='tight')
print(f"Plot saved: {PLOTS_DIR}/02_ks_statistic_comparison.png")
plt.show()

In [None]:
# Correlation structure preservation (one panel per anomaly type, higher is better)
fig, axes = plt.subplots(1, len(ANOMALY_TYPES), figsize=(6*len(ANOMALY_TYPES), 8),
                         squeeze=False)
# squeeze=False always returns a 2-D array, so a single panel needs no special case
axes = axes.ravel()

for idx, atype in enumerate(ANOMALY_TYPES):
    ax = axes[idx]
    df_results = all_results[atype].sort_values('Corr Structure', ascending=False)

    # The score is plotted on a 0-1 axis, so it is fed to the colormap directly
    colors = plt.cm.RdYlGn(df_results['Corr Structure'])

    bars = ax.barh(range(len(df_results)), df_results['Corr Structure'], color=colors)
    ax.set_yticks(range(len(df_results)))
    ax.set_yticklabels(df_results['Method'])
    ax.set_xlabel('Correlation Structure Preservation')
    ax.set_title(f'{atype}\nCorrelation Preservation', fontsize=12, fontweight='bold')
    ax.set_xlim(0, 1)
    # Label the threshold lines and draw a legend so the reader can tell what
    # they mean, as the composite-score and KS plots do
    ax.axvline(0.9, color='green', linestyle='--', alpha=0.5, label='Good (0.9)')
    ax.axvline(0.7, color='orange', linestyle='--', alpha=0.5, label='Acceptable (0.7)')
    ax.legend(loc='lower right', fontsize=8)

plt.tight_layout()
plt.savefig(f'{PLOTS_DIR}/03_correlation_preservation.png', dpi=150, bbox_inches='tight')
print(f"Plot saved: {PLOTS_DIR}/03_correlation_preservation.png")
plt.show()

In [None]:
# Feature distribution comparison for best method per anomaly type
for atype in ANOMALY_TYPES:
    df_results = all_results[atype]
    if df_results.empty:
        # Every generator failed for this type; iloc[0] below would raise
        print(f"\n{atype}: no successful generator runs, skipping feature plots")
        continue

    # NOTE(review): assumes the summary table is ordered best-first - confirm
    # against GenerationEvaluator.get_summary()
    best_method = df_results.iloc[0]['Method']

    print(f"\n{atype}: Best method = {best_method}")

    X_real = data_by_type[atype]['X']
    X_synth = generated_data[atype].get(best_method)

    if X_synth is None:
        continue

    # Plot first 12 features on a fixed 3x4 grid
    n_plot = min(12, len(FEATURE_COLS))
    fig, axes = plt.subplots(3, 4, figsize=(16, 12))
    axes = axes.flatten()

    for i in range(n_plot):
        ax = axes[i]
        feat = FEATURE_COLS[i]

        ax.hist(X_real[:, i], bins=50, alpha=0.6, label='Real', color='blue', density=True)
        ax.hist(X_synth[:, i], bins=50, alpha=0.6, label='Synthetic', color='orange', density=True)

        # Per-feature two-sample KS statistic shown in the panel title
        ks_stat, _ = ks_2samp(X_real[:, i], X_synth[:, i])
        ax.set_title(f'{feat}\nKS={ks_stat:.3f}', fontsize=10)
        ax.legend(fontsize=8)

    # Hide grid cells that have no feature to show (fewer than 12 features)
    for unused_ax in axes[n_plot:]:
        unused_ax.set_visible(False)

    plt.suptitle(f'{atype} - {best_method}: Feature Distributions', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/04_feature_dist_{atype}.png', dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so memory does not grow with the number of anomaly types
    plt.close(fig)

---
## 7. Summary Report

In [None]:
print("\n" + "="*80)
print("BGP ANOMALY GENERATION - SYSTEMATIC COMPARISON SUMMARY")
print("="*80)

print("\n1. DATASET OVERVIEW")
print("-"*40)
print(f"   High-confidence samples used: {len(df_high_conf):,}")
print(f"   Anomaly types: {len(ANOMALY_TYPES)}")
print(f"   Features: {len(FEATURE_COLS)}")
print(f"   Generators evaluated: {len(GENERATORS)}")

print("\n2. BEST GENERATOR PER ANOMALY TYPE")
print("-"*40)
best_overall = []  # (anomaly type, method, composite score) of each winner
for atype in ANOMALY_TYPES:
    df_results = all_results[atype]
    print(f"\n   {atype}:")
    if df_results.empty:
        # Every generator failed for this type; iloc[0] below would raise
        print("      No generator produced results")
        continue
    # NOTE(review): assumes the summary table is ordered best-first - confirm
    # against GenerationEvaluator.get_summary()
    best = df_results.iloc[0]
    print(f"      Best: {best['Method']}")
    print(f"      Composite Score: {best['Composite Score']:.2f}/100")
    print(f"      KS Mean: {best['KS Mean']:.4f}")
    print(f"      Corr Structure: {best['Corr Structure']:.4f}")
    best_overall.append((atype, best['Method'], best['Composite Score']))

print("\n3. GENERATOR RANKING (Average Composite Score)")
print("-"*40)

# Aggregate scores across all anomaly types. A method that failed for some
# type is averaged over the types where it succeeded.
avg_scores = defaultdict(list)
for atype in ANOMALY_TYPES:
    for _, row in all_results[atype].iterrows():
        avg_scores[row['Method']].append(row['Composite Score'])

ranking = [(method, np.mean(scores)) for method, scores in avg_scores.items()]
ranking.sort(key=lambda x: x[1], reverse=True)

for i, (method, score) in enumerate(ranking, 1):
    print(f"   {i:2d}. {method:<25} {score:.2f}/100")

print("\n4. OUTPUT FILES")
print("-"*40)
print(f"   Plots: {os.path.abspath(PLOTS_DIR)}")
print(f"   Data: {os.path.abspath(DATA_DIR)}")

print("\n" + "="*80)
print("COMPARISON COMPLETE")
print("="*80)

---
## 8. Export Best Generated Data

In [None]:
# Export synthetic data from best generators (one CSV per anomaly type)
for atype in ANOMALY_TYPES:
    df_results = all_results[atype]
    if df_results.empty:
        # Every generator failed for this type; iloc[0] below would raise
        print(f"Skipped: {atype} (no successful generator runs)")
        continue

    # NOTE(review): assumes the summary table is ordered best-first - confirm
    # against GenerationEvaluator.get_summary()
    best_method = df_results.iloc[0]['Method']

    X_synth = generated_data[atype].get(best_method)
    if X_synth is None:
        print(f"Skipped: {atype} (no stored samples for {best_method})")
        continue

    # Create DataFrame with provenance columns alongside the features
    df_synth = pd.DataFrame(X_synth, columns=FEATURE_COLS)
    df_synth['label'] = atype
    df_synth['source'] = 'synthetic'
    df_synth['generator'] = best_method

    # Save; the generator name is lower-cased and dashes become underscores
    output_path = f'{DATA_DIR}/synthetic_{atype}_{best_method.lower().replace("-", "_")}.csv'
    df_synth.to_csv(output_path, index=False)
    print(f"Saved: {output_path} ({len(df_synth):,} samples)")

In [None]:
# Export complete comparison results as one long-format CSV
# (one row per anomaly type x generator).

# Output column -> column of the per-type summary table, in output order.
_summary_fields = {
    'method': 'Method',
    'composite_score': 'Composite Score',
    'ks_mean': 'KS Mean',
    'ks_max': 'KS Max',
    'wasserstein': 'Wasserstein',
    'corr_mae': 'Corr MAE',
    'corr_structure': 'Corr Structure',
    'cohens_d': "Cohen's d",
}

results_summary = []
for atype in ANOMALY_TYPES:
    for _, row in all_results[atype].iterrows():
        record = {'anomaly_type': atype}
        record.update({out_key: row[src_col] for out_key, src_col in _summary_fields.items()})
        results_summary.append(record)

df_summary = pd.DataFrame(results_summary)
df_summary.to_csv(f'{DATA_DIR}/generation_comparison_results.csv', index=False)
print(f"\nComparison results saved to: {DATA_DIR}/generation_comparison_results.csv")

In [None]:
# Display final summary table: methods as rows, anomaly types as columns,
# composite score in the cells.
print("\nFINAL RESULTS TABLE:")
_pivot_spec = dict(
    index='method',
    columns='anomaly_type',
    values='composite_score',
    aggfunc='first',  # each (method, anomaly type) pair is appended once
)
_score_table = df_summary.pivot_table(**_pivot_spec)
print(_score_table.round(2).to_string())