# Gaussian Process Uncertainty-Aware Detection with Multi-Scale Temporal Modeling for Cloud-Based Network Intrusion Detection Systems

# First modeling
## GAUSSIAN PROCESS UNCERTAINTY-AWARE DETECTION SYSTEM
## Complete Implementation for Integrated Cloud Security 3-Datasets (ICS3D)
## Builds on the Q1-Level Paper, Using Real Cloud Security Data



In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
import gpytorch
from gpytorch.models import ApproximateGP
from gpytorch.variational import CholeskyVariationalDistribution, UnwhitenedVariationalStrategy
from gpytorch.kernels import ScaleKernel, RBFKernel, PeriodicKernel, MaternKernel, AdditiveKernel, ProductKernel
from gpytorch.means import ConstantMean, LinearMean
from gpytorch.likelihoods import BernoulliLikelihood
from gpytorch.mlls import VariationalELBO

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, calibration_curve
from sklearn.cluster import KMeans
from scipy.stats import entropy, ks_2samp, norm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import kagglehub
from pathlib import Path
import json
import pickle
from typing import List, Tuple, Dict, Optional, Union
warnings.filterwarnings('ignore')

# Choose the compute device (CUDA when present, CPU otherwise) and report it
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    # Only query GPU properties when a CUDA device was actually selected
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Fix the NumPy and PyTorch global seeds so runs are repeatable
np.random.seed(42)
torch.manual_seed(42)

# ============================================================================
# SECTION 1: ICS3D DATASET LOADER AND PREPROCESSOR
# ============================================================================

class ICS3DDatasetLoader:
    """
    Comprehensive loader for Integrated Cloud Security 3-Datasets
    Handles multiple cloud providers' data formats
    """
    
    def __init__(self, base_path: str = None):
        """Initialize dataset loader"""
        if base_path is None:
            # Download from Kaggle
            self.base_path = kagglehub.dataset_download(
                "rogernickanaedevha/integrated-cloud-security-3datasets-ics3d"
            )
            print(f"Dataset downloaded to: {self.base_path}")
        else:
            self.base_path = Path(base_path)
        
        # Dataset configurations
        self.datasets = {
            'containers': {
                'file': 'Containers_Dataset.csv',
                'label_col': 'label',
                'time_col': 'timestamp',
                'provider': 'docker/kubernetes'
            },
            'edge_iot_dnn': {
                'file': 'DNN-EdgeIIoT-dataset.csv',
                'label_col': 'Attack_type',
                'time_col': 'frame.time',
                'provider': 'edge_computing'
            },
            'edge_iot_ml': {
                'file': 'ML-EdgeIIoT-dataset.csv',
                'label_col': 'Attack_type',
                'time_col': 'frame.time',
                'provider': 'edge_computing'
            },
            'microsoft_train': {
                'file': 'Microsoft_GUIDE_Train.csv',
                'label_col': 'Class',
                'time_col': 'Time',
                'provider': 'azure'
            },
            'microsoft_test': {
                'file': 'Microsoft_GUIDE_Test.csv',
                'label_col': 'Class',
                'time_col': 'Time',
                'provider': 'azure'
            }
        }
        
        self.loaded_data = {}
        self.preprocessed_data = {}
        
    def load_all_datasets(self) -> Dict[str, pd.DataFrame]:
        """Load all available datasets"""
        print("\n" + "="*60)
        print("LOADING ICS3D DATASETS")
        print("="*60)
        
        for name, config in self.datasets.items():
            file_path = Path(self.base_path) / config['file']
            
            if file_path.exists():
                print(f"\nLoading {name}...")
                df = pd.read_csv(file_path, low_memory=False)
                
                # Basic info
                print(f"  Shape: {df.shape}")
                print(f"  Provider: {config['provider']}")
                
                # Check for label column
                if config['label_col'] in df.columns:
                    attack_rate = (df[config['label_col']] != 'Normal').mean()
                    print(f"  Attack rate: {attack_rate:.2%}")
                
                self.loaded_data[name] = df
            else:
                print(f"  Warning: {config['file']} not found")
        
        return self.loaded_data
    
    def preprocess_dataset(self, dataset_name: str) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
        """
        Preprocess a specific dataset for GP model
        
        Returns:
            X: Feature matrix
            y: Labels (0=normal, 1=attack)
            metadata: Additional information (timestamps, attack types, etc.)
        """
        if dataset_name not in self.loaded_data:
            raise ValueError(f"Dataset {dataset_name} not loaded")
        
        df = self.loaded_data[dataset_name].copy()
        config = self.datasets[dataset_name]
        
        print(f"\nPreprocessing {dataset_name}...")
        
        # Handle label encoding
        if config['label_col'] in df.columns:
            # Convert labels to binary (normal=0, attack=1)
            if df[config['label_col']].dtype == 'object':
                y = (df[config['label_col']] != 'Normal').astype(int).values
                
                # Store attack types
                attack_types = df[config['label_col']].unique()
                print(f"  Found {len(attack_types)} unique attack types")
            else:
                y = df[config['label_col']].values
        else:
            # If no labels, assume all normal
            y = np.zeros(len(df))
        
        # Extract metadata
        metadata = pd.DataFrame()
        
        # Add timestamp if available
        if config['time_col'] in df.columns:
            metadata['timestamp'] = pd.to_datetime(df[config['time_col']], errors='coerce')
        else:
            # Create synthetic timestamps
            metadata['timestamp'] = pd.date_range(
                start='2024-01-01', 
                periods=len(df), 
                freq='S'
            )
        
        # Add attack type information
        if config['label_col'] in df.columns:
            metadata['attack_type'] = df[config['label_col']]
        
        # Feature engineering based on dataset type
        if 'containers' in dataset_name:
            X = self._process_container_features(df)
        elif 'edge_iot' in dataset_name:
            X = self._process_edge_iot_features(df)
        elif 'microsoft' in dataset_name:
            X = self._process_microsoft_features(df)
        else:
            # Generic processing
            X = self._process_generic_features(df, config)
        
        print(f"  Final shape: X={X.shape}, y={y.shape}")
        print(f"  Attack ratio: {y.mean():.2%}")
        
        self.preprocessed_data[dataset_name] = {
            'X': X,
            'y': y,
            'metadata': metadata
        }
        
        return X, y, metadata
    
    def _process_container_features(self, df: pd.DataFrame) -> np.ndarray:
        """Process container/Kubernetes specific features"""
        feature_cols = []
        
        # Network features
        network_cols = [col for col in df.columns if any(x in col.lower() for x in 
                       ['packet', 'byte', 'flow', 'port', 'protocol', 'ip'])]
        feature_cols.extend(network_cols)
        
        # Container metrics
        container_cols = [col for col in df.columns if any(x in col.lower() for x in 
                         ['cpu', 'memory', 'disk', 'container', 'pod', 'node'])]
        feature_cols.extend(container_cols)
        
        # Remove duplicates and non-numeric
        feature_cols = list(set(feature_cols))
        
        # Extract features
        X = []
        for col in feature_cols:
            if col in df.columns:
                if df[col].dtype in ['float64', 'int64']:
                    X.append(df[col].fillna(0).values)
                elif df[col].dtype == 'object':
                    # Encode categorical
                    le = LabelEncoder()
                    X.append(le.fit_transform(df[col].fillna('unknown')))
        
        if X:
            X = np.column_stack(X)
        else:
            # Fallback to all numeric columns
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            X = df[numeric_cols].fillna(0).values
        
        return X
    
    def _process_edge_iot_features(self, df: pd.DataFrame) -> np.ndarray:
        """Process Edge/IoT specific features"""
        # IoT-specific features
        iot_features = []
        
        # Protocol features
        protocol_cols = [col for col in df.columns if any(x in col.lower() for x in 
                        ['tcp', 'udp', 'http', 'mqtt', 'coap', 'dns'])]
        
        # Flow statistics
        flow_cols = [col for col in df.columns if any(x in col.lower() for x in 
                    ['duration', 'packet', 'byte', 'rate', 'iat', 'flag'])]
        
        # Device features
        device_cols = [col for col in df.columns if any(x in col.lower() for x in 
                      ['device', 'sensor', 'actuator', 'gateway'])]
        
        all_cols = list(set(protocol_cols + flow_cols + device_cols))
        
        # Extract numeric features
        X = []
        for col in all_cols:
            if col in df.columns:
                if df[col].dtype in ['float64', 'int64']:
                    X.append(df[col].fillna(0).values)
        
        if X:
            X = np.column_stack(X)
        else:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            X = df[numeric_cols].fillna(0).values
        
        return X
    
    def _process_microsoft_features(self, df: pd.DataFrame) -> np.ndarray:
        """Process Microsoft Azure specific features"""
        # Azure-specific features
        azure_features = []
        
        # Cloud service features
        service_cols = [col for col in df.columns if any(x in col.lower() for x in 
                       ['vm', 'storage', 'network', 'compute', 'sql', 'cosmos'])]
        
        # Security features
        security_cols = [col for col in df.columns if any(x in col.lower() for x in 
                        ['firewall', 'nsg', 'ddos', 'waf', 'threat'])]
        
        # Performance metrics
        perf_cols = [col for col in df.columns if any(x in col.lower() for x in 
                    ['latency', 'throughput', 'iops', 'bandwidth', 'cpu', 'memory'])]
        
        all_cols = list(set(service_cols + security_cols + perf_cols))
        
        X = []
        for col in all_cols:
            if col in df.columns and df[col].dtype in ['float64', 'int64']:
                X.append(df[col].fillna(0).values)
        
        if X:
            X = np.column_stack(X)
        else:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            X = df[numeric_cols].fillna(0).values
        
        return X
    
    def _process_generic_features(self, df: pd.DataFrame, config: dict) -> np.ndarray:
        """Generic feature processing"""
        # Get all numeric columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        
        # Remove label column if present
        if config['label_col'] in numeric_cols:
            numeric_cols.remove(config['label_col'])
        
        # Extract features
        if numeric_cols:
            X = df[numeric_cols].fillna(0).values
        else:
            # If no numeric columns, create dummy features
            X = np.random.randn(len(df), 10)
        
        return X
    
    def create_unified_dataset(self) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
        """
        Create a unified dataset combining all sources
        
        Returns:
            X: Combined feature matrix
            y: Combined labels
            metadata: Combined metadata with source information
        """
        print("\n" + "="*60)
        print("CREATING UNIFIED ICS3D DATASET")
        print("="*60)
        
        X_list = []
        y_list = []
        metadata_list = []
        
        for name in self.preprocessed_data:
            data = self.preprocessed_data[name]
            
            # Add source information to metadata
            data['metadata']['source'] = name
            
            X_list.append(data['X'])
            y_list.append(data['y'])
            metadata_list.append(data['metadata'])
        
        # Combine all data
        if X_list:
            # Pad features to same dimensionality
            max_features = max(X.shape[1] for X in X_list)
            
            X_padded = []
            for X in X_list:
                if X.shape[1] < max_features:
                    padding = np.zeros((X.shape[0], max_features - X.shape[1]))
                    X = np.hstack([X, padding])
                X_padded.append(X)
            
            X_unified = np.vstack(X_padded)
            y_unified = np.hstack(y_list)
            metadata_unified = pd.concat(metadata_list, ignore_index=True)
            
            print(f"\nUnified dataset shape: {X_unified.shape}")
            print(f"Total samples: {len(X_unified):,}")
            print(f"Features: {X_unified.shape[1]}")
            print(f"Attack rate: {y_unified.mean():.2%}")
            
            # Dataset composition
            print("\nDataset composition:")
            for source in metadata_unified['source'].unique():
                count = (metadata_unified['source'] == source).sum()
                pct = count / len(metadata_unified) * 100
                print(f"  {source}: {count:,} samples ({pct:.1f}%)")
            
            return X_unified, y_unified, metadata_unified
        else:
            raise ValueError("No data available for unification")

# ============================================================================
# SECTION 2: ADVANCED MULTI-SCALE GP FOR CLOUD SECURITY
# ============================================================================

class CloudSecurityGP(ApproximateGP):
    """
    Specialized Gaussian Process for Cloud Security
    Implements methodology from Q1 paper
    """
    
    def __init__(self, inducing_points: torch.Tensor,
                 feature_dim: int,
                 cloud_provider: str = 'multi',
                 config: dict = None):
        """
        Initialize Cloud Security GP

        Args:
            inducing_points: Inducing points for sparse GP
            feature_dim: Number of input features
            cloud_provider: One of 'aws', 'azure', 'gcp', 'multi'
                (only 'multi' versus anything else changes the mean function)
            config: Model configuration. Keys that are missing fall back to
                the defaults below, so a partial dict is accepted.
        """
        defaults = {
            'num_inducing': 500,
            'learn_inducing': True,
            'use_ard': True,
            'num_mixtures': 4,
            'use_spectral': True,
            'use_matern': True,
            'cloud_specific': True
        }
        # Merge instead of replace, so a partial config cannot raise KeyError later
        merged_config = {**defaults, **(config or {})}

        # Variational setup
        variational_distribution = CholeskyVariationalDistribution(
            inducing_points.size(0)
        )
        variational_strategy = UnwhitenedVariationalStrategy(
            self, inducing_points, variational_distribution,
            learn_inducing_locations=merged_config['learn_inducing']
        )

        super().__init__(variational_strategy)

        self.config = merged_config
        self.feature_dim = feature_dim
        self.cloud_provider = cloud_provider

        # Track kernel components. This registry has to exist BEFORE the
        # kernel is built, because _build_cloud_kernel() writes into it.
        # Creating it afterwards raised AttributeError and would also have
        # wiped the entries.
        self.kernel_components = {}

        # Build model components
        self.mean_module = self._build_mean_function()
        self.covar_module = self._build_cloud_kernel()
        
    def _build_mean_function(self):
        """Return the GP mean module for the configured provider.

        ``cloud_provider == 'multi'`` gets a ``LinearMean`` over all
        ``feature_dim`` inputs; any other provider gets a ``ConstantMean``.
        """
        is_single_provider = self.cloud_provider != 'multi'
        return ConstantMean() if is_single_provider else LinearMean(self.feature_dim)
    
    def _build_cloud_kernel(self):
        """Build kernel structure for cloud security

        The result is an ``AdditiveKernel`` over:
          1. one spatial RBF kernel (ARD when ``config['use_ard']``),
          2. six RBF kernels with log-normal lengthscale priors at different scales,
          3. three periodic kernels (hourly / daily / weekly period priors),
          4. a Matern-2.5 and a short-lengthscale RBF kernel when
             ``config['cloud_specific']``,
          5. a spectral mixture kernel when ``config['use_spectral']``.

        Every component is also stored in ``self.kernel_components`` by name.

        NOTE(review): none of the "temporal"/periodic kernels sets
        ``active_dims``, so they act on all input features rather than on a
        dedicated time column. Confirm this is intended.
        NOTE(review): ``config['use_matern']`` is not read here; the Matern
        kernel is controlled by ``config['cloud_specific']``.

        Returns:
            The composite ``AdditiveKernel``.
        """
        # The registry may not exist yet if this runs before the caller created it
        if getattr(self, 'kernel_components', None) is None:
            self.kernel_components = {}

        cfg = self.config
        kernels = []

        # 1. Spatial kernel for feature similarities
        if cfg.get('use_ard', True):
            spatial_kernel = ScaleKernel(
                RBFKernel(
                    ard_num_dims=self.feature_dim,
                    lengthscale_prior=gpytorch.priors.LogNormalPrior(0.0, 1.0)
                )
            )
        else:
            spatial_kernel = ScaleKernel(RBFKernel())

        kernels.append(spatial_kernel)
        self.kernel_components['spatial'] = spatial_kernel

        # 2. Multi-scale temporal kernels (from paper)
        time_scales = [
            ('microsecond', -6, 0.5),  # Side-channel attacks
            ('millisecond', -3, 0.5),  # Buffer overflows
            ('second', 0, 0.5),        # SYN floods
            ('minute', 2, 0.5),        # Port scanning
            ('hour', 4, 0.5),          # Data exfiltration
            ('day', 5, 0.5)            # APT campaigns
        ]

        for name, loc, scale in time_scales:
            # Plain floats: torch.tensor(<int>) would create an int64 prior location
            kernel = ScaleKernel(
                RBFKernel(
                    lengthscale_prior=gpytorch.priors.LogNormalPrior(
                        float(loc), float(scale)
                    )
                )
            )
            kernels.append(kernel)
            self.kernel_components[f'temporal_{name}'] = kernel

        # 3. Periodic kernels for cyclical patterns
        periods = [
            ('hourly', 3600),
            ('daily', 86400),
            ('weekly', 604800)
        ]

        for name, period in periods:
            # float(...) avoids a float64 location paired with a float32 scale
            kernel = ScaleKernel(
                PeriodicKernel(
                    period_length_prior=gpytorch.priors.LogNormalPrior(
                        float(np.log(period)), 0.1
                    )
                )
            )
            kernels.append(kernel)
            self.kernel_components[f'periodic_{name}'] = kernel

        # 4. Cloud-specific kernels
        if cfg.get('cloud_specific', True):
            # Matérn kernel for rough attack patterns
            matern_kernel = ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    lengthscale_prior=gpytorch.priors.LogNormalPrior(0.0, 1.0)
                )
            )
            kernels.append(matern_kernel)
            self.kernel_components['matern'] = matern_kernel

            # Container/microservice kernel
            container_kernel = ScaleKernel(
                RBFKernel(
                    lengthscale_prior=gpytorch.priors.LogNormalPrior(-2.0, 0.5)
                )
            )
            kernels.append(container_kernel)
            self.kernel_components['container'] = container_kernel

        # 5. Spectral mixture for complex patterns
        if cfg.get('use_spectral', True):
            from gpytorch.kernels import SpectralMixtureKernel
            # ard_num_dims has to equal the input width: the kernel is fed the
            # same feature_dim-wide inputs as the other components, and a
            # value of 1 does not match them.
            spectral_kernel = SpectralMixtureKernel(
                num_mixtures=cfg.get('num_mixtures', 4),
                ard_num_dims=self.feature_dim
            )
            kernels.append(spectral_kernel)
            self.kernel_components['spectral'] = spectral_kernel

        # Combine kernels
        return AdditiveKernel(*kernels)
    
    def forward(self, x):
        """Evaluate the mean and covariance modules at ``x``.

        Returns:
            A ``MultivariateNormal`` built from ``self.mean_module(x)`` and
            ``self.covar_module(x)``.
        """
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )
    
    def get_kernel_decomposition(self, x):
        """Summarise each registered kernel component evaluated at ``x``.

        Gradients are disabled while the kernels are evaluated.

        Returns:
            Dict keyed by component name; each value holds the dense kernel
            matrix as a NumPy array ('matrix') plus its 'trace', 'mean' and
            'std' as Python floats.
        """
        def _summarise(component):
            dense = component(x).evaluate()
            return {
                'matrix': dense.cpu().numpy(),
                'trace': dense.trace().item(),
                'mean': dense.mean().item(),
                'std': dense.std().item()
            }

        with torch.no_grad():
            return {
                label: _summarise(component)
                for label, component in self.kernel_components.items()
            }

# ============================================================================
# SECTION 3: CLOUD-AWARE UNCERTAINTY DETECTION SYSTEM
# ============================================================================

class CloudUncertaintyDetector:
    """
    Production-ready uncertainty-aware detection for cloud environments
    Implements full methodology from Q1 paper
    """
    
    def __init__(self, feature_dim: int, cloud_provider: str = 'multi', config: dict = None):
        """Initialize cloud security detector

        Args:
            feature_dim: Number of input features.
            cloud_provider: Provider tag stored on the instance.
            config: Detector configuration. Keys that are missing fall back
                to the defaults below, so a partial dict is accepted.
        """
        self.feature_dim = feature_dim
        self.cloud_provider = cloud_provider
        self.device = device

        # Configuration from paper
        defaults = {
            'num_inducing': 500,
            'batch_size': 256,
            'learning_rate': 0.01,
            'epochs': 50,
            'uncertainty_weight': 0.5,
            'entropy_weight': 0.3,
            'adaptive_threshold': True,
            'adversarial_training': True,
            'epsilon': 0.1
        }
        # Merge instead of replace: other methods index self.config directly,
        # so a partial config would otherwise raise KeyError there.
        self.config = {**defaults, **(config or {})}

        # Models (self.model is left unset here and must be assigned before training)
        self.model = None
        self.likelihood = BernoulliLikelihood().to(self.device)

        # Metrics tracking
        self.metrics = {
            'training': [],
            'validation': [],
            'calibration': [],
            'temporal': [],
            'adversarial': [],
            'per_provider': {}
        }

        # Baseline statistics (read by adaptive_anomaly_detection when set)
        self.baseline_stats = None

        # Drift detection
        self.drift_detector = CloudDriftDetector()
        
    def select_adversarial_inducing_points(self, X: torch.Tensor, y: torch.Tensor,
                                          epsilon: float = 0.1,
                                          iterations: int = 10) -> torch.Tensor:
        """
        Select adversarially robust inducing points (from paper methodology)

        K-means centres initialise the inducing points. When
        ``config['adversarial_training']`` is set they are then refined with
        Adam to minimise (a) the mean distance from each perturbed sample to
        its nearest inducing point and (b) a term that rewards spread between
        distinct inducing points.

        Args:
            X: Training features
            y: Training labels
            epsilon: Perturbation budget
            iterations: Number of adversarial iterations

        Returns:
            Detached tensor of inducing points on ``self.device``.
        """
        print("\nSelecting adversarial inducing points...")

        # Initialize with k-means. Always request at least one cluster:
        # X.shape[0] // 10 is 0 for fewer than 10 samples.
        n_inducing = max(1, min(self.config['num_inducing'], X.shape[0] // 10))
        kmeans = KMeans(n_clusters=n_inducing, random_state=42)
        kmeans.fit(X.detach().cpu().numpy())
        inducing_points = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32).to(self.device)

        if self.config['adversarial_training']:
            # Make inducing points robust
            inducing_points.requires_grad_(True)
            optimizer = optim.Adam([inducing_points], lr=0.01)

            # Generate adversarial perturbations using PGD. The attack depends
            # only on (X, y, epsilon), never on the inducing points, so it is
            # computed once instead of once per iteration.
            X_adv = self._pgd_attack(X[:1000], y[:1000], epsilon, steps=5)
            # cdist needs both operands on the same device and dtype
            X_adv = X_adv.to(device=inducing_points.device, dtype=inducing_points.dtype)

            # Off-diagonal mask: a point's distance to itself is always 0 and
            # carries no diversity information
            off_diagonal = ~torch.eye(
                n_inducing, dtype=torch.bool, device=inducing_points.device
            )

            for iter_idx in range(iterations):
                # Compute coverage under perturbations
                distances = torch.cdist(X_adv, inducing_points)
                coverage_loss = distances.min(dim=1)[0].mean()

                total_loss = coverage_loss

                # Add diversity term (needs at least two inducing points)
                if n_inducing > 1:
                    pairwise_distances = torch.cdist(inducing_points, inducing_points)
                    diversity_loss = -torch.log(pairwise_distances[off_diagonal] + 1e-6).mean()
                    total_loss = total_loss + 0.1 * diversity_loss

                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()

                if (iter_idx + 1) % 5 == 0:
                    print(f"  Iteration {iter_idx + 1}: Loss = {total_loss.item():.4f}")

            inducing_points = inducing_points.detach()

        return inducing_points
    
    def _pgd_attack(self, X: torch.Tensor, y: torch.Tensor, 
                    epsilon: float, steps: int = 10) -> torch.Tensor:
        """Projected Gradient Descent attack for adversarial training"""
        X_adv = X.clone().detach()
        X_adv.requires_grad = True
        
        for _ in range(steps):
            # Simplified attack (would use actual model in practice)
            loss = nn.functional.binary_cross_entropy_with_logits(
                X_adv.mean(dim=1), y.float()
            )
            
            grad = torch.autograd.grad(loss, X_adv)[0]
            X_adv = X_adv.detach() + epsilon * grad.sign()
            
            # Project back to epsilon ball
            delta = torch.clamp(X_adv - X, min=-epsilon, max=epsilon)
            X_adv = X + delta
            X_adv.requires_grad = True
        
        return X_adv.detach()
    
    def train_model(self, train_loader, val_loader=None, verbose=True):
        """Train GP model with validation and early stopping

        Optimises the negative variational ELBO with Adam. When ``val_loader``
        is given, the validation loss drives a ``ReduceLROnPlateau`` schedule,
        early stopping (10 epochs without improvement) and restoration of the
        best parameters at the end.

        Args:
            train_loader: Iterable of ``(features, labels)`` batches exposing
                ``.dataset`` (its length is the ELBO's ``num_data``).
            val_loader: Optional iterable of ``(features, labels)`` batches.
            verbose: Print progress every 10 epochs and every learning-rate
                reduction.

        Side effects:
            Appends one metrics dict per epoch to ``self.metrics['training']``
            and stores the best parameters in ``self.best_model_state``.

        NOTE(review): the validation loss uses the ELBO object built with the
        training-set size as ``num_data``, so its scale is not comparable to
        the training loss; only its trend is used here.
        """
        self.model.train()
        self.likelihood.train()

        optimizer = optim.Adam([
            {'params': self.model.parameters()},
            {'params': self.likelihood.parameters()}
        ], lr=self.config['learning_rate'])

        # The scheduler's `verbose` argument is deprecated in recent PyTorch,
        # so learning-rate reductions are reported manually below.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=5, factor=0.5
        )

        mll = VariationalELBO(self.likelihood, self.model,
                             num_data=len(train_loader.dataset))

        best_val_loss = np.inf
        # Snapshot taken in THIS call only, so a stale state left behind by an
        # earlier call is never reloaded
        best_state = None
        patience_counter = 0

        for epoch in range(self.config['epochs']):
            epoch_metrics = {
                'train_loss': 0,
                'train_acc': 0,
                'val_loss': 0,
                'val_acc': 0
            }

            # Training loop
            for batch_x, batch_y in train_loader:
                batch_x = batch_x.to(self.device)
                batch_y = batch_y.to(self.device).float()

                optimizer.zero_grad()
                output = self.model(batch_x)
                loss = -mll(output, batch_y)
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                optimizer.step()

                epoch_metrics['train_loss'] += loss.item()

                # Compute accuracy
                with torch.no_grad():
                    predicted = self.likelihood(output).mean.round()
                    epoch_metrics['train_acc'] += (predicted == batch_y).float().mean().item()

            # Normalize metrics
            epoch_metrics['train_loss'] /= len(train_loader)
            epoch_metrics['train_acc'] /= len(train_loader)

            stop_early = False

            # Validation
            if val_loader is not None:
                self.model.eval()
                self.likelihood.eval()

                with torch.no_grad():
                    for batch_x, batch_y in val_loader:
                        batch_x = batch_x.to(self.device)
                        batch_y = batch_y.to(self.device).float()

                        output = self.model(batch_x)
                        loss = -mll(output, batch_y)

                        epoch_metrics['val_loss'] += loss.item()

                        predicted = self.likelihood(output).mean.round()
                        epoch_metrics['val_acc'] += (predicted == batch_y).float().mean().item()

                epoch_metrics['val_loss'] /= len(val_loader)
                epoch_metrics['val_acc'] /= len(val_loader)

                # Learning rate scheduling
                lr_before = optimizer.param_groups[0]['lr']
                scheduler.step(epoch_metrics['val_loss'])
                lr_after = optimizer.param_groups[0]['lr']
                if verbose and lr_after < lr_before:
                    print(f"Epoch {epoch + 1}: reducing learning rate to {lr_after:.4e}")

                # Early stopping
                if epoch_metrics['val_loss'] < best_val_loss:
                    best_val_loss = epoch_metrics['val_loss']
                    patience_counter = 0
                    # state_dict() returns references to the live tensors, so
                    # the values must be cloned; otherwise the "best" state
                    # silently follows later optimisation steps.
                    best_state = {
                        key: value.clone() if torch.is_tensor(value) else value
                        for key, value in self.model.state_dict().items()
                    }
                    self.best_model_state = best_state
                else:
                    patience_counter += 1
                    stop_early = patience_counter >= 10

                self.model.train()
                self.likelihood.train()

            # Record the epoch before a possible early stop, so the last epoch is kept
            self.metrics['training'].append(epoch_metrics)

            if stop_early:
                print(f"Early stopping at epoch {epoch + 1}")
                break

            if verbose and (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1}/{self.config['epochs']}: "
                      f"Train Loss={epoch_metrics['train_loss']:.4f}, "
                      f"Train Acc={epoch_metrics['train_acc']:.4f}, "
                      f"Val Loss={epoch_metrics['val_loss']:.4f}, "
                      f"Val Acc={epoch_metrics['val_acc']:.4f}")

        # Load best model
        if best_state is not None:
            self.model.load_state_dict(best_state)
    
    def compute_uncertainty_metrics(self, X: torch.Tensor) -> Dict:
        """Compute comprehensive uncertainty measures (from paper)

        The model and likelihood are switched to eval mode and evaluated on
        ``X`` without gradients.

        Returns:
            Dict of tensors:
              'mean', 'variance', 'std' - moments of the likelihood's predictive distribution
              'entropy'   - binary entropy of 'mean'
              'epistemic' - variance of the latent GP output
              'aleatoric' - 'variance' minus 'epistemic', clamped at 0
              'total'     - alias of 'variance' (the quantity split above)
              'confidence'- 1 / (1 + std)

        NOTE(review): 'epistemic' is a latent-space variance while 'variance'
        is the variance of the likelihood output, so their difference mixes
        two scales. Confirm against the paper's definition.
        """
        self.model.eval()
        self.likelihood.eval()

        with torch.no_grad(), gpytorch.settings.fast_pred_var():
            output = self.model(X)
            pred_dist = self.likelihood(output)

            # Predictive statistics
            mean = pred_dist.mean
            variance = pred_dist.variance
            std = torch.sqrt(variance + 1e-6)

            # Entropy (Equation 24 from paper); named so it does not shadow
            # the `entropy` function imported from scipy.stats
            predictive_entropy = (
                -mean * torch.log(mean + 1e-6)
                - (1 - mean) * torch.log(1 - mean + 1e-6)
            )

            # Epistemic uncertainty (model uncertainty)
            epistemic = output.variance

            # Aleatoric uncertainty (data uncertainty)
            aleatoric = variance - epistemic
            aleatoric = torch.clamp(aleatoric, min=0)  # Ensure non-negative

            # Confidence
            confidence = 1 / (1 + std)

        return {
            'mean': mean,
            'variance': variance,
            'std': std,
            'entropy': predictive_entropy,
            'epistemic': epistemic,
            'aleatoric': aleatoric,
            # CloudDriftDetector.detect_drift reads uncertainties['total'];
            # without this key, passing this dict to it raises KeyError.
            'total': variance,
            'confidence': confidence
        }
    
    def adaptive_anomaly_detection(self, X: torch.Tensor, metadata: pd.DataFrame = None) -> Dict:
        """
        Uncertainty-calibrated anomaly detection (from paper Section 4.4)
        
        Args:
            X: Input features
            metadata: Optional metadata with timestamps, sources, etc.
        """
        uncertainties = self.compute_uncertainty_metrics(X)
        
        # Compute anomaly score (Equation 23 from paper)
        if self.baseline_stats is not None:
            deviation = torch.abs(uncertainties['mean'] - self.baseline_stats['mean'])
        else:
            deviation = uncertainties['mean']
        
        normalized_score = deviation / (uncertainties['std'] + 0.1)
        
        # Add entropy component
        if self.config['entropy_weight'] > 0:
            uncertainty_score = normalized_score + self.config['entropy_weight'] * uncertainties['entropy']
        else:
            uncertainty_score = normalized_score
        
        # Adaptive threshold (Equation 26 from paper)
        if self.config['adaptive_threshold']:
            base_threshold = 0.5
            threshold = base_threshold + self.config['uncertainty_weight'] * uncertainties['std']
            
            # Cloud-specific adjustments
            if metadata is not None and 'source' in metadata.columns:
                # Adjust threshold based on cloud provider
                provider_adjustments = {
                    'containers': 0.1,
                    'edge_iot': 0.15,
                    'microsoft': 0.05
                }
                
                for provider, adjustment in provider_adjustments.items():
                    if provider in metadata['source'].values:
                        threshold = threshold + adjustment
        else:
            threshold = torch.ones_like(uncertainties['mean']) * 0.5
        
        # Detection decisions
        detections = uncertainty_score > threshold
        
        return {
            'detections': detections,
            'scores': uncertainty_score,
            'threshold': threshold,
            'uncertainties': uncertainties,
            'components': {
                'deviation': deviation,
                'normalized': normalized_score,
                'entropy_weighted': uncertainty_score
            }
        }
    
    def evaluate_calibration(self, X: torch.Tensor, y: torch.Tensor, n_bins: int = 10) -> Dict:
        """Evaluate prediction calibration (Expected Calibration Error)"""
        uncertainties = self.compute_uncertainty_metrics(X)
        predictions = uncertainties['mean'].round()
        confidences = uncertainties['confidence']
        
        # Compute ECE
        ece = 0
        mce = 0
        bin_boundaries = torch.linspace(0, 1, n_bins + 1)
        reliability_data = []
        
        for i in range(n_bins):
            mask = (confidences >= bin_boundaries[i]) & (confidences < bin_boundaries[i+1])
            if mask.sum() > 0:
                bin_acc = (predictions[mask] == y[mask].float()).float().mean()
                bin_conf = confidences[mask].mean()
                bin_weight = mask.float().mean()
                
                ece += bin_weight * torch.abs(bin_acc - bin_conf)
                mce = max(mce, torch.abs(bin_acc - bin_conf).item())
                
                reliability_data.append({
                    'confidence': bin_conf.item(),
                    'accuracy': bin_acc.item(),
                    'count': mask.sum().item()
                })
        
        # Brier Score
        brier_score = ((uncertainties['mean'] - y.float())**2).mean().item()
        
        return {
            'ece': ece.item(),
            'mce': mce,
            'brier_score': brier_score,
            'reliability_diagram': reliability_data
        }
    
    def incremental_update(self, X_new: torch.Tensor, y_new: torch.Tensor):
        """Online learning update (from paper Section 4.6)

        Runs a single pass of Adam (lr=0.001) over ``X_new`` / ``y_new`` in
        mini-batches of 32, optimising the negative variational ELBO with
        ``num_data=len(X_new)``. The model and likelihood are left in eval mode.

        Args:
            X_new: New feature rows.
            y_new: Labels for ``X_new`` (cast to float).
        """
        self.model.train()
        self.likelihood.train()

        # Create mini-batch
        dataset = torch.utils.data.TensorDataset(X_new, y_new.float())
        loader = torch.utils.data.DataLoader(dataset, batch_size=32)

        optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        mll = VariationalELBO(self.likelihood, self.model, num_data=len(X_new))

        # Single epoch update
        for batch_x, batch_y in loader:
            # Same device handling as train_model, so CPU batches work with a GPU model
            batch_x = batch_x.to(self.device)
            batch_y = batch_y.to(self.device)

            optimizer.zero_grad()
            output = self.model(batch_x)
            loss = -mll(output, batch_y)
            loss.backward()
            optimizer.step()

        self.model.eval()
        self.likelihood.eval()

class CloudDriftDetector:
    """Detect concept drift in cloud environments.

    Two sliding windows of per-call summaries are maintained:
    ``current_window`` holds the most recent ``window_size`` calls and
    ``reference_window`` holds up to ``window_size`` summaries evicted from
    it. Once each window holds at least ``max(1, window_size // 2)`` entries,
    the per-call mean total uncertainties of the two windows are compared.
    """

    def __init__(self, window_size: int = 1000, threshold: float = 0.05):
        # Local import: deque is not among the file's top-level imports.
        from collections import deque

        # Maximum number of per-call summaries kept in each window.
        self.window_size = window_size
        # Drift is reported when the combined drift score exceeds this value.
        self.threshold = threshold
        # Deques give O(1) eviction of the oldest entry (list.pop(0) is O(n)).
        self.reference_window = deque()
        self.current_window = deque()
        # One record per detected drift event.
        self.drift_history = []

    def detect_drift(self, features: torch.Tensor, predictions: torch.Tensor,
                     uncertainties: Dict) -> Tuple[bool, float]:
        """Detect concept drift using uncertainty and distribution changes.

        Args:
            features: Feature tensor for this call (stored as a NumPy array).
            predictions: Prediction tensor for this call (stored likewise).
            uncertainties: Mapping with a ``'total'`` tensor; its mean is the
                single uncertainty value recorded for this call.

        Returns:
            ``(drift_detected, drift_score)`` as a plain ``bool`` and
            ``float``. ``(False, 0.0)`` is returned while either window is
            still too small to compare.
        """
        # Update windows. detach() lets tensors that still track gradients
        # be converted to NumPy.
        self.current_window.append({
            'features': features.detach().cpu().numpy(),
            'predictions': predictions.detach().cpu().numpy(),
            'uncertainty': uncertainties['total'].mean().item(),
            'timestamp': datetime.now()
        })

        # Oldest "current" entry graduates to the reference window; the
        # reference window in turn drops its own oldest entry when full.
        if len(self.current_window) > self.window_size:
            self.reference_window.append(self.current_window.popleft())
            if len(self.reference_window) > self.window_size:
                self.reference_window.popleft()

        # Require at least one sample per window: ks_2samp raises on empty
        # input, which a bare ``window_size // 2`` gate allows for sizes 0/1.
        min_samples = max(1, self.window_size // 2)
        if len(self.reference_window) < min_samples or \
           len(self.current_window) < min_samples:
            return False, 0.0

        # Compare uncertainty distributions
        ref_uncertainty = np.array([w['uncertainty'] for w in self.reference_window])
        curr_uncertainty = np.array([w['uncertainty'] for w in self.current_window])

        # KS test for distribution shift
        ks_stat, p_value = ks_2samp(ref_uncertainty, curr_uncertainty)
        ks_stat = float(ks_stat)
        p_value = float(p_value)

        # Uncertainty ratio (epsilon guards against a zero reference mean)
        uncertainty_ratio = float(
            curr_uncertainty.mean() / (ref_uncertainty.mean() + 1e-6)
        )

        # Combined drift score: only an *increase* in uncertainty contributes
        drift_score = ks_stat * 0.5 + max(0.0, uncertainty_ratio - 1) * 0.5

        # Plain Python bool/float so results match the annotation and
        # serialise cleanly.
        drift_detected = bool(drift_score > self.threshold)

        if drift_detected:
            self.drift_history.append({
                'timestamp': datetime.now(),
                'score': drift_score,
                'ks_stat': ks_stat,
                'p_value': p_value,
                'uncertainty_ratio': uncertainty_ratio
            })

        return drift_detected, drift_score

# ============================================================================
# SECTION 4: COMPREHENSIVE EXPERIMENT PIPELINE
# ============================================================================

class ICS3DExperimentPipeline:
    """Complete experimental pipeline for ICS3D dataset.

    Loads and preprocesses the ICS3D data, trains the cloud-security GP
    detector, reports detection / calibration / false-positive metrics,
    renders a summary figure and writes the metrics to a JSON file inside
    ``output_dir``.
    """

    def __init__(self, output_dir: str = './ics3d_results'):
        self.output_dir = Path(output_dir)
        # parents=True so a nested output path does not fail on a missing parent
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Timestamped id used as the prefix of every file this run writes
        self.experiment_id = f"ics3d_gp_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        self.results = {}

    @staticmethod
    def _uncertainty_fp_reduction(fp_mask, high_uncertainty) -> float:
        """Return the share of false positives flagged as high-uncertainty.

        These are the false alarms that would be removed if high-uncertainty
        detections were deferred instead of raised, i.e. the achievable
        false-positive reduction. Returns 0.0 when there are no false
        positives.
        """
        fp_mask = np.asarray(fp_mask, dtype=bool)
        high_uncertainty = np.asarray(high_uncertainty, dtype=bool)
        fp_count = int(fp_mask.sum())
        if fp_count == 0:
            return 0.0
        return float((fp_mask & high_uncertainty).sum() / fp_count)

    @staticmethod
    def _reliability_bins(confidences, predictions, y_true, n_bins: int = 10):
        """Bin samples by confidence and return per-bin centers and accuracies.

        Bins are equal-width over [0, 1]. Every bin is half-open
        ``[lower, upper)`` except the last, which also includes 1.0 so that
        fully confident samples are not dropped. Empty bins are omitted.
        """
        confidences = np.asarray(confidences)
        predictions = np.asarray(predictions)
        y_true = np.asarray(y_true)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_centers = []
        bin_accuracies = []
        for i, (lower, upper) in enumerate(zip(bin_boundaries[:-1], bin_boundaries[1:])):
            if i == n_bins - 1:
                mask = (confidences >= lower) & (confidences <= upper)
            else:
                mask = (confidences >= lower) & (confidences < upper)
            if mask.any():
                bin_centers.append(float((lower + upper) / 2))
                bin_accuracies.append(float((predictions[mask] == y_true[mask]).mean()))
        return bin_centers, bin_accuracies

    def run_complete_experiment(self, use_unified: bool = True):
        """Run complete experimental evaluation on ICS3D.

        Args:
            use_unified: When True, train on the unified dataset built by the
                loader; otherwise use the ``'microsoft_train'`` dataset only.

        Returns:
            The results dict (also stored on ``self.results`` and written to
            ``<output_dir>/<experiment_id>_results.json``) with the keys
            ``'metrics'``, ``'calibration'``, ``'experiment_id'`` and
            ``'timestamp'``.
        """

        print("\n" + "="*70)
        print("ICS3D GAUSSIAN PROCESS UNCERTAINTY-AWARE DETECTION")
        print("COMPLETE EXPERIMENTAL PIPELINE")
        print("="*70)

        # Step 1: Load and prepare data
        print("\n[Step 1] Loading ICS3D datasets...")
        loader = ICS3DDatasetLoader()
        loader.load_all_datasets()

        # Process each dataset
        for name in loader.loaded_data:
            loader.preprocess_dataset(name)

        # Create unified dataset
        if use_unified:
            X, y, metadata = loader.create_unified_dataset()
        else:
            # Use largest dataset
            name = 'microsoft_train'  # or choose specific dataset
            data = loader.preprocessed_data[name]
            X, y, metadata = data['X'], data['y'], data['metadata']

        # Step 2: Prepare data
        print("\n[Step 2] Preparing data...")

        # Split data
        X_train, X_test, y_train, y_test, meta_train, meta_test = train_test_split(
            X, y, metadata, test_size=0.2, random_state=42, stratify=y
        )

        # Use plain arrays for labels so masks below are purely positional
        # (the split shuffles rows, so any pandas index is no longer 0..n-1).
        y_train = np.asarray(y_train)
        y_test = np.asarray(y_test)

        # Normalize (statistics are fitted on the training rows only)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Convert to tensors
        X_train_tensor = torch.FloatTensor(X_train).to(device)
        y_train_tensor = torch.FloatTensor(y_train).to(device)
        X_test_tensor = torch.FloatTensor(X_test).to(device)
        y_test_tensor = torch.FloatTensor(y_test).to(device)

        print(f"  Training samples: {len(X_train):,}")
        print(f"  Test samples: {len(X_test):,}")
        print(f"  Features: {X_train.shape[1]}")
        print(f"  Attack rate (train): {y_train.mean():.2%}")
        print(f"  Attack rate (test): {y_test.mean():.2%}")

        # Hold out the last 10% of the (already shuffled) training rows for
        # validation so the validation loader never sees rows the model is
        # fitted on. max(1, ...) avoids the ``[-0:]`` slice selecting all rows.
        n_train = len(X_train_tensor)
        val_size = max(1, int(0.1 * n_train))
        fit_size = n_train - val_size
        X_fit_tensor, y_fit_tensor = X_train_tensor[:fit_size], y_train_tensor[:fit_size]
        X_val_tensor, y_val_tensor = X_train_tensor[fit_size:], y_train_tensor[fit_size:]

        # Step 3: Initialize detector
        print("\n[Step 3] Initializing Cloud Security GP...")

        detector = CloudUncertaintyDetector(
            feature_dim=X_train.shape[1],
            cloud_provider='multi',
            config={
                'num_inducing': min(500, X_train.shape[0] // 20),
                'epochs': 30,
                'batch_size': 256,
                'adversarial_training': True
            }
        )

        # Select adversarial inducing points (from fitted rows only)
        inducing_points = detector.select_adversarial_inducing_points(
            X_fit_tensor[:5000],
            y_fit_tensor[:5000],
            epsilon=0.1,
            iterations=10
        )

        # Initialize model
        detector.model = CloudSecurityGP(
            inducing_points=inducing_points,
            feature_dim=X_train.shape[1],
            cloud_provider='multi'
        ).to(device)

        # Step 4: Train model
        print("\n[Step 4] Training model...")
        print(f"  Validation hold-out: {val_size:,}")

        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(X_fit_tensor, y_fit_tensor)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=256, shuffle=True
        )

        val_dataset = torch.utils.data.TensorDataset(X_val_tensor, y_val_tensor)
        val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=256, shuffle=False
        )

        # Train
        detector.train_model(train_loader, val_loader, verbose=True)

        # Step 5: Evaluate
        print("\n[Step 5] Evaluating model...")

        # Get predictions
        results = detector.adaptive_anomaly_detection(X_test_tensor, meta_test)

        predictions = results['detections'].detach().cpu().numpy()
        scores = results['scores'].detach().cpu().numpy()
        uncertainties = results['uncertainties']

        # Compute metrics
        from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

        accuracy = accuracy_score(y_test, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, predictions, average='binary', zero_division=0
        )

        # AUC is undefined with a single class in y_test; report 0.0 then
        if len(np.unique(y_test)) > 1:
            auc = roc_auc_score(y_test, scores)
        else:
            auc = 0.0

        print(f"\n  DETECTION METRICS:")
        print(f"    Accuracy: {accuracy:.4f}")
        print(f"    Precision: {precision:.4f}")
        print(f"    Recall: {recall:.4f}")
        print(f"    F1-Score: {f1:.4f}")
        print(f"    AUC-ROC: {auc:.4f}")

        # Calibration metrics
        calibration = detector.evaluate_calibration(X_test_tensor, y_test_tensor)

        print(f"\n  CALIBRATION METRICS:")
        print(f"    ECE: {calibration['ece']:.4f}")
        print(f"    MCE: {calibration['mce']:.4f}")
        print(f"    Brier Score: {calibration['brier_score']:.4f}")

        # False positive analysis
        negatives = (y_test == 0)
        fp_mask = (predictions == 1) & negatives
        fp_count = int(fp_mask.sum())
        n_negatives = int(negatives.sum())
        fp_rate = fp_count / n_negatives if n_negatives > 0 else 0.0

        # Uncertainty-based FP reduction: the share of false positives whose
        # entropy is above the median, i.e. the false alarms that deferring
        # high-uncertainty detections would remove.
        entropy_values = uncertainties['entropy']
        high_uncertainty = (entropy_values > entropy_values.median()).cpu().numpy()
        fp_reduction = self._uncertainty_fp_reduction(fp_mask, high_uncertainty)

        print(f"\n  FALSE POSITIVE ANALYSIS:")
        print(f"    FP Rate: {fp_rate:.2%}")
        print(f"    FP Count: {fp_count}")
        print(f"    FP Reduction (uncertainty): {fp_reduction:.1%}")

        # Per-provider analysis
        if 'source' in meta_test.columns:
            print(f"\n  PER-PROVIDER PERFORMANCE:")
            for source in meta_test['source'].unique():
                # Positional boolean mask, independent of the pandas index
                source_mask = (meta_test['source'] == source).to_numpy()
                if source_mask.sum() > 0:
                    source_acc = accuracy_score(
                        y_test[source_mask],
                        predictions[source_mask]
                    )
                    print(f"    {source}: {source_acc:.4f}")

        # Step 6: Visualization
        print("\n[Step 6] Generating visualizations...")
        self.visualize_results(detector, X_test_tensor[:1000], y_test_tensor[:1000],
                               meta_test.iloc[:1000])

        # Step 7: Save results (plain floats so JSON holds numbers, not the
        # stringified fallback of ``default=str``)
        self.results = {
            'metrics': {
                'accuracy': float(accuracy),
                'precision': float(precision),
                'recall': float(recall),
                'f1': float(f1),
                'auc': float(auc),
                'fp_rate': float(fp_rate),
                'fp_reduction': float(fp_reduction)
            },
            'calibration': calibration,
            'experiment_id': self.experiment_id,
            'timestamp': datetime.now().isoformat()
        }

        # Save to file
        results_file = self.output_dir / f"{self.experiment_id}_results.json"
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, default=str)

        print(f"\n[Step 7] Results saved to {results_file}")

        return self.results

    def visualize_results(self, detector, X_test, y_test, metadata):
        """Generate comprehensive visualizations.

        Runs the detector on ``X_test`` and draws seven panels: predictions
        with a +/- 2 std band, epistemic/aleatoric uncertainty, predictive
        entropy, score histograms per class, a calibration plot, the ROC
        curve and the confusion matrix. The figure is saved to
        ``<output_dir>/<experiment_id>_visualization.png``, shown, and closed.

        Args:
            detector: Object exposing ``adaptive_anomaly_detection``.
            X_test: Feature tensor to visualise.
            y_test: Label tensor aligned with ``X_test``.
            metadata: Metadata rows aligned with ``X_test``.
        """

        fig = plt.figure(figsize=(20, 12))
        gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

        # Get predictions
        results = detector.adaptive_anomaly_detection(X_test, metadata)
        uncertainties = results['uncertainties']

        y_true = y_test.detach().cpu().numpy()
        scores = results['scores'].detach().cpu().numpy()
        predictions = results['detections'].detach().cpu().numpy()

        # 1. Predictions with uncertainty
        ax1 = fig.add_subplot(gs[0, :])

        time_index = np.arange(len(X_test))
        mean = uncertainties['mean'].detach().cpu().numpy()
        std = uncertainties['std'].detach().cpu().numpy()

        ax1.plot(time_index, mean, 'b-', alpha=0.7, label='Prediction')
        ax1.fill_between(time_index, mean - 2*std, mean + 2*std,
                         alpha=0.3, color='blue', label='95% CI')

        # Mark true anomalies
        anomaly_mask = y_true == 1
        if anomaly_mask.any():
            ax1.scatter(time_index[anomaly_mask], mean[anomaly_mask],
                       c='red', marker='x', s=50, label='True Anomalies', alpha=0.5)

        ax1.set_xlabel('Sample Index')
        ax1.set_ylabel('Anomaly Probability')
        ax1.set_title('GP Predictions with Uncertainty Quantification')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 2. Uncertainty decomposition
        ax2 = fig.add_subplot(gs[1, 0])

        epistemic = uncertainties['epistemic'].detach().cpu().numpy()
        aleatoric = uncertainties['aleatoric'].detach().cpu().numpy()

        ax2.plot(time_index, epistemic, 'g-', alpha=0.7, label='Epistemic')
        ax2.plot(time_index, aleatoric, 'orange', alpha=0.7, label='Aleatoric')

        ax2.set_xlabel('Sample Index')
        ax2.set_ylabel('Uncertainty')
        ax2.set_title('Uncertainty Decomposition')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        # 3. Entropy (named entropy_values to avoid shadowing scipy's entropy)
        ax3 = fig.add_subplot(gs[1, 1])

        entropy_values = uncertainties['entropy'].detach().cpu().numpy()
        ax3.plot(time_index, entropy_values, 'purple', alpha=0.7)
        ax3.fill_between(time_index, 0, entropy_values, alpha=0.3, color='purple')

        ax3.set_xlabel('Sample Index')
        ax3.set_ylabel('Entropy')
        ax3.set_title('Predictive Entropy')
        ax3.grid(True, alpha=0.3)

        # 4. Score distribution
        ax4 = fig.add_subplot(gs[1, 2])

        normal_scores = scores[y_true == 0]
        anomaly_scores = scores[y_true == 1]

        if len(normal_scores) > 0:
            ax4.hist(normal_scores, bins=30, alpha=0.5, label='Normal', density=True)
        if len(anomaly_scores) > 0:
            ax4.hist(anomaly_scores, bins=30, alpha=0.5, label='Anomaly', density=True)

        ax4.set_xlabel('Anomaly Score')
        ax4.set_ylabel('Density')
        ax4.set_title('Score Distribution by Class')
        ax4.legend()
        ax4.grid(True, alpha=0.3)

        # 5. Calibration plot
        ax5 = fig.add_subplot(gs[2, 0])

        # Simple calibration plot
        ax5.plot([0, 1], [0, 1], 'k--', alpha=0.3, label='Perfect Calibration')

        # Bin predictions (last bin is closed so confidence == 1.0 is kept)
        confidences = uncertainties['confidence'].detach().cpu().numpy()
        bin_centers, bin_accuracies = self._reliability_bins(
            confidences, predictions, y_true, n_bins=10
        )

        if bin_centers:
            ax5.plot(bin_centers, bin_accuracies, 'bo-', label='Model')

        ax5.set_xlabel('Confidence')
        ax5.set_ylabel('Accuracy')
        ax5.set_title('Calibration Plot')
        ax5.legend()
        ax5.grid(True, alpha=0.3)

        # 6. ROC curve
        ax6 = fig.add_subplot(gs[2, 1])

        from sklearn.metrics import roc_curve, auc
        if len(np.unique(y_true)) > 1:
            fpr, tpr, _ = roc_curve(y_true, scores)
            roc_auc = auc(fpr, tpr)

            ax6.plot(fpr, tpr, 'b-', linewidth=2, label=f'ROC (AUC = {roc_auc:.3f})')
            ax6.plot([0, 1], [0, 1], 'k--', alpha=0.3)
            # Only draw a legend when there is a labelled curve to describe
            ax6.legend()

        ax6.set_xlabel('False Positive Rate')
        ax6.set_ylabel('True Positive Rate')
        ax6.set_title('ROC Curve')
        ax6.grid(True, alpha=0.3)

        # 7. Confusion matrix
        ax7 = fig.add_subplot(gs[2, 2])

        cm = confusion_matrix(y_true, predictions)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax7)
        ax7.set_xlabel('Predicted')
        ax7.set_ylabel('Actual')
        ax7.set_title('Confusion Matrix')

        plt.suptitle('ICS3D GP Uncertainty-Aware Detection Results', fontsize=16, y=1.02)
        plt.tight_layout()

        # Save figure
        fig_path = self.output_dir / f"{self.experiment_id}_visualization.png"
        fig.savefig(fig_path, dpi=150, bbox_inches='tight')
        plt.show()
        # Release the figure so repeated runs do not accumulate open figures
        plt.close(fig)

        print(f"  Visualization saved to {fig_path}")

# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Run the full ICS3D experiment and print a short results summary.

    Builds an ``ICS3DExperimentPipeline`` with its default output directory,
    runs it on the unified dataset, prints the headline metrics and returns
    the results dict produced by the pipeline.
    """
    rule = "=" * 80

    print("\n" + rule)
    print("GAUSSIAN PROCESS UNCERTAINTY-AWARE DETECTION")
    print("INTEGRATED CLOUD SECURITY 3-DATASETS (ICS3D)")
    print(rule)

    # Run the complete experiment on the unified dataset
    results = ICS3DExperimentPipeline().run_complete_experiment(use_unified=True)

    print("\n" + rule)
    print("EXPERIMENT COMPLETE")
    print(rule)

    print("\nFINAL RESULTS SUMMARY:")
    metrics = results['metrics']
    print(f"  Detection F1-Score: {metrics['f1']:.4f}")
    print(f"  AUC-ROC: {metrics['auc']:.4f}")
    print(f"  False Positive Reduction: {metrics['fp_reduction']:.1%}")
    print(f"  Calibration Error (ECE): {results['calibration']['ece']:.4f}")

    return results

if __name__ == "__main__":
    import os

    # Environment setup: requested for better CUDA error messages.
    # NOTE(review): this is assigned after torch has already been imported at
    # the top of the file — confirm the setting is still picked up here.
    os.environ.update(CUDA_LAUNCH_BLOCKING='1')

    # Run the main experiment; the summary stays available at module scope
    results = main()
