# Differentially Private Optimal Transport for Multi-Cloud Intrusion Detection
## A Privacy-Preserving Domain Adaptation Framework
## Q1 Journal Version - Complete Implementation
## Author: Roger Nick Anaedevha
### Corresponding Paper: NotP4_v3c.tex

## Section 1: Imports and Environment Setup

In [None]:
# Core scientific computing
import numpy as np
import pandas as pd
from scipy import stats
from scipy.spatial.distance import cdist

# PyTorch ecosystem
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Subset
from torch.autograd import Variable

# Machine learning utilities
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    precision_recall_curve, roc_curve
)

# Optimal transport libraries
import ot  # Python Optimal Transport (POT)
import geomloss  # GeomLoss for Sinkhorn divergences

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec
import seaborn as sns

# Utilities
import os
import sys
import time
import warnings
import kagglehub
from tqdm import tqdm
from collections import defaultdict, OrderedDict
from typing import Tuple, List, Dict, Optional, Union
import pickle
import json

# Silence all library warnings so notebook output stays readable
warnings.filterwarnings('ignore')

# Global plot appearance
sns.set_palette("husl") if False else plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Reproducibility: deterministic cuDNN kernels plus one shared seed for
# NumPy, PyTorch (CPU) and PyTorch (CUDA)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)

# Pick the compute device once and report what was selected
_cuda_ready = torch.cuda.is_available()
device = torch.device('cuda' if _cuda_ready else 'cpu')
print(f"Using device: {device}")
if _cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Make sure every output directory exists before anything is written
for _out_dir in ('outputs', 'outputs/figures', 'outputs/tables', 'outputs/models'):
    os.makedirs(_out_dir, exist_ok=True)

_rule = "="*80
print("\n" + _rule)
print("Environment Setup Complete")
print(_rule)

## Section 2: Data Loading and Preprocessing
### Implementation of the 6-step preprocessing pipeline from the paper (Lines 607-620)

In [None]:
class ICS3DDataLoader:
    """Loader and preprocessor for the Integrated Cloud Security 3Datasets (ICS3D).

    Each ``load_*`` method reads one CSV from ``self.path`` and, unless
    ``return_raw=True``, returns ``(features, labels)`` where ``features`` is a
    2-D NumPy array of the numeric columns and ``labels`` is a 1-D array.

    Preprocessing performed by this class:
    1. Identifier removal (a fixed list of columns per dataset)
    2. Label extraction (the label column is removed from the features)
    3. ``±inf`` values replaced by NaN
    4. Winsorization of numeric columns at the 0.1% / 99.9% quantiles
       (Edge-IIoT and Containers only)
    5. Missing-value imputation: column median for Edge-IIoT / Containers,
       0 for GUIDE
    6. Non-numeric columns are dropped

    Not performed here: temporal feature extraction, standardization,
    categorical encoding and the 70/15/15 temporal split. ``self.scalers``
    and ``self.encoders`` are created empty and are not populated by any
    method of this class.

    NOTE(review): only the single selected label column is removed. If a CSV
    carries a second label-like numeric column (for example a binary attack
    flag next to a multi-class attack type), it stays in the feature matrix
    — confirm against the actual column list.
    """

    # Lower / upper quantiles used to clip (winsorize) numeric columns.
    _WINSOR_QUANTILES = (0.001, 0.999)

    def __init__(self, dataset_path: Optional[str] = None, download: bool = True):
        """Resolve the dataset directory.

        Args:
            dataset_path: Directory holding the CSV files. When falsy and no
                download is performed, ``"./data"`` is used.
            download: When True and ``dataset_path`` is None, fetch the dataset
                through ``kagglehub`` and use the returned path.
        """
        if dataset_path is None and download:
            print("Downloading ICS3D from Kaggle...")
            self.path = kagglehub.dataset_download(
                "rogernickanaedevha/integrated-cloud-security-3datasets-ics3d"
            )
        else:
            self.path = dataset_path if dataset_path else "./data"

        print(f"Dataset path: {self.path}")
        self.scalers = {}
        self.encoders = {}

    # ------------------------------------------------------------------
    # Public loaders
    # ------------------------------------------------------------------
    def load_edge_iiot(self, variant='DNN', return_raw=False):
        """Load the Edge-IIoTset dataset.

        Dataset characteristics (as stated by the notebook author; not
        checked by this code):
        - DNN variant: 236,748 samples, 61 features
        - ML variant: 187,562 samples, 48 features
        - Attack types: DoS/DDoS, reconnaissance, MitM, injection, malware
        - Domain: IoT/IIoT seven-layer architecture

        Args:
            variant: ``'DNN'`` selects ``DNN-EdgeIIoT-dataset.csv``; any other
                value selects ``ML-EdgeIIoT-dataset.csv``.
            return_raw: Return the unprocessed DataFrame instead of arrays.

        Returns:
            The raw DataFrame, or ``(features, labels)`` NumPy arrays.
        """
        filename = 'DNN-EdgeIIoT-dataset.csv' if variant == 'DNN' else 'ML-EdgeIIoT-dataset.csv'
        df = self._read_csv(filename)
        if return_raw:
            return df
        return self._preprocess_edge_iiot(df, variant)

    def load_containers(self, return_raw=False):
        """Load the Kubernetes/containers dataset (``Containers_Dataset.csv``).

        Dataset characteristics (as stated by the notebook author; not
        checked by this code):
        - Samples: 157,329 flows
        - Features: 78 features per flow
        - Classes: 10 CVE-specific attacks + benign (67.3% benign)
        - Domain: Kubernetes microservices security

        Args:
            return_raw: Return the unprocessed DataFrame instead of arrays.

        Returns:
            The raw DataFrame, or ``(features, labels)`` NumPy arrays.
        """
        df = self._read_csv('Containers_Dataset.csv')
        if return_raw:
            return df
        return self._preprocess_containers(df)

    def load_microsoft_guide(self, split='train', return_raw=False):
        """Load the Microsoft GUIDE dataset.

        Dataset characteristics (as stated by the notebook author; not
        checked by this code):
        - Training: 589,437 incidents
        - Testing: 147,359 incidents
        - Coverage: 33 entity types, 441 MITRE ATT&CK techniques
        - Domain: Enterprise SOC from 6,100+ organizations

        Args:
            split: ``'train'`` selects ``Microsoft_GUIDE_Train.csv``; any other
                value selects ``Microsoft_GUIDE_Test.csv``.
            return_raw: Return the unprocessed DataFrame instead of arrays.

        Returns:
            The raw DataFrame, or ``(features, labels)`` NumPy arrays.
        """
        filename = 'Microsoft_GUIDE_Train.csv' if split == 'train' else 'Microsoft_GUIDE_Test.csv'
        df = self._read_csv(filename)
        if return_raw:
            return df
        return self._preprocess_guide(df)

    # ------------------------------------------------------------------
    # Per-dataset preprocessing
    # ------------------------------------------------------------------
    def _preprocess_edge_iiot(self, df: pd.DataFrame, variant: str):
        """Preprocess an Edge-IIoT frame into ``(features, labels)``.

        ``variant`` is accepted for interface compatibility; it does not
        affect the preprocessing.
        """
        print("  Preprocessing Edge-IIoT...")

        id_cols = ['ip.src', 'ip.dst', 'arp.src.proto_ipv4', 'arp.dst.proto_ipv4', 'flow_id']
        df = df.drop(columns=id_cols, errors='ignore')

        df, labels = self._split_labels(df, ('Attack_type', 'Label'))
        features = self._winsorize_and_impute(df)
        return self._report(features, labels)

    def _preprocess_containers(self, df: pd.DataFrame):
        """Preprocess a Containers frame into ``(features, labels)``."""
        print("  Preprocessing Containers...")

        id_cols = ['flow_id', 'src_ip', 'dst_ip', 'protocol']
        df = df.drop(columns=id_cols, errors='ignore')

        df, labels = self._split_labels(df, ('Label', 'label'))
        features = self._winsorize_and_impute(df)
        return self._report(features, labels)

    def _preprocess_guide(self, df: pd.DataFrame):
        """Preprocess a Microsoft GUIDE frame into ``(features, labels)``.

        Unlike the other datasets, GUIDE features are not winsorized and
        missing values are filled with 0 rather than the column median.
        """
        print("  Preprocessing GUIDE...")

        high_card_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DeviceId',
                          'DeviceName', 'AccountSid', 'AccountObjectId']
        df = df.drop(columns=high_card_cols, errors='ignore')

        df, labels = self._split_labels(df, ('IncidentGrade', 'Label'))

        df = df.replace([np.inf, -np.inf], np.nan)
        features = df.select_dtypes(include=[np.number]).fillna(0)
        return self._report(features, labels)

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _read_csv(self, filename: str) -> pd.DataFrame:
        """Read ``filename`` from the dataset directory and log its shape."""
        filepath = os.path.join(self.path, filename)
        print(f"Loading {filename}...")
        df = pd.read_csv(filepath, low_memory=False)
        print(f"  Shape: {df.shape}")
        return df

    @staticmethod
    def _split_labels(df: pd.DataFrame, candidates: Tuple[str, ...]):
        """Pop the first present candidate column and return ``(df, labels)``.

        When none of the candidate columns exists, the frame is returned
        unchanged together with an all-zero label vector.
        """
        for name in candidates:
            if name in df.columns:
                return df.drop(columns=[name]), df[name].values
        return df, np.zeros(len(df))

    def _winsorize_and_impute(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the numeric columns, clipped to the winsor quantiles and NaN-free."""
        df = df.replace([np.inf, -np.inf], np.nan)
        numeric = df.select_dtypes(include=[np.number]).copy()

        lower_q, upper_q = self._WINSOR_QUANTILES
        for col in numeric.columns:
            # Constant and all-NaN columns (std of 0 or NaN) have nothing to clip.
            if numeric[col].std() > 0:
                lower, upper = numeric[col].quantile([lower_q, upper_q])
                numeric[col] = numeric[col].clip(lower, upper)

        numeric = numeric.fillna(numeric.median())
        # A column with no observed value has a NaN median, so the median
        # fill leaves it untouched; fall back to 0 so no NaN is returned.
        return numeric.fillna(0.0)

    @staticmethod
    def _report(features: pd.DataFrame, labels):
        """Log feature count and label distribution; return ``(values, labels)``."""
        try:
            distribution = np.unique(labels, return_counts=True)
        except TypeError:
            # Labels mixing strings with missing values cannot be sorted by
            # np.unique; count them with pandas so logging never aborts the run.
            counts = pd.Series(labels).value_counts(dropna=False)
            distribution = (counts.index.to_numpy(), counts.to_numpy())

        print(f"    Final feature count: {features.shape[1]}")
        print(f"    Label distribution: {distribution}")
        return features.values, labels

# Smoke test: construct the loader without triggering a Kaggle download.
print("\nTesting data loading...")
try:
    # With download=False and no dataset_path, the constructor only records
    # "./data" as the dataset directory; no file is read at this point.
    # `loader` is left bound at module level.
    loader = ICS3DDataLoader(download=False)
    print("Data loader initialized successfully")
except Exception as e:
    # Best-effort: report the problem and keep the notebook running.
    print(f"Note: {e}")
    # NOTE(review): a loader built with download=False never downloads by
    # itself; this message presumably refers to a later
    # ICS3DDataLoader(download=True) call — confirm.
    print("Data will be downloaded when first accessed")

## Section 3: Core Optimal Transport Components
### Implementation of the Sinkhorn Algorithm with Adaptive Regularization Scheduling (Lines 278-297)

In [None]:
class AdaptiveSinkhornSolver:
    """Entropic optimal-transport solver with a geometric epsilon schedule.

    The solver runs Sinkhorn scaling iterations in stages. Each stage uses a
    fixed regularization strength ``epsilon``; after a stage, ``epsilon`` is
    multiplied by ``decay_rate`` and the scaling vectors of the previous
    stage are reused as the starting point. Stages continue while
    ``epsilon >= epsilon_min``, so the number of stages grows
    logarithmically with ``epsilon_init / epsilon_min``.

    Optional sparsification replaces the largest costs by ``inf`` so the
    corresponding plan entries are exactly zero. The matrices stay dense
    NumPy arrays, so this restricts the support of the plan but does not
    reduce the per-iteration cost.

    The kernel is computed directly as ``exp(-cost / epsilon)``; for costs
    much larger than ``epsilon`` its entries underflow to zero. Scale the
    cost matrix accordingly.

    ``history`` accumulates, per stage and across calls to :meth:`solve`,
    the epsilon used (``'epsilon'``), the last relative change of ``u``
    (``'error'``) and the number of iterations run (``'iterations'``).
    """

    def __init__(self,
                 epsilon_init: float = 0.5,
                 epsilon_min: float = 0.01,
                 decay_rate: float = 0.9,
                 max_iter: int = 100,
                 tol: float = 1e-6,
                 sparsify: bool = True,
                 sparsify_threshold: float = 0.95):
        """Store the schedule and stopping parameters.

        Args:
            epsilon_init: Regularization strength of the first stage.
            epsilon_min: Stages stop once the next epsilon would fall below it.
            decay_rate: Multiplicative epsilon decay between stages, in (0, 1).
            max_iter: Maximum Sinkhorn iterations per stage (at least 1).
            tol: Relative change of ``u`` below which a stage stops early.
            sparsify: Whether to mask high costs before solving.
            sparsify_threshold: Quantile of the cost entries above which
                costs are masked.

        Raises:
            ValueError: If the schedule could not terminate or no iteration
                could run (non-positive epsilon, ``decay_rate`` outside
                (0, 1), ``max_iter < 1``).
        """
        if epsilon_init <= 0 or epsilon_min <= 0:
            raise ValueError("epsilon_init and epsilon_min must be positive")
        if not 0.0 < decay_rate < 1.0:
            # decay_rate >= 1 would never reach epsilon_min.
            raise ValueError("decay_rate must lie strictly between 0 and 1")
        if max_iter < 1:
            raise ValueError("max_iter must be at least 1")

        self.epsilon_init = epsilon_init
        self.epsilon_min = epsilon_min
        self.decay_rate = decay_rate
        self.max_iter = max_iter
        self.tol = tol
        self.sparsify = sparsify
        self.sparsify_threshold = sparsify_threshold

        self.history = defaultdict(list)

    def solve(self, cost_matrix: np.ndarray,
              source_weights: np.ndarray = None,
              target_weights: np.ndarray = None) -> Tuple[np.ndarray, float]:
        """Solve entropic optimal transport with the staged Sinkhorn scheme.

        Args:
            cost_matrix: [n x m] cost matrix.
            source_weights: [n] non-negative source masses (uniform if None);
                normalized to sum to 1.
            target_weights: [m] non-negative target masses (uniform if None);
                normalized to sum to 1.

        Returns:
            transport_plan: [n x m] coupling ``diag(u) K diag(v)`` from the
                last stage.
            wasserstein_dist: transport cost ``sum(plan * cost)`` taken over
                the entries whose (possibly sparsified) cost is finite.

        Raises:
            ValueError: If ``cost_matrix`` is not a non-empty 2-D array or a
                weight vector has the wrong length or a non-positive sum.
        """
        cost_matrix = np.asarray(cost_matrix, dtype=float)
        if cost_matrix.ndim != 2 or cost_matrix.size == 0:
            raise ValueError("cost_matrix must be a non-empty 2-D array")
        n, m = cost_matrix.shape

        source_weights = self._normalize_weights(source_weights, n, "source_weights")
        target_weights = self._normalize_weights(target_weights, m, "target_weights")

        # Importance sparsification (Lines 565-574)
        if self.sparsify:
            cost_matrix = self._sparsify_cost_matrix(cost_matrix)

        epsilon = self.epsilon_init
        u = np.full(n, 1.0 / n)
        v = np.full(m, 1.0 / m)

        stage = 0
        # Always run at least one stage, so an epsilon_init below epsilon_min
        # still yields a plan instead of leaving the kernel undefined.
        while True:
            # Masked (inf) costs give kernel entries of exactly 0.
            K = np.exp(-cost_matrix / epsilon)

            for iteration in range(self.max_iter):
                u_prev = u

                # Alternating marginal projections; 1e-10 guards division by 0.
                u = source_weights / (K @ v + 1e-10)
                v = target_weights / (K.T @ u + 1e-10)

                err = np.linalg.norm(u - u_prev) / np.linalg.norm(u_prev)
                if err < self.tol:
                    break

            stage += 1
            # Record the epsilon this stage actually used and the iteration count.
            self.history['epsilon'].append(epsilon)
            self.history['error'].append(err)
            self.history['iterations'].append(iteration + 1)

            next_epsilon = epsilon * self.decay_rate
            if next_epsilon < self.epsilon_min:
                break
            epsilon = next_epsilon

        # Broadcasting is equivalent to diag(u) @ K @ diag(v) without
        # materializing the two dense diagonal matrices.
        transport_plan = u[:, None] * K * v[None, :]

        # Masked entries carry zero mass but infinite cost; 0 * inf is NaN,
        # so only entries with a finite cost are summed.
        support = np.isfinite(cost_matrix)
        wasserstein_dist = float(np.sum(transport_plan[support] * cost_matrix[support]))

        print(f"  Sinkhorn completed in {stage} stages, final ε={epsilon:.4f}")

        return transport_plan, wasserstein_dist

    @staticmethod
    def _normalize_weights(weights, size: int, name: str) -> np.ndarray:
        """Return ``weights`` as a length-``size`` vector summing to 1 (uniform if None)."""
        if weights is None:
            return np.full(size, 1.0 / size)
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (size,):
            raise ValueError(f"{name} must have shape ({size},)")
        total = weights.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError(f"{name} must have a positive, finite sum")
        return weights / total

    def _sparsify_cost_matrix(self, cost_matrix: np.ndarray) -> np.ndarray:
        """Mask costs above the ``sparsify_threshold`` quantile with infinity.

        The cheapest entry of every row and of every column is kept finite
        even if it lies above the threshold; otherwise a fully masked row or
        column could not send or receive any mass. The input is not
        modified; a float copy is returned.
        """
        threshold = np.quantile(cost_matrix, self.sparsify_threshold)
        sparse_cost = np.array(cost_matrix, dtype=float, copy=True)
        sparse_cost[sparse_cost > threshold] = np.inf

        rows = np.arange(sparse_cost.shape[0])
        cols = np.arange(sparse_cost.shape[1])
        cheapest_col = np.argmin(cost_matrix, axis=1)
        cheapest_row = np.argmin(cost_matrix, axis=0)
        # No-ops wherever the minimum was already at or below the threshold.
        sparse_cost[rows, cheapest_col] = cost_matrix[rows, cheapest_col]
        sparse_cost[cheapest_row, cols] = cost_matrix[cheapest_row, cols]
        return sparse_cost

# Smoke test: run the solver with its default settings on a random cost matrix.
print("Testing Adaptive Sinkhorn Solver...")
solver = AdaptiveSinkhornSolver()
# Entries are uniform draws in [0, 1) from NumPy's global RNG (seeded with
# RANDOM_SEED during setup), not distances between actual samples.
test_cost = np.random.rand(100, 100)
# No weights are passed, so both marginals default to uniform.
plan, dist = solver.solve(test_cost)
# NOTE(review): `dist` is the transport cost sum(plan * cost) under the random
# cost above; the "W2" label presumably only holds when the cost matrix
# contains squared Euclidean distances — confirm.
print(f"  Test W2 distance: {dist:.4f}")
# A coupling between two probability vectors has total mass 1.
print(f"  Transport plan sum: {plan.sum():.4f} (should be ≈1.0)")
print("  Sinkhorn solver ready")