In [None]:
# =============================================================================
# IMPORTS
# =============================================================================
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import ks_2samp, gaussian_kde, spearmanr, pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')

print("Libraries loaded successfully!")

## 1. Configuration

In [None]:
# =============================================================================
# CONFIGURATION - ANOMALY TRAFFIC GENERATION
# =============================================================================

# Input CSV with the labelled incident data, and the directory for results.
REAL_DATA_PATH = '/home/smotaali/BGP_Traffic_Generation/RIPE/RIPE_INCIDENTS/all_incidents_anomalies_reinforced_v2.csv'
OUTPUT_DIR = '/home/smotaali/BGP_Traffic_Generation/results/Copula_Anomaly/'

# Values of `confidence_label` that are kept as anomalies.
# Note: despite the name, 'medium_confidence' is part of the selection.
HIGH_CONFIDENCE_LABELS = ['medium_confidence', 'high_confidence', 'very_high_confidence']

# Number of synthetic rows to generate, and the seed used for generation.
N_SYNTHETIC = 20000
RANDOM_STATE = 42

# All 27 BGP features, grouped by family (order matters: it is the column order).
ALL_FEATURES = (
    ['announcements', 'withdrawals', 'nlri_ann', 'dups']
    + ['origin_0', 'origin_2', 'origin_changes']
    + ['imp_wd', 'imp_wd_spath', 'imp_wd_dpath']
    + ['as_path_max', 'unique_as_path_max']
    + ['edit_distance_avg', 'edit_distance_max']
    + [f'edit_distance_dict_{k}' for k in range(7)]
    + [f'edit_distance_unique_dict_{k}' for k in range(2)]
    + ['number_rare_ases', 'rare_ases_avg']
    + ['nadas', 'flaps']
)

# Integer features (must be rounded): everything except the two averages.
INTEGER_FEATURES = [
    name for name in ALL_FEATURES
    if name not in ('edit_distance_avg', 'rare_ases_avg')
]

# Features with a high proportion of zeros.
ZERO_INFLATED_FEATURES = ['flaps', 'nadas', 'imp_wd', 'number_rare_ases']

# Features with heavy tails (need special handling).
HEAVY_TAILED_FEATURES = ['unique_as_path_max', 'edit_distance_max', 'rare_ases_avg', 'as_path_max']

print("\n".join([
    "Configuration loaded for ANOMALY TRAFFIC GENERATION!",
    f"  Data path: {REAL_DATA_PATH}",
    f"  Confidence labels: {HIGH_CONFIDENCE_LABELS}",
    f"  Total features: {len(ALL_FEATURES)}",
    f"  Integer features: {len(INTEGER_FEATURES)}",
    f"  Zero-inflated features: {len(ZERO_INFLATED_FEATURES)}",
    f"  Heavy-tailed features: {len(HEAVY_TAILED_FEATURES)}",
]))

## 2. Load Real Anomaly Data

In [None]:
# =============================================================================
# LOAD ANOMALY DATA
# =============================================================================

# Read the raw incident CSV; a missing file is reported with a hint and the
# original exception is re-raised so the notebook stops here.
try:
    df_raw = pd.read_csv(REAL_DATA_PATH)
except FileNotFoundError:
    print(f"File not found: {REAL_DATA_PATH}")
    print("Please update REAL_DATA_PATH to your data location")
    raise
else:
    print(f"Loaded raw data: {df_raw.shape}")
    print(f"\nColumns: {df_raw.columns.tolist()}")

In [None]:
# Check confidence label distribution
# `confidence_label` is required here (a missing column raises KeyError);
# it is the column the next cell filters on.
print("Confidence label distribution:")
print(df_raw['confidence_label'].value_counts())

# `label` is optional: its distribution is only shown when the column exists.
if 'label' in df_raw.columns:
    print("\nOriginal label distribution:")
    print(df_raw['label'].value_counts())

In [None]:
# Filter for high confidence anomaly labels
# Keep only rows whose `confidence_label` is listed in HIGH_CONFIDENCE_LABELS.
# `.copy()` detaches the result from `df_raw` so later edits do not touch the raw frame.
df_filtered = df_raw[df_raw['confidence_label'].isin(HIGH_CONFIDENCE_LABELS)].copy()
print(f"\nFiltered to high confidence anomalies: {len(df_filtered)} samples")
print(f"Distribution within filtered data:")
print(df_filtered['confidence_label'].value_counts())

In [None]:
# Extract features
# Split the configured feature list into columns present in / absent from the data.
available_features = [f for f in ALL_FEATURES if f in df_filtered.columns]
missing_features = [f for f in ALL_FEATURES if f not in df_filtered.columns]

if missing_features:
    print(f"WARNING: Missing features: {missing_features}")
    # Update feature lists
    # NOTE: this rebinds the configuration constants for the rest of the
    # notebook, so every later cell sees only the features that exist in the data.
    ALL_FEATURES = available_features
    INTEGER_FEATURES = [f for f in INTEGER_FEATURES if f in available_features]
    ZERO_INFLATED_FEATURES = [f for f in ZERO_INFLATED_FEATURES if f in available_features]
    HEAVY_TAILED_FEATURES = [f for f in HEAVY_TAILED_FEATURES if f in available_features]

# Feature matrix of the real anomalies; missing values are imputed with the
# per-column median. A column that is entirely NaN has a NaN median and
# therefore stays NaN after this step.
X_real = df_filtered[available_features].copy()
X_real = X_real.fillna(X_real.median())

print(f"\nUsing {len(available_features)} features")
print(f"Anomaly data shape: {X_real.shape}")
print(f"\nFeature statistics:")
# Last expression of the cell: rendered as the cell output (one row per feature).
X_real.describe().T[['mean', 'std', 'min', 'max']]

## 3. Analyze Feature Characteristics

In [None]:
# =============================================================================
# ANALYZE FEATURE CHARACTERISTICS
# =============================================================================

def analyze_features(X):
    """
    Summarise per-feature distribution traits for copula configuration.

    For every column of ``X`` (NaN entries are dropped first) this computes

    - ``zero_ratio``: fraction of values equal to 0,
    - ``unique_ratio``: number of distinct values / number of values,
    - ``skewness``: ``scipy.stats.skew`` with default arguments,
    - ``kurtosis``: ``scipy.stats.kurtosis`` with default arguments, i.e.
      *excess* kurtosis (0 for a normal distribution),

    and derives three flags:

    - ``is_zero_inflated``: ``zero_ratio > 0.2``
    - ``is_heavy_tailed``: ``kurtosis > 3`` or ``skewness > 2``
    - ``is_discrete``: ``unique_ratio < 0.1`` or the column is listed in the
      module-level ``INTEGER_FEATURES``

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one numeric column per feature.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with the columns ``feature``,
        ``zero_ratio``, ``unique_ratio``, ``skewness``, ``kurtosis``,
        ``is_zero_inflated``, ``is_heavy_tailed`` and ``is_discrete``.
        The columns are present even when ``X`` has no columns.

    Notes
    -----
    A column with no non-NaN values gets NaN statistics and ``False`` for the
    statistic-based flags instead of raising ``ZeroDivisionError``.
    """
    output_columns = [
        'feature', 'zero_ratio', 'unique_ratio', 'skewness', 'kurtosis',
        'is_zero_inflated', 'is_heavy_tailed', 'is_discrete',
    ]
    analysis = []

    for col in X.columns:
        vals = X[col].dropna().values

        if len(vals) == 0:
            # Nothing to measure (e.g. an all-NaN column): report NaN rather
            # than dividing by zero below. NaN comparisons are False, so the
            # statistic-based flags come out False.
            zero_ratio = unique_ratio = skewness = kurtosis = np.nan
        else:
            zero_ratio = (vals == 0).mean()
            unique_ratio = len(np.unique(vals)) / len(vals)
            skewness = stats.skew(vals)
            kurtosis = stats.kurtosis(vals)

        analysis.append({
            'feature': col,
            'zero_ratio': zero_ratio,
            'unique_ratio': unique_ratio,
            'skewness': skewness,
            'kurtosis': kurtosis,
            # bool(...) keeps the flag columns a plain boolean dtype even when
            # NaN rows and numpy-bool rows are mixed.
            'is_zero_inflated': bool(zero_ratio > 0.2),
            'is_heavy_tailed': bool(kurtosis > 3 or skewness > 2),
            'is_discrete': bool(unique_ratio < 0.1 or col in INTEGER_FEATURES),
        })

    # Explicit columns so an empty input still yields an indexable frame.
    return pd.DataFrame(analysis, columns=output_columns)

# Run the per-feature analysis on the real anomaly features.
feature_analysis = analyze_features(X_real)

# Each flag column is boolean, so .sum() counts the features carrying that flag.
print("Feature Analysis Summary for ANOMALY Data:")
print(f"  Zero-inflated features: {feature_analysis['is_zero_inflated'].sum()}")
print(f"  Heavy-tailed features: {feature_analysis['is_heavy_tailed'].sum()}")
print(f"  Discrete features: {feature_analysis['is_discrete'].sum()}")

# Show problematic features
# i.e. rows flagged as zero-inflated OR heavy-tailed, with the statistics
# that triggered the flags.
print("\nFeatures needing special handling:")
problematic = feature_analysis[
    feature_analysis['is_zero_inflated'] | feature_analysis['is_heavy_tailed']
][['feature', 'zero_ratio', 'skewness', 'kurtosis']]
print(problematic.to_string(index=False))

## 4. Enhanced Copula Implementation

In [None]:
# =============================================================================
# HYBRID CORRELATION (PEARSON + SPEARMAN)
# =============================================================================

def compute_hybrid_correlation(X, pearson_weight=0.3, spearman_weight=0.7):
    """
    Blend Pearson and Spearman correlations into one valid correlation matrix.

    Pearson measures linear association and is sensitive to outliers;
    Spearman measures monotonic association and is robust to them. For
    heavy-tailed BGP anomaly features the rank-based part is weighted higher
    by default.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; correlations are computed between its columns.
    pearson_weight, spearman_weight : float
        Convex weights of the two matrices; they must sum to 1.

    Returns
    -------
    numpy.ndarray
        Symmetric matrix with unit diagonal whose eigenvalues were floored at
        1e-6 before re-normalisation.

    Raises
    ------
    AssertionError
        If the two weights do not sum to 1 (tolerance 1e-6).
    """
    assert abs(pearson_weight + spearman_weight - 1.0) < 1e-6, "Weights must sum to 1"

    # One cleaned matrix per method; undefined entries (NaN, e.g. produced by
    # constant columns) are treated as "no correlation".
    cleaned = {
        method: np.nan_to_num(X.corr(method=method).values, nan=0.0)
        for method in ('pearson', 'spearman')
    }

    blended = pearson_weight * cleaned['pearson'] + spearman_weight * cleaned['spearman']
    np.fill_diagonal(blended, 1.0)

    # Floor the spectrum so the matrix is positive definite.
    spectrum, basis = np.linalg.eigh(blended)
    floored = np.maximum(spectrum, 1e-6)
    repaired = basis @ np.diag(floored) @ basis.T

    # Flooring can move the diagonal away from 1: rescale back to a
    # correlation matrix.
    scale = np.sqrt(np.diag(repaired))
    repaired = repaired / np.outer(scale, scale)
    np.fill_diagonal(repaired, 1.0)

    return repaired

print("Hybrid correlation function defined!")

In [None]:
# =============================================================================
# MARGINAL ESTIMATION (KDE + EMPIRICAL)
# =============================================================================

class MarginalEstimator:
    """
    Per-feature marginal distributions for the copula.

    ``fit`` records, for every column, its zero ratio, whether it is treated
    as zero-inflated / heavy-tailed / integer, and its sorted values. For
    non-integer columns with more than 20 distinct values a
    ``scipy.stats.gaussian_kde`` is additionally fitted (on ``log1p`` of the
    data for heavy-tailed, non-negative columns) and stored in the marginal
    info.

    Sampling (``inverse_transform``) is purely empirical: it looks up
    quantiles in the stored sorted values, with a separate zero / non-zero
    split for zero-inflated columns. The stored KDE objects are not used by
    any method of this class.

    The class reads the module-level lists ``HEAVY_TAILED_FEATURES`` and
    ``INTEGER_FEATURES``.
    """

    def __init__(self, use_kde=True, handle_zeros=True, log_transform_heavy=True, kde_bandwidth_factor=1.0):
        """
        Parameters
        ----------
        use_kde : bool
            Fit and store a KDE for eligible columns during ``fit``.
        handle_zeros : bool
            Use the zero-inflated inverse for columns with > 20% zeros.
        log_transform_heavy : bool
            Fit the KDE on ``log1p(values)`` for heavy-tailed columns.
        kde_bandwidth_factor : float
            Multiplier applied to Scott's bandwidth factor.
        """
        self.use_kde = use_kde
        self.handle_zeros = handle_zeros
        self.log_transform_heavy = log_transform_heavy
        self.kde_bandwidth_factor = kde_bandwidth_factor
        self.marginals = {}

    def fit(self, X, feature_analysis=None):
        """
        Fit marginal distributions for each feature.

        Parameters
        ----------
        X : pandas.DataFrame
            Real data, one column per feature.
        feature_analysis : object, optional
            Accepted for interface compatibility; not used.

        Returns
        -------
        MarginalEstimator
            ``self``, to allow chaining.
        """
        self.feature_names = X.columns.tolist()
        self.n_features = len(self.feature_names)

        for col in self.feature_names:
            vals = X[col].values.copy()

            # Analyze feature
            zero_ratio = (vals == 0).mean()
            n_unique = len(np.unique(vals))
            is_heavy = col in HEAVY_TAILED_FEATURES or (vals.max() > 100 and stats.skew(vals) > 2)

            marginal_info = {
                'zero_ratio': zero_ratio,
                'is_zero_inflated': zero_ratio > 0.2,
                'is_heavy_tailed': is_heavy,
                'is_integer': col in INTEGER_FEATURES,
                'original_values': vals,
                'sorted_values': np.sort(vals),
                'method': 'empirical'
            }

            # Choose estimation method
            if self.use_kde and n_unique > 20 and not marginal_info['is_integer']:
                try:
                    # Scott's rule scaled by the user-supplied factor.
                    bw_method = lambda obj: obj.scotts_factor() * self.kde_bandwidth_factor

                    if is_heavy and self.log_transform_heavy and vals.min() >= 0:
                        log_vals = np.log1p(vals)
                        kde = gaussian_kde(log_vals, bw_method=bw_method)
                        marginal_info['kde'] = kde
                        marginal_info['method'] = 'kde_log'
                        marginal_info['log_sorted'] = np.sort(log_vals)
                    else:
                        kde = gaussian_kde(vals, bw_method=bw_method)
                        marginal_info['kde'] = kde
                        marginal_info['method'] = 'kde'
                except (np.linalg.LinAlgError, ValueError) as exc:
                    # Best effort: degenerate data makes gaussian_kde fail.
                    # Keep the empirical marginal, but say so instead of
                    # hiding the failure (and do not swallow
                    # KeyboardInterrupt / SystemExit as a bare except would).
                    warnings.warn(
                        f"KDE fit failed for feature '{col}' ({exc}); using empirical marginal",
                        RuntimeWarning,
                    )

            self.marginals[col] = marginal_info

        return self

    def transform_to_uniform(self, X):
        """
        Transform data to uniform [0,1] using average ranks.

        Each column is mapped to ``rank / (n_samples + 1)`` and clipped to
        ``[0.001, 0.999]``. Ranks are computed within ``X`` itself.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(X), n_features)``.
        """
        n_samples = len(X)
        uniform_data = np.zeros((n_samples, self.n_features))

        for i, col in enumerate(self.feature_names):
            vals = X[col].values
            ranks = stats.rankdata(vals, method='average')
            uniform_data[:, i] = np.clip(ranks / (n_samples + 1), 0.001, 0.999)

        return uniform_data

    def inverse_transform(self, uniform_data, n_samples):
        """
        Transform uniform samples back to original scale.

        Parameters
        ----------
        uniform_data : numpy.ndarray
            Uniform samples, one column per fitted feature.
        n_samples : int
            Number of rows of ``uniform_data``.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_samples, n_features)``; values are clamped to
            be non-negative and integer features are rounded.
        """
        synthetic_data = np.zeros((n_samples, self.n_features))

        for i, col in enumerate(self.feature_names):
            info = self.marginals[col]
            u = uniform_data[:, i]

            if self.handle_zeros and info['is_zero_inflated']:
                synthetic_data[:, i] = self._inverse_zero_inflated(u, info)
            else:
                synthetic_data[:, i] = self._inverse_standard(u, info)

            synthetic_data[:, i] = np.maximum(0, synthetic_data[:, i])

            if info['is_integer']:
                synthetic_data[:, i] = np.round(synthetic_data[:, i])

        return synthetic_data

    def _inverse_standard(self, u, info):
        """Empirical quantile lookup: ``u`` indexes into the sorted real values."""
        sorted_vals = info['sorted_values']
        indices = (u * len(sorted_vals)).astype(int)
        # u == 1.0 would index one past the end.
        indices = np.clip(indices, 0, len(sorted_vals) - 1)
        return sorted_vals[indices]

    def _inverse_zero_inflated(self, u, info):
        """
        Inverse CDF for zero-inflated features.

        ``u < zero_ratio`` maps to 0; the remaining probability mass is
        rescaled to [0, 1] and looked up among the strictly positive real
        values.
        """
        zero_ratio = info['zero_ratio']
        original = info['original_values']

        result = np.zeros(len(u))
        is_zero = u < zero_ratio

        non_zero_vals = original[original > 0]
        if len(non_zero_vals) > 0:
            # The 1e-10 guards against division by zero when zero_ratio == 1.
            scaled_u = (u[~is_zero] - zero_ratio) / (1 - zero_ratio + 1e-10)
            scaled_u = np.clip(scaled_u, 0, 1)

            sorted_non_zero = np.sort(non_zero_vals)
            indices = (scaled_u * len(sorted_non_zero)).astype(int)
            indices = np.clip(indices, 0, len(sorted_non_zero) - 1)
            result[~is_zero] = sorted_non_zero[indices]

        return result

print("Marginal Estimator class defined!")

In [None]:
# =============================================================================
# ENHANCED COPULA GENERATOR
# =============================================================================

class EnhancedCopulaGenerator:
    """
    Gaussian / Student-t copula generator for synthetic anomaly samples.

    ``fit`` estimates per-feature marginals with ``MarginalEstimator`` and a
    dependence structure with ``compute_hybrid_correlation`` (weighted
    Pearson + Spearman), then factors the correlation matrix with Cholesky.
    ``generate`` draws correlated normal (or multivariate-t) samples, maps
    them to uniforms with the matching CDF and pushes them through the
    marginals' inverse transform.

    ``copula_type == 'gaussian'`` selects the Gaussian copula; any other
    value selects the t-copula with ``t_df`` degrees of freedom.
    """

    def __init__(self,
                 copula_type='gaussian',
                 t_df=5,
                 pearson_weight=0.3,
                 spearman_weight=0.7,
                 use_kde_marginals=True,
                 kde_bandwidth_factor=1.0,
                 handle_zeros=True,
                 random_state=42):
        """
        Parameters
        ----------
        copula_type : str
            ``'gaussian'`` or ``'t'`` (see class docstring).
        t_df : int
            Degrees of freedom of the t-copula.
        pearson_weight, spearman_weight : float
            Weights passed to ``compute_hybrid_correlation``.
        use_kde_marginals, kde_bandwidth_factor, handle_zeros
            Passed to ``MarginalEstimator``.
        random_state : int
            Seed applied at the start of every ``generate`` call.
        """
        self.copula_type = copula_type
        self.t_df = t_df
        self.pearson_weight = pearson_weight
        self.spearman_weight = spearman_weight
        self.use_kde_marginals = use_kde_marginals
        self.kde_bandwidth_factor = kde_bandwidth_factor
        self.handle_zeros = handle_zeros
        self.random_state = random_state

        # Populated by fit().
        self.marginal_estimator = None
        self.correlation_matrix = None
        self.cholesky_L = None

    def fit(self, X):
        """
        Fit the copula model to real anomaly data.

        Parameters
        ----------
        X : pandas.DataFrame
            Real data, one column per feature.

        Returns
        -------
        EnhancedCopulaGenerator
            ``self``, to allow chaining.
        """
        print("Fitting Enhanced Copula for ANOMALY data...")
        print(f"  Configuration:")
        print(f"    - Copula type: {self.copula_type}" +
              (f" (df={self.t_df})" if self.copula_type == 't' else ""))
        print(f"    - Correlation: {self.pearson_weight:.0%} Pearson + {self.spearman_weight:.0%} Spearman")
        print(f"    - Marginals: {'KDE' if self.use_kde_marginals else 'Empirical'}" +
              (f" (bw={self.kde_bandwidth_factor}x)" if self.use_kde_marginals else ""))
        print(f"    - Zero handling: {self.handle_zeros}")

        self.feature_names = X.columns.tolist()
        self.n_features = len(self.feature_names)

        print("\n  [1/3] Fitting marginal distributions...")
        self.marginal_estimator = MarginalEstimator(
            use_kde=self.use_kde_marginals,
            handle_zeros=self.handle_zeros,
            kde_bandwidth_factor=self.kde_bandwidth_factor
        )
        self.marginal_estimator.fit(X)

        print("  [2/3] Computing hybrid correlation matrix...")
        self.correlation_matrix = compute_hybrid_correlation(
            X,
            pearson_weight=self.pearson_weight,
            spearman_weight=self.spearman_weight
        )

        print("  [3/3] Computing Cholesky decomposition...")
        try:
            self.cholesky_L = np.linalg.cholesky(self.correlation_matrix)
        except np.linalg.LinAlgError:
            # Shrink towards the identity. Dividing by (1 + jitter) keeps the
            # diagonal at exactly 1, so the latent variables keep unit
            # variance and the CDF step in generate() still yields uniforms.
            jitter = 0.01
            self.correlation_matrix = (
                self.correlation_matrix + jitter * np.eye(self.n_features)
            ) / (1.0 + jitter)
            self.cholesky_L = np.linalg.cholesky(self.correlation_matrix)

        print("  Fitting complete!")
        return self

    def generate(self, n_samples):
        """
        Generate synthetic anomaly samples.

        Seeds NumPy's global random generator with ``random_state``, so
        repeated calls return identical samples.

        Parameters
        ----------
        n_samples : int
            Number of rows to generate.

        Returns
        -------
        pandas.DataFrame
            ``n_samples`` rows with the fitted feature names as columns.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.cholesky_L is None or self.marginal_estimator is None:
            raise RuntimeError("EnhancedCopulaGenerator.generate() called before fit()")

        np.random.seed(self.random_state)

        print(f"\nGenerating {n_samples} synthetic anomaly samples...")

        if self.copula_type == 'gaussian':
            independent_samples = np.random.randn(n_samples, self.n_features)
        else:
            # Multivariate t: one chi-square mixing variable per row, shared
            # by all features of that row.
            chi2_samples = np.random.chisquare(self.t_df, n_samples) / self.t_df
            independent_samples = np.random.randn(n_samples, self.n_features)
            independent_samples = independent_samples / np.sqrt(chi2_samples)[:, np.newaxis]

        # Impose the fitted correlation structure.
        correlated_samples = independent_samples @ self.cholesky_L.T

        if self.copula_type == 'gaussian':
            uniform_samples = stats.norm.cdf(correlated_samples)
        else:
            uniform_samples = stats.t.cdf(correlated_samples, df=self.t_df)

        synthetic_data = self.marginal_estimator.inverse_transform(uniform_samples, n_samples)

        synthetic_df = pd.DataFrame(synthetic_data, columns=self.feature_names)

        print(f"  Generated shape: {synthetic_df.shape}")

        return synthetic_df

    def get_config(self):
        """Return the model configuration (without ``random_state``) as a dictionary."""
        return {
            'copula_type': self.copula_type,
            't_df': self.t_df,
            'pearson_weight': self.pearson_weight,
            'spearman_weight': self.spearman_weight,
            'use_kde_marginals': self.use_kde_marginals,
            'kde_bandwidth_factor': self.kde_bandwidth_factor,
            'handle_zeros': self.handle_zeros
        }

print("EnhancedCopulaGenerator class defined!")

## 5. Post-Processing & Constraints

In [None]:
# =============================================================================
# BGP CONSTRAINT ENFORCEMENT
# =============================================================================

def enforce_bgp_constraints(synthetic, real, verbose=True):
    """
    Enforce BGP domain-specific constraints on synthetic data.

    Steps, in order:

    1. NaN is filled (0 for integer features, otherwise the real column's
       median, or 0 if the column is absent from ``real``); +/-Inf becomes 0.
    2. Negative values are raised to 0.
    3. Each column present in ``real`` is clipped to
       ``[0, 1.1 * 99.5th percentile of the real column]``.
    4. Integer features (module-level ``INTEGER_FEATURES``) are rounded.
    5. ``origin_0 + origin_2 <= announcements``: violating rows have both
       origin counts scaled down proportionally and floored.
    6. ``imp_wd_spath + imp_wd_dpath <= imp_wd``: handled the same way.

    Bounds are applied *before* the relational constraints. Clipping the
    totals (``announcements``, ``imp_wd``) afterwards could push them below
    the sum of their parts and re-introduce the violation that steps 5-6
    had just repaired.

    Parameters
    ----------
    synthetic : pandas.DataFrame
        Generated samples; not modified.
    real : pandas.DataFrame
        Real data used for NaN fill values and upper bounds.
    verbose : bool
        Print the per-constraint violation counts.

    Returns
    -------
    pandas.DataFrame
        Corrected copy of ``synthetic``.
    """
    result = synthetic.copy()

    violations = {
        'nan_inf': 0,
        'negative': 0,
        'origin_constraint': 0,
        'imp_wd_constraint': 0,
        'bounds': 0
    }

    # 1. Handle NaN/Inf
    for col in result.columns:
        nan_count = result[col].isna().sum()
        if nan_count > 0:
            violations['nan_inf'] += nan_count
            if col in INTEGER_FEATURES:
                result[col] = result[col].fillna(0)
            else:
                result[col] = result[col].fillna(real[col].median() if col in real.columns else 0)

        # Count infinities too, so the report matches the 'nan_inf' label.
        inf_count = result[col].isin([np.inf, -np.inf]).sum()
        if inf_count > 0:
            violations['nan_inf'] += inf_count
        result[col] = result[col].replace([np.inf, -np.inf], 0)

    # 2. Non-negative
    for col in result.columns:
        neg_count = (result[col] < 0).sum()
        if neg_count > 0:
            violations['negative'] += neg_count
            result[col] = np.maximum(0, result[col])

    # 3. Realistic bounds (99.5th percentile of the real data, plus 10%)
    for col in result.columns:
        if col in real.columns:
            real_vals = real[col].dropna()
            if len(real_vals) == 0:
                # No real observations to derive a bound from.
                continue
            upper = np.percentile(real_vals, 99.5) * 1.1
            over = (result[col] > upper).sum()
            if over > 0:
                violations['bounds'] += over
                result[col] = np.clip(result[col], 0, upper)

    # 4. Integer features
    for col in INTEGER_FEATURES:
        if col in result.columns:
            result[col] = np.round(result[col]).astype(int)

    # 5. Origin constraint: origin_0 + origin_2 <= announcements
    if all(c in result.columns for c in ['origin_0', 'origin_2', 'announcements']):
        origin_sum = result['origin_0'] + result['origin_2']
        excess = origin_sum > result['announcements']
        violations['origin_constraint'] = excess.sum()

        if excess.any():
            # Proportional shrink; flooring guarantees the sum cannot exceed
            # the total after scaling.
            scale = result.loc[excess, 'announcements'] / (origin_sum[excess] + 1e-10)
            result.loc[excess, 'origin_0'] = np.floor(result.loc[excess, 'origin_0'] * scale).astype(int)
            result.loc[excess, 'origin_2'] = np.floor(result.loc[excess, 'origin_2'] * scale).astype(int)

    # 6. Implicit withdrawal constraint: imp_wd_spath + imp_wd_dpath <= imp_wd
    if all(c in result.columns for c in ['imp_wd_spath', 'imp_wd_dpath', 'imp_wd']):
        imp_sum = result['imp_wd_spath'] + result['imp_wd_dpath']
        excess = imp_sum > result['imp_wd']
        violations['imp_wd_constraint'] = excess.sum()

        if excess.any():
            scale = result.loc[excess, 'imp_wd'] / (imp_sum[excess] + 1e-10)
            result.loc[excess, 'imp_wd_spath'] = np.floor(result.loc[excess, 'imp_wd_spath'] * scale).astype(int)
            result.loc[excess, 'imp_wd_dpath'] = np.floor(result.loc[excess, 'imp_wd_dpath'] * scale).astype(int)

    # Final integer enforcement (guarantees an integer dtype on the way out)
    for col in INTEGER_FEATURES:
        if col in result.columns:
            result[col] = np.maximum(0, np.round(result[col])).astype(int)

    if verbose:
        total = sum(violations.values())
        if total > 0:
            print(f"  Constraints fixed: {violations}")
        else:
            print("  No constraint violations")

    return result

print("Constraint enforcement function defined!")

In [None]:
# =============================================================================
# CORRELATION ALIGNMENT POST-PROCESSING
# =============================================================================

def _repair_and_factor_correlation(corr):
    """Turn ``corr`` into a positive-definite correlation matrix; return ``(corr, cholesky_L)``."""
    # atleast_2d: np.corrcoef returns a 0-d array for a single feature.
    corr = np.atleast_2d(np.nan_to_num(corr, nan=0.0))
    np.fill_diagonal(corr, 1.0)

    # Floor the eigenvalues, then rescale back to a unit diagonal.
    eigvals, eigvecs = np.linalg.eigh(corr)
    eigvals = np.maximum(eigvals, 1e-6)
    corr = eigvecs @ np.diag(eigvals) @ eigvecs.T
    d = np.sqrt(np.diag(corr))
    corr = corr / np.outer(d, d)
    np.fill_diagonal(corr, 1.0)

    try:
        chol = np.linalg.cholesky(corr)
    except np.linalg.LinAlgError:
        corr = corr + 0.01 * np.eye(corr.shape[0])
        chol = np.linalg.cholesky(corr)

    return corr, chol


def align_correlation_structure(synthetic, real, n_iterations=10, learning_rate=0.3, verbose=True):
    """
    Post-process synthetic data to better match real data's correlation structure.

    The synthetic data is standardised and iteratively re-coloured: each
    iteration whitens it with the Cholesky factor of its current Pearson
    correlation and re-colours it with a blend of the current and the target
    factor. Finally every column is rebuilt by *reordering its own original
    values* according to the ranks of the adjusted scores, so each output
    column is a permutation of the corresponding input column and the
    marginal distributions are preserved exactly.

    Parameters
    ----------
    synthetic : pandas.DataFrame
        Samples to adjust; not modified.
    real : pandas.DataFrame
        Reference data; must contain every column of ``synthetic``.
    n_iterations : int
        Number of whitening / re-colouring iterations.
    learning_rate : float
        Weight of the target Cholesky factor in each blend.
    verbose : bool
        Print the correlation-structure match before and after alignment.

    Returns
    -------
    pandas.DataFrame
        Float-valued frame with a fresh default index and the same columns
        as ``synthetic``.
    """
    result = synthetic.copy()
    features = list(result.columns)
    n_features = len(features)
    upper = np.triu_indices(n_features, k=1)

    target_corr, target_L = _repair_and_factor_correlation(
        real[features].corr(method='pearson').values
    )

    if verbose:
        current_corr = result[features].corr(method='pearson').values
        current_flat = current_corr[upper]
        target_flat = target_corr[upper]
        initial_match = np.corrcoef(current_flat, target_flat)[0, 1]
        print(f"Correlation Alignment: Initial match = {initial_match:.4f}")

    data = result[features].values.copy()
    means = data.mean(axis=0)
    stds = data.std(axis=0)
    stds[stds == 0] = 1  # constant columns: avoid division by zero
    Z = (data - means) / stds

    for _ in range(n_iterations):
        _, current_L = _repair_and_factor_correlation(np.corrcoef(Z.T))

        current_L_inv = np.linalg.inv(current_L)
        blended_L = (1 - learning_rate) * current_L + learning_rate * target_L

        # Whiten with the current factor, re-colour with the blended one.
        Z = Z @ current_L_inv.T @ blended_L.T
        Z = (Z - Z.mean(axis=0)) / (Z.std(axis=0) + 1e-10)

    n_rows = len(Z)
    adjusted_data = np.zeros_like(Z)
    for i, col in enumerate(features):
        sorted_original = np.sort(result[col].values)
        # Exact 0-based ordinal rank of every adjusted score. Integer ranks
        # avoid the float round-trip (rank / (n - 1) * (n - 1)).astype(int),
        # which truncates values such as 0.9999999999999999 to the wrong slot
        # and thereby duplicates some original values while dropping others.
        order = np.argsort(Z[:, i], kind='stable')
        positions = np.empty(n_rows, dtype=int)
        positions[order] = np.arange(n_rows)
        adjusted_data[:, i] = sorted_original[positions]

    aligned_result = pd.DataFrame(adjusted_data, columns=features)

    if verbose:
        final_corr = aligned_result[features].corr(method='pearson').values
        final_flat = final_corr[upper]
        final_match = np.corrcoef(final_flat, target_flat)[0, 1]
        improvement = final_match - initial_match
        print(f"Correlation Alignment: Final match = {final_match:.4f} ({'+' if improvement >= 0 else ''}{improvement:.4f})")

    return aligned_result


def iterative_correlation_refinement(synthetic, real, target_corr_match=0.95, max_rounds=5, verbose=True):
    """
    Repeat correlation alignment until the correlation structure matches.

    The "match" is the Pearson correlation between the upper-triangle
    entries of the synthetic and the real Pearson correlation matrices
    (undefined entries counted as 0). Before each round the match is
    measured; if it is at least ``target_corr_match`` the loop stops,
    otherwise ``align_correlation_structure`` is applied with 15 iterations
    and a learning rate that grows by 0.1 per round, capped at 0.5.

    Parameters
    ----------
    synthetic : pandas.DataFrame
        Samples to refine; not modified.
    real : pandas.DataFrame
        Reference data containing every column of ``synthetic``.
    target_corr_match : float
        Match at which refinement stops.
    max_rounds : int
        Maximum number of alignment rounds.
    verbose : bool
        Print progress per round and the final match.

    Returns
    -------
    pandas.DataFrame
        The refined samples.
    """
    result = synthetic.copy()
    features = list(result.columns)
    n_features = len(features)
    upper = np.triu_indices(n_features, k=1)

    def _flat_upper(frame):
        # Upper triangle of the Pearson matrix, with NaN entries zeroed.
        matrix = frame[features].corr(method='pearson').values
        return np.nan_to_num(matrix[upper], nan=0.0)

    def _agreement(flat, reference):
        usable = ~(np.isnan(flat) | np.isnan(reference))
        if usable.sum() > 1:
            return np.corrcoef(flat[usable], reference[usable])[0, 1]
        return 0

    target_flat = _flat_upper(real)

    banner = "="*60
    if verbose:
        print(banner)
        print("ITERATIVE CORRELATION REFINEMENT")
        print(banner)

    for round_num in range(1, max_rounds + 1):
        corr_match = _agreement(_flat_upper(result), target_flat)

        if verbose:
            print(f"\nRound {round_num}: Correlation match = {corr_match:.4f}")

        if corr_match >= target_corr_match:
            if verbose:
                print(f"Target reached ({target_corr_match:.2f})!")
            break

        # Step size ramps up with the round number, capped at 0.5.
        lr = min(0.5, 0.2 + 0.1 * round_num)
        result = align_correlation_structure(result, real, n_iterations=15, learning_rate=lr, verbose=False)

    final_match = _agreement(_flat_upper(result), target_flat)

    if verbose:
        print(f"\nFinal correlation match: {final_match:.4f}")
        print(banner)

    return result

print("Correlation alignment functions defined!")

## 6. Evaluation Functions

In [None]:
# =============================================================================
# EVALUATION
# =============================================================================

# Quality thresholds for the per-feature two-sample KS statistic
# (a smaller statistic means the synthetic and real distributions are closer).
KS_EXCELLENT = 0.05
# evaluate_synthetic counts a feature as "good or better" when its KS statistic is below KS_GOOD.
KS_GOOD = 0.10
KS_MODERATE = 0.15

def evaluate_synthetic(real, synthetic, verbose=True):
    """
    Evaluate synthetic anomaly data quality.
    """
    common_cols = [c for c in real.columns if c in synthetic.columns]
    n_features = len(common_cols)
    
    ks_stats = {}
    cohens_d = {}
    wasserstein = {}
    
    for col in common_cols:
        real_vals = real[col].dropna().values
        syn_vals = synthetic[col].dropna().values
        
        if len(real_vals) == 0 or len(syn_vals) == 0:
            continue
        
        ks_stat, _ = ks_2samp(syn_vals, real_vals)
        ks_stats[col] = ks_stat
        
        pooled_std = np.sqrt((real_vals.std()**2 + syn_vals.std()**2) / 2)
        if pooled_std > 0:
            d = (syn_vals.mean() - real_vals.mean()) / pooled_std
            d = np.clip(d, -10, 10)
        else:
            d = 0
        cohens_d[col] = d
        
        r_range = real_vals.max() - real_vals.min()
        s_range = syn_vals.max() - syn_vals.min()
        r_norm = (real_vals - real_vals.min()) / (r_range + 1e-10) if r_range > 0 else np.zeros_like(real_vals)
        s_norm = (syn_vals - syn_vals.min()) / (s_range + 1e-10) if s_range > 0 else np.zeros_like(syn_vals)
        wasserstein[col] = stats.wasserstein_distance(r_norm, s_norm)
    
    good_or_better = sum(1 for ks in ks_stats.values() if ks < KS_GOOD)
    distribution_score = (good_or_better / n_features) * 100
    
    real_corr = real[common_cols].corr()
    syn_corr = synthetic[common_cols].corr()
    
    real_flat = real_corr.values[np.triu_indices(n_features, k=1)]
    syn_flat = syn_corr.values[np.triu_indices(n_features, k=1)]
    
    valid = ~(np.isnan(real_flat) | np.isnan(syn_flat))
    if valid.sum() > 1:
        structure_corr = np.corrcoef(real_flat[valid], syn_flat[valid])[0, 1]
    else:
        structure_corr = 0
    
    correlation_score = ((structure_corr + 1) / 2) * 100
    
    negligible = sum(1 for d in cohens_d.values() if abs(d) < 0.2)
    small = sum(1 for d in cohens_d.values() if 0.2 <= abs(d) < 0.5)
    medium = sum(1 for d in cohens_d.values() if 0.5 <= abs(d) < 0.8)
    large = sum(1 for d in cohens_d.values() if abs(d) >= 0.8)
    
    effect_score = ((negligible * 1.0 + small * 0.75 + medium * 0.25 + large * 0.0) / n_features) * 100
    
    mean_wd = np.mean(list(wasserstein.values()))
    wasserstein_score = max(0, (1 - mean_wd * 2)) * 100
    
    weights = {'distribution': 0.25, 'correlation': 0.25, 'effect_size': 0.30, 'wasserstein': 0.20}
    overall_score = (
        distribution_score * weights['distribution'] +
        correlation_score * weights['correlation'] +
        effect_score * weights['effect_size'] +
        wasserstein_score * weights['wasserstein']
    )
    
    results = {
        'overall_score': overall_score,
        'distribution_score': distribution_score,
        'correlation_score': correlation_score,
        'effect_score': effect_score,
        'wasserstein_score': wasserstein_score,
        'mean_ks': np.mean(list(ks_stats.values())),
        'structure_corr': structure_corr,
        'ks_stats': ks_stats,
        'cohens_d': cohens_d,
        'effect_counts': {'negligible': negligible, 'small': small, 'medium': medium, 'large': large},
        'good_features': good_or_better,
        'n_features': n_features
    }
    
    if verbose:
        print("="*70)
        print("EVALUATION RESULTS - ANOMALY DATA")
        print("="*70)
        print(f"\nDistribution (KS < {KS_GOOD}): {good_or_better}/{n_features} = {distribution_score:.1f}/100")
        print(f"Correlation Structure: {structure_corr:.3f} -> {correlation_score:.1f}/100")
        print(f"Effect Size: neg={negligible}, small={small}, med={medium}, large={large} -> {effect_score:.1f}/100")
        print(f"Wasserstein: {mean_wd:.4f} -> {wasserstein_score:.1f}/100")
        print(f"\n{'='*70}")
        print(f"OVERALL SCORE: {overall_score:.1f}/100")
        print(f"{'='*70}")
        
        if overall_score >= 80:
            print("Verdict: EXCELLENT")
        elif overall_score >= 70:
            print("Verdict: GOOD")
        elif overall_score >= 50:
            print("Verdict: MODERATE")
        else:
            print("Verdict: POOR")
    
    return results

print("Evaluation function defined!")

## 7. Run Enhanced Copula Generation for Anomaly Data

In [None]:
# =============================================================================
# TEST MULTIPLE CONFIGURATIONS FOR ANOMALY DATA
# =============================================================================

_RULE = "=" * 70
print(_RULE)
print("TESTING COPULA CONFIGURATIONS FOR ANOMALY DATA")
print(_RULE)
print(f"\nOriginal anomaly samples: {len(X_real)}")
print(f"Target synthetic samples: {N_SYNTHETIC}\n")

# Marginal-model settings shared by every candidate unless a row overrides them.
_MARGINAL_DEFAULTS = {
    'use_kde_marginals': True,
    'kde_bandwidth_factor': 1.0,
    'handle_zeros': True,
}

# Baseline configurations, one row per candidate:
# (name, copula_type, t_df, pearson_weight, spearman_weight, overrides)
_CANDIDATES = [
    ('Gaussian Copula + Spearman', 'gaussian', 5, 0.3, 0.7, {}),
    ('t-Copula(df=3) + Pearson', 't', 3, 1.0, 0.0, {}),
    ('t-Copula(df=4) + Pearson', 't', 4, 1.0, 0.0, {}),
    ('t-Copula(df=5) + Pearson', 't', 5, 1.0, 0.0, {}),
    ('t-Copula(df=3) + 90%Pearson/10%Spearman', 't', 3, 0.90, 0.10, {}),
    ('t-Copula(df=4) + KDE(bw=0.75x)', 't', 4, 1.0, 0.0, {'kde_bandwidth_factor': 0.75}),
]

# Expand each row into its own keyword dictionary (a fresh dict per candidate).
configurations = [
    {
        'name': label,
        'copula_type': family,
        't_df': dof,
        'pearson_weight': w_pearson,
        'spearman_weight': w_spearman,
        **_MARGINAL_DEFAULTS,
        **overrides,
    }
    for label, family, dof, w_pearson, w_spearman, overrides in _CANDIDATES
]

print(f"Testing {len(configurations)} configurations...\n")

results_list = []
# Reset so a `best` left over from an earlier run of this cell is never reused
# when the current sweep produces no results.
best = None

for i, config in enumerate(configurations, 1):
    name = config['name']
    # Constructor keyword arguments: everything except the display name.
    # Built as a separate dict so the entries of `configurations` are never
    # mutated and the stored 'config' can be passed straight back to the
    # generator later.
    params = {key: value for key, value in config.items() if key != 'name'}

    print(f"[{i}/{len(configurations)}] {name}")
    print("-" * 60)

    try:
        generator = EnhancedCopulaGenerator(**params, random_state=RANDOM_STATE)
        generator.fit(X_real)

        synthetic = generator.generate(N_SYNTHETIC)
        synthetic = enforce_bgp_constraints(synthetic, X_real, verbose=False)

        eval_result = evaluate_synthetic(X_real, synthetic, verbose=False)

        print(f"  Score: {eval_result['overall_score']:.1f}/100")
        print(f"  KS Good: {eval_result['good_features']}/{eval_result['n_features']}")
        print(f"  Correlation: {eval_result['structure_corr']:.3f}")

        results_list.append({
            'name': name,
            'score': eval_result['overall_score'],
            'synthetic': synthetic,
            'eval': eval_result,
            'config': params
        })

    except Exception as e:
        # Best-effort sweep: one failing configuration must not stop the
        # others, so the error is reported and the loop moves on.
        print(f"  FAILED: {e}")

    print()

# Sort and display results
print("="*70)
print("CONFIGURATION RANKING")
print("="*70)
results_sorted = sorted(results_list, key=lambda x: x['score'], reverse=True)
for i, r in enumerate(results_sorted, 1):
    marker = " <-- BEST" if i == 1 else ""
    print(f"{i:2d}. {r['name']}: {r['score']:.1f}/100{marker}")

if results_sorted:
    # Take the winner from the ranking itself so the "<-- BEST" marker and the
    # banner below can never disagree.
    best = results_sorted[0]
    print(f"\n{'='*70}")
    print(f"BEST CONFIGURATION: {best['name']}")
    print(f"SCORE: {best['score']:.1f}/100")
    print(f"{'='*70}")
else:
    print("\nNo configuration completed successfully - see the FAILED messages above.")

In [None]:
# Use best configuration
if results_list:
    best_synthetic = best['synthetic']
    best_eval = best['eval']

    print("\nFull Evaluation of Best Configuration:")
    # Called only for its printed report; the metrics are already stored in
    # best_eval. The result is deliberately not bound to `_`, which IPython
    # uses for the last cell output.
    evaluate_synthetic(X_real, best_synthetic, verbose=True)
else:
    # The following cells need best_synthetic / best_eval; stop here with an
    # actionable message instead of failing later with a NameError.
    raise RuntimeError(
        "No copula configuration produced synthetic data; resolve the failures "
        "reported by the configuration sweep before continuing."
    )

In [None]:
# Correlation alignment is always attempted on the best candidate; whichever
# of the aligned / unaligned datasets scores higher is kept at the end.
print(f"\n{'=' * 70}")
print("APPLYING CORRELATION ALIGNMENT POST-PROCESSING")
print("=" * 70)

aligned_synthetic = enforce_bgp_constraints(
    iterative_correlation_refinement(
        best_synthetic,
        X_real,
        target_corr_match=0.92,
        max_rounds=5,
        verbose=True,
    )
    if not print("", end="") else None,
    X_real,
    verbose=True,
) if False else None

# Step 1: iterative refinement of the correlation structure.
aligned_synthetic = iterative_correlation_refinement(
    best_synthetic,
    X_real,
    target_corr_match=0.92,
    max_rounds=5,
    verbose=True,
)

# Step 2: refinement may have moved values, so the BGP constraints are enforced again.
print("\nRe-applying BGP constraints...")
aligned_synthetic = enforce_bgp_constraints(aligned_synthetic, X_real, verbose=True)

# Step 3: score the post-processed data with the same evaluation as before.
print("\nEvaluating aligned synthetic data:")
aligned_eval = evaluate_synthetic(X_real, aligned_synthetic, verbose=True)

# Step 4: keep the aligned data only when it is strictly better overall.
alignment_helped = aligned_eval['overall_score'] > best_eval['overall_score']
if not alignment_helped:
    final_synthetic, final_eval = best_synthetic, best_eval
    print("\nUsing ORIGINAL best synthetic data (better score)")
else:
    final_synthetic, final_eval = aligned_synthetic, aligned_eval
    print("\nUsing ALIGNED synthetic data (better score)")

## 8. Visualization

In [None]:
# Distribution comparison for key features
key_features = ['announcements', 'withdrawals', 'edit_distance_avg', 'rare_ases_avg']
# Keep only features present in BOTH frames, since each is indexed below.
key_features = [
    f for f in key_features
    if f in X_real.columns and f in final_synthetic.columns
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# zip() stops at the shorter sequence, so at most one feature per panel is drawn.
for ax, col in zip(axes, key_features):
    # Drop missing values explicitly: the histogram range cannot be derived
    # from data containing NaN.
    real_vals = X_real[col].dropna().to_numpy()
    syn_vals = final_synthetic[col].dropna().to_numpy()

    # Use one set of bin edges for both samples; with independent binning the
    # bars have different widths and the overlaid densities are not comparable.
    bin_edges = np.histogram_bin_edges(np.concatenate([real_vals, syn_vals]), bins=50)

    ax.hist(real_vals, bins=bin_edges, alpha=0.5, label='Real Anomaly', density=True)
    ax.hist(syn_vals, bins=bin_edges, alpha=0.5, label='Synthetic Anomaly', density=True)

    ks_stat = final_eval['ks_stats'].get(col, 0)
    ax.set_title(f'{col}\nKS={ks_stat:.3f}')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.legend()

# Hide panels that received no feature instead of showing empty axes.
for ax in axes[len(key_features):]:
    ax.set_visible(False)

plt.suptitle('Distribution Comparison: Real vs Synthetic ANOMALY Data', y=1.02, fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix comparison
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Features shown in both heatmaps; restricted to columns that exist in the
# real AND the synthetic frame because both are indexed with this list.
corr_features = key_features + ['origin_0', 'origin_2', 'dups']
corr_features = [
    f for f in corr_features
    if f in X_real.columns and f in final_synthetic.columns
]

real_corr = X_real[corr_features].corr()
syn_corr = final_synthetic[corr_features].corr()

# Pin the colour scale to the full correlation range [-1, 1] on both panels so
# the same colour means the same value left and right; otherwise each heatmap
# derives its own limits from its data.
panels = (
    (axes[0], real_corr, 'Real Anomaly Data Correlation'),
    (axes[1], syn_corr, 'Synthetic Anomaly Data Correlation'),
)
for ax, corr_matrix, title in panels:
    sns.heatmap(
        corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
        center=0, vmin=-1, vmax=1, ax=ax
    )
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 9. Save Results

In [None]:
# Make sure the destination folder exists before anything is written
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory: {OUTPUT_DIR}")

# --- Export 1: synthetic anomaly samples only ---
synthetic_path = os.path.join(OUTPUT_DIR, 'copula_synthetic_anomaly.csv')
final_synthetic.to_csv(synthetic_path, index=False)
print(f"\nSaved synthetic anomaly data: {synthetic_path}")
print(f"  Samples: {len(final_synthetic)}")

# --- Export 2: original + synthetic rows, tagged with their provenance ---
# .assign returns a new frame, so X_real and final_synthetic are left untouched.
df_original = X_real.assign(data_source='original', label='anomaly')
df_synthetic = final_synthetic.assign(data_source='synthetic', label='anomaly')
df_combined = pd.concat([df_original, df_synthetic], ignore_index=True)

combined_path = os.path.join(OUTPUT_DIR, 'combined_anomaly_data.csv')
df_combined.to_csv(combined_path, index=False)

print(f"\nSaved combined anomaly data: {combined_path}")
print(f"  Original: {len(X_real)}")
print(f"  Synthetic: {len(final_synthetic)}")
print(f"  Total: {len(df_combined)}")

In [None]:
# Final summary: recap of inputs, chosen configuration, quality metrics and outputs
print("\n" + "="*70)
print("COPULA ANOMALY TRAFFIC GENERATION COMPLETE")
print("="*70)

print("\nInput Data:")
print(f"  Source: {REAL_DATA_PATH}")
print(f"  Confidence labels: {HIGH_CONFIDENCE_LABELS}")
print(f"  Original samples: {len(X_real)}")

print(f"\nBest Configuration: {best['name']}")

# All four components of the overall score are listed, including the
# Wasserstein component reported by evaluate_synthetic.
print("\nQuality Metrics:")
print(f"  Overall Score: {final_eval['overall_score']:.1f}/100")
print(f"  Distribution Score: {final_eval['distribution_score']:.1f}/100")
print(f"  Correlation Score: {final_eval['correlation_score']:.1f}/100")
print(f"  Effect Size Score: {final_eval['effect_score']:.1f}/100")
print(f"  Wasserstein Score: {final_eval['wasserstein_score']:.1f}/100")

print("\nOutput Files:")
print(f"  Synthetic only: {synthetic_path}")
print(f"  Combined data: {combined_path}")
print("="*70)

In [None]:
# Display sample of generated data
# The bare expression on the last line is the cell's output value, so the
# notebook renders the first 10 rows of final_synthetic below the message.
print("\nSample of generated synthetic anomaly data:")
final_synthetic.head(10)