# SMOTE-based Anomaly Traffic Generation

This notebook generates synthetic anomalous BGP traffic features using SMOTE-based oversampling.
Only rows whose `confidence_label` is `medium_confidence`, `high_confidence`, or `very_high_confidence` are used as the source anomalies.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from scipy import stats
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this blanket filter also hides warnings raised by the libraries used
# below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

print("Libraries loaded successfully!")

## 2. Define Feature Sets with Enhanced Categorization

In [None]:
# Values of `confidence_label` that qualify a row as a source anomaly
HIGH_CONFIDENCE_LABELS = ['medium_confidence', 'high_confidence', 'very_high_confidence']

# Edit-distance histogram buckets 0..6 (kept as their own list for constraint validation)
EDIT_DISTANCE_DICT_FEATURES = [f'edit_distance_dict_{bucket}' for bucket in range(7)]

# Count-valued features: SMOTE interpolates floats, so these are rounded afterwards
INTEGER_FEATURES = [
    'announcements', 'withdrawals', 'nlri_ann', 'dups',
    'origin_0', 'origin_2', 'origin_changes',
    'imp_wd', 'imp_wd_spath', 'imp_wd_dpath',
    'as_path_max', 'unique_as_path_max',
    'edit_distance_max',
    *EDIT_DISTANCE_DICT_FEATURES,
    'edit_distance_unique_dict_0', 'edit_distance_unique_dict_1',
    'number_rare_ases',
    'nadas', 'flaps',
]

# Heavy-tailed counts that are log1p-transformed before SMOTE
HEAVY_TAILED_FEATURES = ['announcements', 'withdrawals', 'nlri_ann', 'number_rare_ases', 'nadas']

# Float-valued (but bounded) features
CONTINUOUS_FEATURES = ['edit_distance_avg', 'rare_ases_avg']

# Ratio features that are clipped to the empirical min/max
BOUNDED_RATIO_FEATURES = ['rare_ases_avg']  # ratio, typically between 0 and 1

# Small core subset used for quality validation
CORE_VALIDATION_FEATURES = ['announcements', 'withdrawals', 'nlri_ann', 'edit_distance_avg']

# Everything fed to the generator: integer features first, then continuous ones
ALL_FEATURES = [*INTEGER_FEATURES, *CONTINUOUS_FEATURES]

# Label, timestamp and derived-score columns that are not generation features
EXCLUDE_COLS = [
    'label', 'confidence_label',
    'window_start', 'window_end',
    'iso_forest_score', 'lof_score', 'statistical_score', 'elliptic_score',
    'cluster', 'anomaly_votes', 'consensus_score',
    'Incident',
]

# Columns that could serve as grouping keys when the dataset provides them
POTENTIAL_GROUP_COLS = ['peer_asn', 'peer_ip', 'prefix', 'collector', 'Incident']

print(f"Integer features: {len(INTEGER_FEATURES)}")
print(f"Heavy-tailed features: {len(HEAVY_TAILED_FEATURES)}")
print(f"Continuous features: {len(CONTINUOUS_FEATURES)}")
print(f"Total features for generation: {len(ALL_FEATURES)}")

## 3. Load and Explore Data

In [None]:
# Source CSV: incident windows with reinforced anomaly labels
DATA_PATH = '/home/smotaali/BGP_Traffic_Generation/RIPE/RIPE_INCIDENTS/all_incidents_anomalies_reinforced_v2.csv'

df = pd.read_csv(DATA_PATH)

# Report the size and the available columns, then preview the first rows
print(f"Original dataset shape: {df.shape}", f"\nColumns: {df.columns.tolist()}", sep="\n")
df.head(5)

In [None]:
# How many rows carry each confidence label
confidence_counts = df['confidence_label'].value_counts()
print("Confidence label distribution:", confidence_counts, sep="\n")

# The raw label column is optional; report it only when the dataset has one
if 'label' in df.columns:
    print("\nOriginal label distribution:", df['label'].value_counts(), sep="\n")

In [None]:
# Keep only the rows whose confidence label is in the accepted set
is_high_confidence = df['confidence_label'].isin(HIGH_CONFIDENCE_LABELS)
df_high_confidence = df.loc[is_high_confidence].copy()

print(f"High confidence anomaly samples: {len(df_high_confidence)}")
print(
    f"\nDistribution within high confidence:",
    df_high_confidence['confidence_label'].value_counts(),
    sep="\n",
)

In [None]:
# Verify all features exist in the dataset
missing_features = [f for f in ALL_FEATURES if f not in df.columns]
if missing_features:
    print(f"WARNING: Missing features: {missing_features}")
    # Prune every feature list, not just a subset of them: the later cells index
    # the feature matrix with these lists (e.g. CORE_VALIDATION_FEATURES for the
    # correlation heatmaps), so a stale name would raise a KeyError there.
    available_columns = set(df.columns)
    ALL_FEATURES = [f for f in ALL_FEATURES if f in available_columns]
    INTEGER_FEATURES = [f for f in INTEGER_FEATURES if f in available_columns]
    HEAVY_TAILED_FEATURES = [f for f in HEAVY_TAILED_FEATURES if f in available_columns]
    CONTINUOUS_FEATURES = [f for f in CONTINUOUS_FEATURES if f in available_columns]
    BOUNDED_RATIO_FEATURES = [f for f in BOUNDED_RATIO_FEATURES if f in available_columns]
    EDIT_DISTANCE_DICT_FEATURES = [f for f in EDIT_DISTANCE_DICT_FEATURES if f in available_columns]
    CORE_VALIDATION_FEATURES = [f for f in CORE_VALIDATION_FEATURES if f in available_columns]
    print(f"Updated to {len(ALL_FEATURES)} available features")
else:
    print("All features found in dataset!")

In [None]:
# Which of the candidate grouping columns does this dataset actually contain?
available_group_cols = [name for name in POTENTIAL_GROUP_COLS if name in df.columns]
print(f"Available grouping columns: {available_group_cols}")

# Cardinality of each available grouping column within the filtered anomalies
# (the loop simply does nothing when no grouping column is available)
for group_col in available_group_cols:
    print(f"  {group_col}: {df_high_confidence[group_col].nunique()} unique values")

In [None]:
# Build the feature matrix from the raw BGP features only, filling any missing
# value with the median of its column
X_anomaly = (
    df_high_confidence[ALL_FEATURES]
    .pipe(lambda features: features.fillna(features.median()))
)

print(f"Feature matrix shape: {X_anomaly.shape}")
X_anomaly.describe()

In [None]:
# Observed range of every feature; used later to clip synthetic values after
# the inverse transform
EMPIRICAL_BOUNDS = {
    name: {
        'min': X_anomaly[name].min(),
        'max': X_anomaly[name].max(),
        'q01': X_anomaly[name].quantile(0.01),
        'q99': X_anomaly[name].quantile(0.99),
    }
    for name in ALL_FEATURES
}

print("Empirical bounds computed for all features.")
print("\nExample bounds for key features:")
for example in ('announcements', 'withdrawals', 'rare_ases_avg', 'edit_distance_avg'):
    bounds = EMPIRICAL_BOUNDS.get(example)
    if bounds is None:
        # Feature not available in this dataset
        continue
    print(f"  {example}: [{bounds['min']:.4f}, {bounds['max']:.4f}]")

## 4. Enhanced Transform Functions

In [None]:
def log1p_transform(X, features):
    """
    Return a copy of ``X`` with ``log(1 + x)`` applied to the listed columns.

    Names in ``features`` that are not columns of ``X`` are ignored, and the
    input frame itself is left untouched.
    """
    result = X.copy()
    present = [name for name in features if name in result.columns]
    for name in present:
        result[name] = np.log1p(result[name])
    return result

def inverse_log1p_transform(X, features):
    """
    Return a copy of ``X`` with ``exp(x) - 1`` applied to the listed columns.

    This reverses a log1p transform. Names in ``features`` that are not columns
    of ``X`` are ignored, and the input frame itself is left untouched.
    """
    restored = X.copy()
    present = [name for name in features if name in restored.columns]
    for name in present:
        restored[name] = np.expm1(restored[name])
    return restored

def post_process_synthetic(df_synthetic, empirical_bounds, integer_features, bounded_features):
    """
    Turn raw interpolated samples into values that look like real feature rows.

    Per column, in this order:
    1. Negative values are raised to 0.
    2. Integer features are rounded to the nearest whole number.
    3. Columns with an entry in ``empirical_bounds`` are clipped to the observed
       range: ``[min, max]`` for ``bounded_features`` and ``[min, max * 1.1]``
       (10% overshoot allowed) for every other column.
    4. Integer features are cast to ``int``.

    For integer features the clipping limits are snapped inwards to whole
    numbers, so clipping can never re-introduce a fractional value such as
    ``max * 1.1`` and the column keeps an integer dtype.

    Args:
        df_synthetic: DataFrame of synthetic samples (not modified).
        empirical_bounds: mapping ``column -> {'min': ..., 'max': ...}``; other
            keys in the inner dict are ignored here.
        integer_features: names of columns that must hold whole numbers; names
            not present in ``df_synthetic`` are ignored.
        bounded_features: names of columns that must not exceed the observed
            maximum at all.

    Returns:
        A new DataFrame with the same columns as ``df_synthetic``.
    """
    df_processed = df_synthetic.copy()
    integer_cols = set(integer_features)
    strict_cols = set(bounded_features)

    for col in df_processed.columns:
        is_integer = col in integer_cols

        # Ensure non-negativity first
        values = np.maximum(df_processed[col], 0)

        if is_integer:
            values = np.round(values)

        if col in empirical_bounds:
            lower = empirical_bounds[col]['min']
            upper = empirical_bounds[col]['max']
            if col not in strict_cols:
                upper = upper * 1.1  # Allow 10% overshoot
            if is_integer:
                # Keep the limits themselves integral so clipped values stay whole.
                lower, upper = np.ceil(lower), np.floor(upper)
            values = np.clip(values, lower, upper)

        if is_integer:
            # Cast last: clipping against float limits would otherwise turn the
            # column back into floats.
            values = values.astype(int)

        df_processed[col] = values

    return df_processed

print("Transform functions defined.")

## 5. Cluster-Based SMOTE Generation

In [None]:
def cluster_based_smote(X, n_clusters=5, target_samples=None, k_neighbors=5, random_state=42):
    """
    Generate synthetic rows by running SMOTE separately inside KMeans clusters.

    1. Standardize ``X`` and cluster it with KMeans.
    2. Within each cluster, interpolate new rows between a sample and its
       nearest neighbours from the same cluster (SMOTE).
    3. Concatenate the synthetic rows of all clusters.

    This preserves local structure better than global SMOTE.

    Args:
        X: DataFrame of numeric features (one row per sample).
        n_clusters: number of KMeans clusters; capped at ``len(X)``.
        target_samples: desired total of original + synthetic rows.
            Defaults to ``2 * len(X)``.
        k_neighbors: neighbours used by SMOTE. Clusters with fewer than
            ``k_neighbors + 1`` rows are skipped.
        random_state: seed for KMeans; SMOTE uses ``random_state + cluster_id``.

    Returns:
        DataFrame containing only the synthetic rows, with the columns of ``X``.
        Each cluster receives a share of the new rows proportional to its size,
        truncated to an integer, so the total can be slightly below
        ``target_samples - len(X)`` (also when clusters are skipped). The frame
        is empty when nothing could be generated.
    """
    if target_samples is None:
        target_samples = len(X) * 2  # Default: double the data

    total_original = len(X)
    new_samples_needed = target_samples - total_original
    if total_original == 0 or new_samples_needed <= 0:
        # Nothing to learn from, or nothing requested.
        return pd.DataFrame(columns=X.columns)

    # Standardize for clustering only; SMOTE below works on the unscaled values.
    X_scaled = StandardScaler().fit_transform(X)

    # KMeans cannot form more clusters than there are rows.
    n_clusters = min(n_clusters, total_original)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    clusters = kmeans.fit_predict(X_scaled)

    synthetic_dfs = []

    for cluster_id in range(n_clusters):
        X_cluster = X[clusters == cluster_id]
        n_cluster = len(X_cluster)

        if n_cluster < k_neighbors + 1:
            print(f"  Cluster {cluster_id}: Too few samples ({n_cluster}), skipping SMOTE")
            continue

        # Calculate target for this cluster (proportional to its size)
        cluster_proportion = n_cluster / total_original
        cluster_target = int(new_samples_needed * cluster_proportion)

        if cluster_target <= 0:
            continue

        # SMOTE only interpolates within the class it is told to over-sample and
        # needs at least k_neighbors + 1 rows of that class. The whole cluster is
        # therefore labelled as the over-sampled class (1). imbalanced-learn
        # rejects single-class targets, so one placeholder row is appended as
        # class 0; it is never used for interpolation.
        X_fit = pd.concat([X_cluster, X_cluster.iloc[:1]], ignore_index=True)
        y_fit = np.array([1] * n_cluster + [0])

        try:
            smote = SMOTE(
                # Requested size of class 1 after resampling. It exceeds the
                # placeholder "majority", which imbalanced-learn only warns about.
                sampling_strategy={1: n_cluster + cluster_target},
                k_neighbors=k_neighbors,  # n_cluster >= k_neighbors + 1 is guaranteed above
                random_state=random_state + cluster_id
            )

            X_resampled, _ = smote.fit_resample(X_fit, y_fit)

            # SMOTE returns the input rows first and appends the generated rows
            # after them, so everything past the inputs is synthetic.
            X_synthetic = pd.DataFrame(X_resampled, columns=X.columns).iloc[len(X_fit):]

            synthetic_dfs.append(X_synthetic)
            print(f"  Cluster {cluster_id}: Generated {len(X_synthetic)} samples from {n_cluster} original")

        except Exception as e:
            # Best effort: a failing cluster is reported and skipped so the
            # remaining clusters can still contribute samples.
            print(f"  Cluster {cluster_id}: Error - {str(e)}")
            continue

    if synthetic_dfs:
        return pd.concat(synthetic_dfs, ignore_index=True)
    else:
        return pd.DataFrame(columns=X.columns)

print("Cluster-based SMOTE function defined.")

## 6. Quality Validation Functions

In [None]:
def validate_synthetic_quality(X_original, X_synthetic, feature_names=None, show_plots=True):
    """
    Compare synthetic samples against the original samples, feature by feature.

    Args:
        X_original: DataFrame with the real samples.
        X_synthetic: DataFrame with the generated samples.
        feature_names: features to compare; defaults to all columns of
            ``X_original``. Features missing from either frame are skipped in
            the per-feature metrics.
        show_plots: when True, draw overlaid histograms for up to four key
            features.

    Returns:
        dict with
        - ``ks_statistics``: ``{feature: {'statistic', 'p_value'}}`` from the
          two-sample Kolmogorov-Smirnov test,
        - ``avg_ks_statistic``: mean KS statistic (0 when nothing was compared),
        - ``mean_differences`` / ``std_differences``: absolute difference
          relative to the magnitude of the original value (always >= 0),
        - ``correlation_diff``: mean absolute difference between the two
          correlation matrices, or None when it could not be computed.
    """
    if feature_names is None:
        feature_names = X_original.columns.tolist()

    results = {
        'ks_statistics': {},
        'mean_differences': {},
        'std_differences': {},
        'correlation_diff': None
    }

    # Only features present in both frames can be compared.
    shared_features = [
        feat for feat in feature_names
        if feat in X_original.columns and feat in X_synthetic.columns
    ]

    # 1. KS test and 2. mean / std differences for each feature
    ks_stats = []
    for feat in shared_features:
        stat, p_value = ks_2samp(X_original[feat], X_synthetic[feat])
        results['ks_statistics'][feat] = {'statistic': stat, 'p_value': p_value}
        ks_stats.append(stat)

        orig_mean = X_original[feat].mean()
        syn_mean = X_synthetic[feat].mean()
        # Divide by the magnitude of the original mean: a signed denominator
        # would report a negative "difference" for features with negative mean.
        results['mean_differences'][feat] = abs(orig_mean - syn_mean) / (abs(orig_mean) + 1e-10)

        orig_std = X_original[feat].std()
        syn_std = X_synthetic[feat].std()
        results['std_differences'][feat] = abs(orig_std - syn_std) / (abs(orig_std) + 1e-10)

    results['avg_ks_statistic'] = np.mean(ks_stats) if ks_stats else 0

    # 3. Correlation matrix comparison (best effort: stays None on failure,
    #    e.g. when a requested feature is missing from one of the frames)
    try:
        corr_orig = X_original[feature_names].corr()
        corr_syn = X_synthetic[feature_names].corr()
        results['correlation_diff'] = np.abs(corr_orig - corr_syn).mean().mean()
    except Exception:
        results['correlation_diff'] = None

    # 4. Print summary
    print("\n=== Synthetic Data Quality Summary ===")
    print(f"Average KS Statistic: {results['avg_ks_statistic']:.4f} (lower is better, <0.1 is good)")
    print(f"Average Mean Difference: {np.mean(list(results['mean_differences'].values())):.4f}")
    print(f"Average Std Difference: {np.mean(list(results['std_differences'].values())):.4f}")
    if results['correlation_diff'] is not None:
        print(f"Correlation Preservation Error: {results['correlation_diff']:.4f} (lower is better)")

    # 5. Visualization
    if show_plots:
        # Plot distributions for key features that exist in both frames
        key_features = ['announcements', 'withdrawals', 'edit_distance_avg', 'rare_ases_avg']
        key_features = [f for f in key_features if f in shared_features]

        if key_features:
            fig, axes = plt.subplots(2, 2, figsize=(14, 10))
            axes = axes.flatten()

            for i, feat in enumerate(key_features[:4]):
                ax = axes[i]
                ax.hist(X_original[feat], bins=50, alpha=0.5, label='Original', density=True)
                ax.hist(X_synthetic[feat], bins=50, alpha=0.5, label='Synthetic', density=True)
                ax.set_title(f'{feat}\nKS={results["ks_statistics"].get(feat, {}).get("statistic", 0):.3f}')
                ax.legend()

            # Hide panels that have no feature to show
            for unused_ax in axes[len(key_features[:4]):]:
                unused_ax.set_visible(False)

            plt.tight_layout()
            plt.suptitle('Distribution Comparison: Original vs Synthetic Anomaly Data', y=1.02)
            plt.show()

    return results

print("Quality validation function defined.")

## 7. Generate Synthetic Anomaly Data

In [None]:
# Generation settings
TARGET_MULTIPLIER = 3  # final size (original + synthetic) as a multiple of the original
N_CLUSTERS = 8         # clusters used by cluster-based SMOTE
K_NEIGHBORS = 5        # neighbours used by SMOTE
RANDOM_STATE = 42

n_original = len(X_anomaly)
target_samples = n_original * TARGET_MULTIPLIER

print(f"Original high confidence anomaly samples: {n_original}")
print(f"Target total samples: {target_samples}")
print(f"New samples to generate: {target_samples - n_original}")

In [None]:
# Step 1: compress the heavy-tailed count features with log1p before interpolating
print("Step 1: Applying log1p transform to heavy-tailed features...")
X_transformed = log1p_transform(X=X_anomaly, features=HEAVY_TAILED_FEATURES)
print(f"Transformed features: {HEAVY_TAILED_FEATURES}")

In [None]:
# Step 2: run cluster-based SMOTE on the transformed features
print("\nStep 2: Generating synthetic data using cluster-based SMOTE...")
smote_settings = {
    'n_clusters': N_CLUSTERS,
    'target_samples': target_samples,
    'k_neighbors': K_NEIGHBORS,
    'random_state': RANDOM_STATE,
}
X_synthetic_transformed = cluster_based_smote(X_transformed, **smote_settings)

n_generated = len(X_synthetic_transformed)
print(f"\nGenerated {n_generated} synthetic samples")

In [None]:
# Step 3: map the log-transformed columns back to their original scale
print("\nStep 3: Applying inverse log1p transform...")
X_synthetic_raw = inverse_log1p_transform(
    X=X_synthetic_transformed,
    features=HEAVY_TAILED_FEATURES,
)

In [None]:
# Step 4: enforce non-negativity, integer rounding and empirical bounds
print("\nStep 4: Post-processing synthetic data...")
X_synthetic_final = post_process_synthetic(
    df_synthetic=X_synthetic_raw,
    empirical_bounds=EMPIRICAL_BOUNDS,
    integer_features=INTEGER_FEATURES,
    bounded_features=BOUNDED_RATIO_FEATURES,
)

n_final = len(X_synthetic_final)
print(f"Final synthetic samples: {n_final}")

## 8. Validate Synthetic Data Quality

In [None]:
# Compare the post-processed synthetic rows against the real anomalies
# (positional arguments: original, synthetic, feature_names, show_plots)
validation_results = validate_synthetic_quality(X_anomaly, X_synthetic_final, ALL_FEATURES, True)

In [None]:
# Detailed feature comparison: per-feature mean, std and KS statistic side by side
print("\n=== Detailed Feature Statistics Comparison ===")
comparison_df = pd.DataFrame({
    'Feature': ALL_FEATURES,
    'Original_Mean': [X_anomaly[f].mean() for f in ALL_FEATURES],
    'Synthetic_Mean': [X_synthetic_final[f].mean() for f in ALL_FEATURES],
    'Original_Std': [X_anomaly[f].std() for f in ALL_FEATURES],
    'Synthetic_Std': [X_synthetic_final[f].std() for f in ALL_FEATURES],
    'KS_Statistic': [validation_results['ks_statistics'].get(f, {}).get('statistic', 0) for f in ALL_FEATURES]
})

# Relative differences in percent. The denominator is the magnitude of the
# original statistic (plus a tiny constant against division by zero), so a
# negative original mean cannot flip the sign of the reported difference.
comparison_df['Mean_Diff_%'] = (
    (comparison_df['Original_Mean'] - comparison_df['Synthetic_Mean']).abs()
    / (comparison_df['Original_Mean'].abs() + 1e-10) * 100
)
comparison_df['Std_Diff_%'] = (
    (comparison_df['Original_Std'] - comparison_df['Synthetic_Std']).abs()
    / (comparison_df['Original_Std'].abs() + 1e-10) * 100
)

print(comparison_df.to_string(index=False))

## 9. Additional Quality Checks

In [None]:
# Check for any negative values
print("Checking for negative values...")
negative_counts = (X_synthetic_final < 0).sum()
if negative_counts.any():
    print(f"WARNING: Negative values found in: {negative_counts[negative_counts > 0].to_dict()}")
else:
    print("✓ No negative values found")

# Check integer features are actually integers
print("\nChecking integer feature types...")
features_with_fractions = []
for feat in INTEGER_FEATURES:
    if feat in X_synthetic_final.columns:
        # A whole number has no remainder modulo 1. NaN and inf fail this test
        # too, so they are reported instead of raising during an int cast.
        non_int = int((X_synthetic_final[feat] % 1 != 0).sum())
        if non_int > 0:
            features_with_fractions.append(feat)
            print(f"  WARNING: {feat} has {non_int} non-integer values")

# Only claim success when no feature was flagged above
if features_with_fractions:
    print(f"✗ {len(features_with_fractions)} integer feature(s) contain non-integer values")
else:
    print("✓ Integer features validated")

In [None]:
# Side-by-side correlation heatmaps of the core features: original vs synthetic
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

corr_orig = X_anomaly[CORE_VALIDATION_FEATURES].corr()
corr_syn = X_synthetic_final[CORE_VALIDATION_FEATURES].corr()

panels = (
    (axes[0], corr_orig, 'Original Anomaly Data Correlations'),
    (axes[1], corr_syn, 'Synthetic Anomaly Data Correlations'),
)
for panel_ax, corr_matrix, panel_title in panels:
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=panel_ax, fmt='.2f')
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 10. Combine and Export Data

In [None]:
# Build one labelled table containing the real and the synthetic anomalies
print("Creating combined dataset...")

# Real rows: tag the source and carry over the original confidence label
df_original_labeled = X_anomaly.assign(data_source='original', label='anomaly')
if 'confidence_label' in df_high_confidence.columns:
    # .values: copy by position, since X_anomaly was taken row-for-row from df_high_confidence
    df_original_labeled['confidence_label'] = df_high_confidence['confidence_label'].values

# Synthetic rows: tag the source and use a dedicated confidence label
df_synthetic_labeled = X_synthetic_final.assign(
    data_source='synthetic',
    label='anomaly',
    confidence_label='synthetic_anomaly',
)

# Stack real rows first, synthetic rows after, with a fresh index
df_combined = pd.concat([df_original_labeled, df_synthetic_labeled], ignore_index=True)

print(f"\nCombined dataset shape: {df_combined.shape}")
print(f"Data source distribution:\n{df_combined['data_source'].value_counts()}")

In [None]:
# Export the generated data as CSV files
import os

# Destination directory (created on demand) and the two output files
OUTPUT_DIR = '/home/smotaali/BGP_Traffic_Generation/results/synthetic_anomaly_data'
os.makedirs(OUTPUT_DIR, exist_ok=True)
synthetic_only_path = os.path.join(OUTPUT_DIR, 'synthetic_anomaly_data.csv')
combined_path = os.path.join(OUTPUT_DIR, 'combined_anomaly_data.csv')

# Synthetic rows only
X_synthetic_final.to_csv(synthetic_only_path, index=False)
print(f"Synthetic data saved to: {synthetic_only_path}")

# Original + synthetic rows
df_combined.to_csv(combined_path, index=False)
print(f"Combined data saved to: {combined_path}")

n_original_rows = len(X_anomaly)
n_synthetic_rows = len(X_synthetic_final)
n_combined_rows = len(df_combined)
print(f"\n=== Export Complete ===")
print(f"Original samples: {n_original_rows}")
print(f"Synthetic samples: {n_synthetic_rows}")
print(f"Total combined: {n_combined_rows}")

## 11. Summary Statistics

In [None]:
# Final summary: collect the report line by line, then print it in one go
banner = "=" * 60
report_lines = [
    banner,
    "SYNTHETIC ANOMALY DATA GENERATION SUMMARY",
    banner,
    f"\nInput Data:",
    f"  - Source file: {DATA_PATH}",
    f"  - Confidence labels used: {HIGH_CONFIDENCE_LABELS}",
    f"  - Original high confidence samples: {len(X_anomaly)}",
    f"\nGeneration Parameters:",
    f"  - Target multiplier: {TARGET_MULTIPLIER}x",
    f"  - Number of clusters: {N_CLUSTERS}",
    f"  - K neighbors: {K_NEIGHBORS}",
    f"\nOutput:",
    f"  - Synthetic samples generated: {len(X_synthetic_final)}",
    f"  - Total combined samples: {len(df_combined)}",
    f"\nQuality Metrics:",
    f"  - Average KS Statistic: {validation_results['avg_ks_statistic']:.4f}",
]

# The correlation metric is only reported when it could be computed
if validation_results['correlation_diff'] is not None:
    report_lines.append(f"  - Correlation Preservation Error: {validation_results['correlation_diff']:.4f}")

report_lines.extend([
    f"\nExport Locations:",
    f"  - Synthetic only: {synthetic_only_path}",
    f"  - Combined data: {combined_path}",
    banner,
])

print("\n".join(report_lines))

In [None]:
# Preview the first ten generated rows
print("\nSample of Generated Synthetic Anomaly Data:")
X_synthetic_final.iloc[:10]