# BGP Anomaly Detection with Normal Baseline

## The Correct Approach for Anomaly Detection

**Problem with previous approach:**
- Unsupervised methods find outliers *within* the provided data
- If you only provide attack data, they find "unusual attacks" not "attacks vs normal"

**Correct approach:**
1. Train models on **NORMAL traffic** (learn what normal looks like)
2. Score **NEW/UNKNOWN data** against the normal baseline
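3. Scores **below** the threshold derived from normal data = anomalies (the sample deviates from normal; all scores here use the "higher = more normal" convention)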

---

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.covariance import EllipticEnvelope
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

# Apply a seaborn-like white-grid theme to every matplotlib figure drawn below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# matplotlib >= 3.6 — confirm against the target environment.
plt.style.use('seaborn-v0_8-whitegrid')
# Visible confirmation that the import cell ran to completion.
print("Imports successful!")

In [None]:
# ---------------------------------------------------------------
# Notebook settings: edit the values in this cell before running
# ---------------------------------------------------------------

# CSV with feature rows of NORMAL traffic; the baseline is trained on it.
NORMAL_FILE = "/path/to/normal_traffic_features.csv"  # <-- YOUR NORMAL DATA

# CSV with the feature rows to score (normal, attack, or unknown traffic).
TEST_FILE = "/path/to/test_features.csv"  # <-- DATA TO CLASSIFY

# Where outputs should go; None means "next to the test file".
OUTPUT_DIR = None

# Name of the ground-truth column in the test file, if it has one.
LABEL_COL = 'label'

# Percentile of the normal-data scores used as the anomaly cut-off.
# 95 => a sample is flagged when it scores worse than 95% of normal data.
ANOMALY_THRESHOLD_PERCENTILE = 95

# Seed handed to the stochastic estimators for reproducibility.
RANDOM_STATE = 42

# Whether figures should be written to disk.
SAVE_PLOTS = True

# Echo the two input paths so a wrong path is spotted before loading starts.
print("Configuration set")
for _caption, _path in (("Normal file", NORMAL_FILE), ("Test file", TEST_FILE)):
    print(f"  {_caption}: {_path}")

## 1. Load Data

In [None]:
# Read the baseline table: rows assumed to be normal traffic, used for training.
print("Loading NORMAL data (for baseline training)...")
df_normal = pd.read_csv(NORMAL_FILE)
print(f"  Loaded {len(df_normal)} normal samples")

# Read the table whose rows will be scored against the baseline.
print("\nLoading TEST data (to classify)...")
df_test = pd.read_csv(TEST_FILE)
print(f"  Loaded {len(df_test)} test samples")

# When the test file carries a ground-truth column, summarise its class balance.
has_labels = bool(LABEL_COL) and LABEL_COL in df_test.columns
if has_labels:
    print("\nTest data labels (ground truth):")
    label_counts = df_test[LABEL_COL].value_counts()
    for label, count in label_counts.items():
        share = count / len(df_test) * 100
        print(f"  {label}: {count} ({share:.1f}%)")

In [None]:
# Identify feature columns - exclude metadata and previous analysis columns
META_COLS = {
    # Metadata columns
    'incident', 'window_start', 'window_end', 'timestamp', 'time',
    'label', 'label_rule', 'label_refined', 'source', 'collector',
    # Previous analysis columns (from BGP_Label_Validation_Discovery.ipynb)
    'iso_forest_score', 'lof_score', 'statistical_score', 'elliptic_score',
    'cluster', 'anomaly_votes', 'consensus_score', 'discovered_label',
    # Any other score columns
    'hdbscan_score', 'predicted_label'
}

def get_feature_cols(df):
    """Return the names of the numeric feature columns of *df*.

    A column is kept when its name, compared case-insensitively, is not
    listed in ``META_COLS`` and its dtype is numeric.

    Args:
        df: DataFrame holding feature, metadata and analysis columns.

    Returns:
        List of column labels, in the order they appear in *df*.
    """
    # Build the lowercase exclusion set once per call instead of once per column.
    excluded = {str(m).lower() for m in META_COLS}
    # str() keeps non-string column labels (e.g. integer headers) from raising.
    candidates = [c for c in df.columns if str(c).lower() not in excluded]
    return df[candidates].select_dtypes(include=[np.number]).columns.tolist()

# Restrict both datasets to the features they share, so the scaler and the
# models fitted on normal data can be applied to the test data column-for-column.
normal_features = set(get_feature_cols(df_normal))
test_features = set(get_feature_cols(df_test))

# Sorted so the column order is deterministic across runs.
feature_cols = sorted(normal_features & test_features)

print(f"Normal dataset has {len(normal_features)} features")
print(f"Test dataset has {len(test_features)} features")
print(f"Using {len(feature_cols)} common features")
print(f"\nFeatures: {feature_cols}")

# Fail early with a clear message instead of an obscure scaler error later.
if not feature_cols:
    raise ValueError(
        "No common numeric feature columns between the normal and test datasets"
    )

# Prepare normal data. Rows containing NaN *or* +/-inf are dropped: the scaler
# and the estimators reject infinite values just as they reject NaN.
X_normal = df_normal[feature_cols].values
valid_normal = np.isfinite(X_normal).all(axis=1)
X_normal = X_normal[valid_normal]
print(f"\nNormal samples after removing NaN/inf: {len(X_normal)}")

# Prepare test data. `valid_test` is kept because later cells use it to align
# scores with the original rows of df_test.
X_test = df_test[feature_cols].values
valid_test = np.isfinite(X_test).all(axis=1)
X_test_clean = X_test[valid_test]
print(f"Test samples after removing NaN/inf: {len(X_test_clean)}")

# Fit the scaler on NORMAL data only, so test data is expressed relative to
# the normal baseline and cannot influence it.
scaler = RobustScaler()
X_normal_scaled = scaler.fit_transform(X_normal)
X_test_scaled = scaler.transform(X_test_clean)  # Transform test with normal's scaler

print("\nScaler fitted on NORMAL data and applied to test data")

## 2. Train Models on NORMAL Data

This is the key difference: we train **only on normal data** to learn what normal looks like.

In [None]:
# Fit every detector on the scaled NORMAL data and record the score each one
# assigns to the normal samples. All scores follow "higher = more normal".
print("Training anomaly detection models on NORMAL data...")
print("="*60)

models = {}
normal_scores = {}

# 1. Isolation Forest
print("\n[1/4] Training Isolation Forest...")
models['IsolationForest'] = IsolationForest(
    n_estimators=200,
    contamination=0.01,  # Expect very few anomalies in normal data
    random_state=RANDOM_STATE,
    n_jobs=-1
)
models['IsolationForest'].fit(X_normal_scaled)
normal_scores['IsolationForest'] = models['IsolationForest'].decision_function(X_normal_scaled)
print(f"  Normal score range: [{normal_scores['IsolationForest'].min():.3f}, {normal_scores['IsolationForest'].max():.3f}]")

# 2. Local Outlier Factor (novelty detection mode)
print("\n[2/4] Training Local Outlier Factor...")
# Neighbourhood size grows with sqrt(n) but is clamped to [10, 50].
n_neighbors = max(10, min(50, int(np.sqrt(len(X_normal_scaled)))))
models['LOF'] = LocalOutlierFactor(
    n_neighbors=n_neighbors,
    contamination=0.01,
    novelty=True  # Important: enables scoring new data
)
models['LOF'].fit(X_normal_scaled)
# In novelty mode decision_function() is meant for NEW data only: applied to a
# training point it counts the point as its own neighbour and looks too normal.
# The unbiased training scores are negative_outlier_factor_; subtracting
# offset_ puts them on the same scale decision_function() uses for test data.
normal_scores['LOF'] = models['LOF'].negative_outlier_factor_ - models['LOF'].offset_
print(f"  Normal score range: [{normal_scores['LOF'].min():.3f}, {normal_scores['LOF'].max():.3f}]")

# 3. Elliptic Envelope (best effort: the covariance fit can fail, in which
# case the detector is recorded as None and skipped by the scoring cell)
print("\n[3/4] Training Elliptic Envelope...")
try:
    models['EllipticEnvelope'] = EllipticEnvelope(
        contamination=0.01,
        random_state=RANDOM_STATE
    )
    models['EllipticEnvelope'].fit(X_normal_scaled)
    normal_scores['EllipticEnvelope'] = models['EllipticEnvelope'].decision_function(X_normal_scaled)
    print(f"  Normal score range: [{normal_scores['EllipticEnvelope'].min():.3f}, {normal_scores['EllipticEnvelope'].max():.3f}]")
except Exception as e:
    print(f"  Skipped due to error: {e}")
    models['EllipticEnvelope'] = None

# 4. Statistical baseline (mean and std of normal data)
print("\n[4/4] Computing Statistical baseline...")
models['Statistical'] = {
    'mean': np.mean(X_normal_scaled, axis=0),
    'std': np.std(X_normal_scaled, axis=0)
}
# Mean absolute z-score per sample, negated so that higher = more normal.
# Features are treated independently (this is not a Mahalanobis distance);
# the 1e-10 term avoids division by zero for constant features.
normal_scores['Statistical'] = -np.mean(
    np.abs((X_normal_scaled - models['Statistical']['mean']) / (models['Statistical']['std'] + 1e-10)),
    axis=1
)
print(f"  Normal score range: [{normal_scores['Statistical'].min():.3f}, {normal_scores['Statistical'].max():.3f}]")

print("\n" + "="*60)
print("All models trained on NORMAL data!")

In [None]:
# Derive one anomaly cut-off per detector from the scores of the normal data.
print(f"Computing anomaly thresholds ({ANOMALY_THRESHOLD_PERCENTILE}th percentile of normal scores)...")

# Scores follow "higher = more normal, lower = more anomalous", so the cut-off
# sits in the LOWER tail: e.g. a setting of 95 uses the 5th percentile.
lower_tail_pct = 100 - ANOMALY_THRESHOLD_PERCENTILE

thresholds = {}
for method in normal_scores:
    scores = normal_scores[method]
    cutoff = np.percentile(scores, lower_tail_pct)
    thresholds[method] = cutoff
    print(f"  {method}: threshold = {cutoff:.4f}")

print("\nSamples scoring BELOW these thresholds will be flagged as anomalies")

## 3. Score Test Data Against Normal Baseline

In [None]:
# Score every test sample with each fitted detector and flag the samples that
# fall below that detector's threshold.
print("Scoring TEST data against NORMAL baseline...")
print("="*60)

test_scores = {}
test_anomalies = {}


def _report_flags(name):
    """Print how many test samples detector *name* flagged."""
    flagged = test_anomalies[name]
    print(f"  Anomalies: {flagged.sum()} ({flagged.mean()*100:.1f}%)")


def _score_fitted_model(name):
    """Score the test set with the fitted estimator stored under *name*."""
    test_scores[name] = models[name].decision_function(X_test_scaled)
    test_anomalies[name] = test_scores[name] < thresholds[name]
    _report_flags(name)


# 1. Isolation Forest
print("\n[1/4] Scoring with Isolation Forest...")
_score_fitted_model('IsolationForest')

# 2. LOF
print("\n[2/4] Scoring with LOF...")
_score_fitted_model('LOF')

# 3. Elliptic Envelope (absent when its training step was skipped)
if models['EllipticEnvelope'] is not None:
    print("\n[3/4] Scoring with Elliptic Envelope...")
    _score_fitted_model('EllipticEnvelope')

# 4. Statistical: negated mean absolute z-score against the normal mean/std
print("\n[4/4] Scoring with Statistical method...")
baseline = models['Statistical']
abs_deviation = np.abs((X_test_scaled - baseline['mean']) / (baseline['std'] + 1e-10))
test_scores['Statistical'] = -np.mean(abs_deviation, axis=1)
test_anomalies['Statistical'] = test_scores['Statistical'] < thresholds['Statistical']
_report_flags('Statistical')

print("\n" + "="*60)

In [None]:
# Combine the per-detector flags into a vote fraction and a 4-level label.
print("Computing consensus across methods...")

n_methods = len(test_anomalies)
anomaly_votes = np.zeros(len(X_test_scaled))

# One vote per detector that flagged the sample.
for method, anomalies in test_anomalies.items():
    anomaly_votes += anomalies.astype(int)

# Fraction of detectors that flagged each sample (0 when no detector ran).
consensus_score = anomaly_votes / n_methods if n_methods else anomaly_votes

# Assign labels: >=75% of detectors agree -> anomaly, >=50% -> likely_anomaly,
# >=25% -> uncertain, otherwise normal.
predicted_labels = np.where(
    consensus_score >= 0.75, 'anomaly',
    np.where(
        consensus_score >= 0.5, 'likely_anomaly',
        np.where(
            consensus_score >= 0.25, 'uncertain',
            'normal'
        )
    )
)

# Status marker per label (the original literals were mis-encoded emoji).
label_markers = {
    'normal': "🟢",
    'uncertain': "🟡",
    'likely_anomaly': "🔴",
    'anomaly': "🔴",
}

print("\nPredicted Label Distribution:")
print("="*50)
n_predicted = len(predicted_labels)
for label in ['normal', 'uncertain', 'likely_anomaly', 'anomaly']:
    count = (predicted_labels == label).sum()
    # Guard the percentage against an empty test set.
    pct = count / n_predicted * 100 if n_predicted else 0.0
    emoji = label_markers[label]
    print(f"  {emoji} {label}: {count} ({pct:.1f}%)")

## 4. Evaluate Against Ground Truth (if available)

In [None]:
# Compare the consensus prediction with the ground-truth labels, when present.
if LABEL_COL and LABEL_COL in df_test.columns:
    print("Evaluating against ground truth labels...")
    print("="*60)

    # Ground truth restricted to the rows that were actually scored
    ground_truth = df_test.loc[valid_test, LABEL_COL].values

    # Binary ground truth: normal vs not normal. The match ignores case and
    # surrounding whitespace; every other value (including missing labels)
    # counts as an attack.
    gt_normalized = pd.Series(ground_truth).astype(str).str.strip().str.lower()
    is_normal_gt = (gt_normalized == 'normal').to_numpy()
    is_attack_gt = ~is_normal_gt
    is_attack_pred = np.isin(predicted_labels, ['anomaly', 'likely_anomaly'])

    # Confusion-matrix cells
    TP = np.sum(is_attack_gt & is_attack_pred)
    TN = np.sum(~is_attack_gt & ~is_attack_pred)
    FP = np.sum(~is_attack_gt & is_attack_pred)
    FN = np.sum(is_attack_gt & ~is_attack_pred)

    # Every ratio falls back to 0 when its denominator is empty.
    n_evaluated = len(ground_truth)
    accuracy = (TP + TN) / n_evaluated if n_evaluated > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    print(f"\n📊 DETECTION PERFORMANCE:")
    print(f"  Accuracy:  {accuracy*100:.1f}%")
    print(f"  Precision: {precision*100:.1f}%")
    print(f"  Recall:    {recall*100:.1f}%")
    print(f"  F1-Score:  {f1*100:.1f}%")

    print(f"\n📋 CONFUSION MATRIX:")
    print(f"  True Positives (attacks detected):  {TP}")
    print(f"  True Negatives (normal correct):    {TN}")
    print(f"  False Positives (false alarms):     {FP}")
    print(f"  False Negatives (missed attacks):   {FN}")

    # Per-class rates, over the labels present among the scored rows only.
    print(f"\n📈 PER-CLASS DETECTION RATES:")
    for label in pd.unique(ground_truth):
        mask = ground_truth == label
        n_label = mask.sum()
        if n_label == 0:
            # Missing labels (NaN) never compare equal to themselves.
            continue
        flagged = is_attack_pred[mask].sum()
        rate = flagged / n_label * 100
        if str(label).strip().lower() == 'normal':
            # For the normal class a flagged row is a FALSE ALARM, so a low
            # rate is the good outcome (mirror image of the attack bands).
            emoji = "✅" if rate < 20 else "⚠️" if rate < 50 else "❌"
            print(f"  {emoji} {label}: {flagged}/{n_label} falsely flagged ({rate:.1f}%)")
        else:
            emoji = "✅" if rate > 80 else "⚠️" if rate > 50 else "❌"
            print(f"  {emoji} {label}: {flagged}/{n_label} detected ({rate:.1f}%)")
else:
    print("No ground truth labels available for evaluation.")

In [None]:
# ============================================
# DIAGNOSTIC: Compare Normal vs Attack Feature Distributions
# ============================================
# NOTE: "Attack" here means whatever TEST_FILE contains; the comparison is
# only meaningful as normal-vs-attack when the test file holds attack traffic.
print("DIAGNOSTIC: Comparing feature distributions...")
print("="*60)

# Summary statistics of the scaled test and normal data
attack_stats = pd.DataFrame(X_test_scaled, columns=feature_cols).describe()
normal_stats = pd.DataFrame(X_normal_scaled, columns=feature_cols).describe()

# Compare means feature by feature
print("\n📊 FEATURE COMPARISON (Normal vs Attack):")
print("-"*60)
print(f"{'Feature':<30} {'Normal Mean':>12} {'Attack Mean':>12} {'Difference':>12}")
print("-"*60)

significant_diffs = []
for col in feature_cols:
    n_mean = normal_stats.loc['mean', col]
    a_mean = attack_stats.loc['mean', col]
    diff = abs(a_mean - n_mean)
    significant_diffs.append((col, n_mean, a_mean, diff))

# Largest absolute mean difference first
significant_diffs.sort(key=lambda x: x[3], reverse=True)

# Show the 15 most different features
for col, n_mean, a_mean, diff in significant_diffs[:15]:
    indicator = "⚠️" if diff > 0.5 else "  "
    print(f"{indicator} {col:<28} {n_mean:>12.3f} {a_mean:>12.3f} {diff:>12.3f}")

# The data was scaled with RobustScaler, so one unit is one interquartile
# range of the normal data, not one standard deviation.
print("\n⚠️ = Features with significant difference (>0.5 normal-data IQR)")
print("\nIf most differences are small, normal and attack data are SIMILAR!")

In [None]:
# ============================================
# DIAGNOSTIC: Visualize Score Overlap
# ============================================
# One panel per detector: score histogram of the normal baseline vs the test
# data, with the anomaly threshold marked.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# At most one detector per available panel.
plotted_methods = list(test_scores.items())[:len(axes)]

for ax, (method, test_sc) in zip(axes, plotted_methods):
    normal_sc = normal_scores[method]
    threshold = thresholds[method]

    # Plot distributions
    ax.hist(normal_sc, bins=50, alpha=0.6, label=f'Normal baseline (n={len(normal_sc)})', color='green', density=True)
    ax.hist(test_sc, bins=50, alpha=0.6, label=f'Attack data (n={len(test_sc)})', color='red', density=True)
    ax.axvline(threshold, color='black', linestyle='--', linewidth=2, label=f'Threshold')

    # Share of test samples at or above the threshold, i.e. NOT flagged
    overlap_pct = (test_sc >= threshold).mean() * 100
    ax.set_title(f'{method}\nAttacks ABOVE threshold (looks normal): {overlap_pct:.1f}%')
    ax.set_xlabel('Score (higher = more normal)')
    ax.legend()

# Hide panels left empty when a detector was skipped (e.g. Elliptic Envelope).
for unused_ax in axes[len(plotted_methods):]:
    unused_ax.set_visible(False)

plt.suptitle('PROBLEM: Attack scores overlap with Normal scores!', fontsize=14, fontweight='bold')
plt.tight_layout()

# Honor the configuration: save only when SAVE_PLOTS is set, into OUTPUT_DIR
# when one is given (otherwise the current working directory, as before).
if SAVE_PLOTS:
    plot_dir = Path(OUTPUT_DIR) if OUTPUT_DIR else Path('.')
    plot_dir.mkdir(parents=True, exist_ok=True)
    plt.savefig(plot_dir / 'diagnostic_score_overlap.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n🔴 If red (attack) distribution overlaps with green (normal), the data is too similar!")

## ⚠️ Diagnostic Results Interpretation

If attack detection is very low (~1%), there are **two possible explanations**:

### 1. Your "Normal" Data Contains Anomalies
- You mentioned the RIPE data is "assumed normal" without ground truth
- If your baseline already contains attack-like patterns, the model learns them as normal
- **Solution**: Clean the baseline or use known-good normal data

### 2. The BGP Features Don't Distinguish Attacks
- Your 27 features might not capture what makes attacks different
- The attack traffic might have similar aggregate statistics to normal traffic
- **Solution**: Engineer better features or use different detection approaches

---

## 🔬 Let's Test: Check if Normal Data is Clean

Run the cell below to analyze your "normal" baseline for potential contamination:

In [None]:
# ============================================
# CHECK: Is the Normal Baseline Clean?
# ============================================
# Heuristic screening of the scaled normal data: variance, IQR outliers and
# skewness. None of these proves contamination; they only point at features
# worth inspecting.
print("Analyzing NORMAL baseline for potential contamination...")
print("="*60)

# Check variance in normal data
normal_df = pd.DataFrame(X_normal_scaled, columns=feature_cols)

# High variance features might indicate contamination
variances = normal_df.var().sort_values(ascending=False)
print("\n📊 Top 10 highest variance features in 'normal' data:")
for feat, var in variances.head(10).items():
    warning = "⚠️ HIGH" if var > 2 else ""
    print(f"  {feat}: {var:.3f} {warning}")

# Check for outliers within normal data using the 1.5*IQR fence rule,
# computed for all columns at once.
print("\n📊 Outlier check within 'normal' data (IQR method):")
q1 = normal_df.quantile(0.25)
q3 = normal_df.quantile(0.75)
iqr = q3 - q1
outlier_mask = (normal_df < q1 - 1.5*iqr) | (normal_df > q3 + 1.5*iqr)
outlier_counts = outlier_mask.sum().to_dict()

total_outliers = sum(outlier_counts.values())
# Guard the rate against an empty baseline or an empty feature list.
n_cells = len(normal_df) * len(feature_cols)
avg_outlier_pct = (total_outliers / n_cells) * 100 if n_cells else 0.0
print(f"  Average outlier rate: {avg_outlier_pct:.2f}%")

if avg_outlier_pct > 5:
    print("  ⚠️ HIGH outlier rate suggests normal data may be contaminated!")
else:
    print("  ✅ Outlier rate is reasonable")

# Check skewness (absolute value, largest first)
print("\n📊 Feature skewness (high skew = potential contamination):")
skews = normal_df.skew().abs().sort_values(ascending=False)
for feat, skew in skews.head(5).items():
    warning = "⚠️" if skew > 2 else ""
    print(f"  {feat}: {skew:.3f} {warning}")

In [None]:
# ============================================
# STATISTICAL TEST: Are Normal and Attack Actually Different?
# ============================================
from scipy.stats import mannwhitneyu

# Per-feature two-sided Mann-Whitney U test between the scaled normal data and
# the scaled test data. p-values are NOT corrected for multiple comparisons.
print("Statistical Test: Are normal and attack distributions different?")
print("="*60)
print("(Mann-Whitney U test, p < 0.05 = significantly different)")
print()

test_results = []
# enumerate() gives the column position directly instead of searching
# feature_cols for every feature.
for col_idx, col in enumerate(feature_cols):
    normal_vals = X_normal_scaled[:, col_idx]
    attack_vals = X_test_scaled[:, col_idx]

    try:
        stat, p_value = mannwhitneyu(normal_vals, attack_vals, alternative='two-sided')
    except Exception as exc:
        # Best effort: a feature that cannot be tested is reported as
        # "not different", but the reason is shown instead of being swallowed.
        print(f"  (test skipped for {col}: {exc})")
        test_results.append((col, 1.0, 0.0))
    else:
        # Effect size here is simply the absolute difference of the means.
        effect_size = abs(normal_vals.mean() - attack_vals.mean())
        test_results.append((col, p_value, effect_size))

# Sort by effect size
test_results.sort(key=lambda x: x[2], reverse=True)

print(f"{'Feature':<30} {'p-value':>12} {'Effect Size':>12} {'Significant?':>12}")
print("-"*70)

significant_count = 0
for col, p, effect in test_results:
    sig = "✅ YES" if p < 0.05 else "❌ NO"
    if p < 0.05:
        significant_count += 1
    print(f"{col:<30} {p:>12.4f} {effect:>12.3f} {sig:>12}")

print()
print(f"📊 SUMMARY: {significant_count}/{len(feature_cols)} features are significantly different")

if significant_count < len(feature_cols) * 0.3:
    print("⚠️ PROBLEM: Most features are NOT different between normal and attack!")
    print("   This means either:")
    print("   1. Normal baseline contains attack-like patterns")
    print("   2. These features don't capture attack characteristics")
elif significant_count > len(feature_cols) * 0.7:
    print("✅ Most features ARE different - detection should work!")
    print("   If detection still fails, try lowering the threshold.")
else:
    # Between 30% and 70% the original printed nothing at all.
    print("⚠️ MIXED: Only some features differ between normal and attack.")
    print("   Detection may work for some attacks; inspect the features listed above.")

In [None]:
# Visualize score distributions: one panel per detector, normal (training)
# scores vs test scores, with the anomaly threshold marked.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# At most one detector per available panel.
plotted_methods = list(test_scores.items())[:len(axes)]

for ax, (method, scores) in zip(axes, plotted_methods):
    # Plot normal scores distribution
    ax.hist(normal_scores[method], bins=50, alpha=0.5, label='Normal (training)', color='green', density=True)

    # Plot test scores distribution
    ax.hist(scores, bins=50, alpha=0.5, label='Test data', color='blue', density=True)

    # Mark threshold
    ax.axvline(x=thresholds[method], color='red', linestyle='--', linewidth=2, label=f'Threshold')

    ax.set_xlabel('Anomaly Score')
    ax.set_ylabel('Density')
    ax.set_title(f'{method}\n(Anomalies: {test_anomalies[method].mean()*100:.1f}%)')
    ax.legend()

# Hide panels left empty when a detector was skipped (e.g. Elliptic Envelope).
for unused_ax in axes[len(plotted_methods):]:
    unused_ax.set_visible(False)

plt.tight_layout()

# Honor the configuration: save only when SAVE_PLOTS is set, into OUTPUT_DIR
# when one is given (otherwise the current working directory, as before).
if SAVE_PLOTS:
    plot_path = (Path(OUTPUT_DIR) if OUTPUT_DIR else Path('.')) / 'score_distributions.png'
    plot_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    print(f"Plot saved to: {plot_path}")
plt.show()

In [None]:
# Save results: the scored test rows plus every detector's score and flag.
# Only rows that passed the validity filter were scored, so only those are kept.
results_df = df_test.loc[valid_test].copy()

# Add per-detector score and boolean anomaly flag
for method, scores in test_scores.items():
    results_df[f'{method}_score'] = scores
    results_df[f'{method}_anomaly'] = test_anomalies[method]

results_df['consensus_score'] = consensus_score
results_df['predicted_label'] = predicted_labels

# Honor OUTPUT_DIR as described in the configuration cell: use it when set,
# otherwise write next to the test file.
output_dir = Path(OUTPUT_DIR) if OUTPUT_DIR else Path(TEST_FILE).parent
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{Path(TEST_FILE).stem}_anomaly_detection.csv"
results_df.to_csv(output_path, index=False)
print(f"Results saved to: {output_path}")

## Summary

This notebook implements the **correct** approach for anomaly detection:

1. **Train on NORMAL data** - Learn what normal traffic looks like
2. **Compute thresholds** - Based on normal data distribution
3. **Score test data** - Measure how different from normal
4. **Flag anomalies** - Samples that deviate significantly from normal

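This approach identifies attacks as anomalies to the extent that they differ from the normal baseline in the chosen features. If detection rates are low, use the diagnostic cells above to check whether the baseline is clean and whether the features actually separate attacks from normal traffic.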