# Unified BGP Feature Extraction

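This notebook extracts per-time-window BGP features from the labeled CSV file of each incident, writes one feature file per incident, and merges the anomalous windows of all incidents into a single CSV.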

Based on the working feature extraction logic.

## 1. Configuration

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Root folder that holds one sub-directory per incident.
INCIDENT_BASE_DIR = Path("/home/smotaali/BGP_Traffic_Generation/RIPE/RIPE_INCIDENTS")

# Feature extraction settings.
WINDOW_SIZE = '1s'            # pandas frequency string: length of one feature window
LABEL_STRATEGY = 'majority'   # one of: 'majority', 'conservative', 'weighted'
SKIP_EXISTING = False         # True -> reuse feature files that already exist

# Echo the active settings so every run records what it used.
print(
    f"Base directory: {INCIDENT_BASE_DIR}\n"
    f"Window size: {WINDOW_SIZE}\n"
    f"Skip existing: {SKIP_EXISTING}"
)

## 2. Feature Extraction Functions

In [None]:
def calculate_edit_distance(as_path1, as_path2):
    """
    Calculate the Levenshtein edit distance between two AS paths.

    Each path may be given as:
      * a string of whitespace-separated AS numbers ('{' and '}' are stripped,
        tokens that are not purely digits are ignored),
      * a single AS number: a Python or NumPy integer, or a float (pandas
        stores a numeric column as float when it also contains NaN),
      * an already split sequence of AS numbers.

    Args:
        as_path1: first AS path.
        as_path2: second AS path.

    Returns:
        int: minimum number of hop insertions, deletions and substitutions
        that turn one path into the other. Returns 0 when either path is
        missing or empty, before or after parsing.
    """
    # Missing/empty input is scored as "no change" rather than as an error.
    if not as_path1 or not as_path2:
        return 0

    def _to_hops(path):
        # Floats first: a NaN is truthy, so it passes the guard above and
        # must be recognised explicitly.
        if isinstance(path, (float, np.floating)):
            return [int(path)] if np.isfinite(path) else []
        # np.integer covers the scalars pandas returns for integer columns,
        # which are not instances of the built-in int.
        if isinstance(path, (int, np.integer)):
            return [int(path)]
        if isinstance(path, str):
            cleaned = path.replace('{', '').replace('}', '')
            return [int(token) for token in cleaned.split() if token.isdigit()]
        return path

    hops1 = _to_hops(as_path1)
    hops2 = _to_hops(as_path2)

    if not hops1 or not hops2:
        return 0

    # Classic dynamic programme, keeping only the previous and current row.
    n = len(hops2)
    previous = list(range(n + 1))
    for i, hop1 in enumerate(hops1, start=1):
        current = [i] + [0] * n
        for j, hop2 in enumerate(hops2, start=1):
            if hop1 == hop2:
                current[j] = previous[j - 1]
            else:
                current[j] = 1 + min(previous[j], current[j - 1], previous[j - 1])
        previous = current

    return previous[n]

print("Edit distance function defined!")

In [None]:
def extract_features(df_window, entry_type_col='Entry_Type'):
    """
    Extract BGP features from the messages of one time window.

    Args:
        df_window: DataFrame with the BGP messages of a single window. Only
            ``entry_type_col`` is required. ``Prefix``, ``Peer_IP``,
            ``Origin``, ``AS_Path``, ``Timestamp``, ``Label`` and
            ``Incident`` are used when present; features that depend on a
            missing column are reported as 0.
        entry_type_col: name of the column holding the message type
            (announcement / withdrawal).

    Returns:
        dict: feature name -> value. Keys are inserted in a fixed order so
        that every window yields the same column layout.

    Raises:
        ValueError: if a non-empty ``Label`` column is present and the
            module-level ``LABEL_STRATEGY`` is not 'majority',
            'conservative' or 'weighted'.
    """
    features = {}

    announce_types = ['A', 'ANNOUNCE']
    withdrawal_types = ['W', 'WITHDRAW', 'WITHDRAW_MP_UNREACH_NLRI_AFI2']

    announcements = df_window[df_window[entry_type_col].isin(announce_types)]
    withdrawals = df_window[df_window[entry_type_col].isin(withdrawal_types)]

    has_announcements = not announcements.empty
    has_prefix = 'Prefix' in df_window.columns
    has_as_path = 'AS_Path' in df_window.columns

    # 1-3. MESSAGE COUNTS
    features['announcements'] = len(announcements)
    features['withdrawals'] = len(withdrawals)
    features['nlri_ann'] = features['announcements']

    # 4. DUPLICATES - announcements repeating an identical attribute set
    dup_cols = ['Peer_IP', 'Peer_ASN', 'Prefix', 'AS_Path', 'Origin',
                'Next_Hop', 'MED', 'Local_Pref', 'Communities']
    dup_cols = [col for col in dup_cols if col in announcements.columns]
    if has_announcements and dup_cols:
        group_sizes = announcements.groupby(dup_cols, dropna=False).size()
        features['dups'] = int((group_sizes[group_sizes > 1] - 1).sum())
    else:
        features['dups'] = 0

    # 5-8. ORIGIN ATTRIBUTES
    features['origin_0'] = 0
    features['origin_2'] = 0
    features['origin_changes'] = 0
    if has_announcements and 'Origin' in announcements.columns:
        # Compare on normalised text. Looking up integer keys with
        # value_counts().get(0) on a string index is treated as a positional
        # lookup by pandas < 3 and would add unrelated counts.
        origin_text = announcements['Origin'].dropna().map(
            lambda value: str(value).strip().upper()
        )
        # '0.0' / '2.0' appear when a numeric column was read as float.
        features['origin_0'] = int(origin_text.isin(['IGP', '0', '0.0']).sum())
        features['origin_2'] = int(origin_text.isin(['INCOMPLETE', '2', '2.0']).sum())

        if has_prefix:
            # Prefixes announced with more than one distinct origin.
            origins_per_prefix = announcements.groupby('Prefix')['Origin'].nunique()
            features['origin_changes'] = int((origins_per_prefix > 1).sum())

    # 9-20. AS PATH METRICS AND EDIT DISTANCES (zero unless computed below)
    features['as_path_max'] = 0
    features['unique_as_path_max'] = 0
    features['edit_distance_avg'] = 0
    features['edit_distance_max'] = 0
    for i in range(7):
        features[f'edit_distance_dict_{i}'] = 0
    for i in range(2):
        features[f'edit_distance_unique_dict_{i}'] = 0

    if has_announcements and has_as_path:
        valid_as_paths = announcements[
            announcements['AS_Path'].notna() & (announcements['AS_Path'] != '')
        ]

        if not valid_as_paths.empty:
            as_path_lengths = valid_as_paths['AS_Path'].apply(
                lambda path: len([hop for hop in str(path).split() if hop.isdigit()])
            )
            features['as_path_max'] = int(as_path_lengths.max())

            edit_distances = []
            distances_by_prefix = {}

            if has_prefix:
                paths_per_prefix = valid_as_paths.groupby('Prefix')['AS_Path'].nunique()
                if not paths_per_prefix.empty:
                    features['unique_as_path_max'] = int(paths_per_prefix.max())

                for prefix, group in valid_as_paths.groupby('Prefix'):
                    if len(group) < 2:
                        continue
                    if 'Timestamp' in group.columns:
                        # Stable sort: messages sharing a timestamp keep
                        # their original order, so results are reproducible.
                        group = group.sort_values('Timestamp', kind='mergesort')
                    paths = group['AS_Path'].tolist()
                    # Distance between each announcement and its predecessor.
                    dists = [calculate_edit_distance(prev, cur)
                             for prev, cur in zip(paths, paths[1:])]
                    distances_by_prefix[prefix] = dists
                    edit_distances.extend(dists)

            if edit_distances:
                features['edit_distance_avg'] = float(np.mean(edit_distances))
                features['edit_distance_max'] = max(edit_distances)

                # Histogram of all distances 0..6.
                distance_histogram = Counter(edit_distances)
                for i in range(7):
                    features[f'edit_distance_dict_{i}'] = distance_histogram.get(i, 0)

                # Number of prefixes that showed each distance at least once.
                prefixes_per_distance = Counter()
                for dists in distances_by_prefix.values():
                    prefixes_per_distance.update(set(dists))
                for i in range(2):
                    features[f'edit_distance_unique_dict_{i}'] = prefixes_per_distance.get(i, 0)

    # 22-24. IMPLICIT WITHDRAWALS - same prefix re-announced by the same peer
    imp_wd_prefixes = 0
    imp_wd_spath_prefixes = 0
    imp_wd_dpath_prefixes = 0
    if has_announcements and has_prefix and 'Peer_IP' in announcements.columns:
        for _, group in announcements.groupby(['Prefix', 'Peer_IP']):
            if len(group) <= 1:
                continue
            imp_wd_prefixes += 1
            if has_as_path:
                if group['AS_Path'].nunique() == 1:
                    imp_wd_spath_prefixes += 1
                else:
                    imp_wd_dpath_prefixes += 1
    features['imp_wd'] = imp_wd_prefixes
    features['imp_wd_spath'] = imp_wd_spath_prefixes
    features['imp_wd_dpath'] = imp_wd_dpath_prefixes

    # 25-26. RARE AS FEATURES
    all_asns = []
    if has_announcements and has_as_path:
        for as_path in announcements['AS_Path']:
            if pd.isnull(as_path) or as_path == '':
                continue
            as_path_str = str(as_path).replace('{', '').replace('}', '')
            all_asns.extend(asn for asn in as_path_str.split() if asn.isdigit())

    rare_threshold = 3
    asn_counts = Counter(all_asns)
    rare_asns = [asn for asn, count in asn_counts.items() if count < rare_threshold]
    features['number_rare_ases'] = len(rare_asns)
    features['rare_ases_avg'] = len(rare_asns) / len(all_asns) if all_asns else 0

    # 27-28. FLAP AND NADAS FEATURES
    very_specific_prefixes = 0
    features['flaps'] = 0
    if has_prefix:
        if has_announcements and not withdrawals.empty:
            # Prefixes both withdrawn and announced within the window.
            withdrawn_prefixes = set(withdrawals['Prefix'].dropna())
            announced_prefixes = set(announcements['Prefix'].dropna())
            features['flaps'] = len(withdrawn_prefixes & announced_prefixes)

        very_specific_prefixes = sum(
            1 for prefix in df_window['Prefix'].dropna()
            if isinstance(prefix, str) and prefix.endswith('/32')
        )

    if features['announcements'] > 0:
        wd_ann_ratio = features['withdrawals'] / features['announcements']
    else:
        wd_ann_ratio = 0
    features['nadas'] = very_specific_prefixes + (10 if wd_ann_ratio > 0.5 else 0)

    # LABEL
    features['label'] = 'unknown'
    if 'Label' in df_window.columns:
        labels = df_window['Label'].value_counts()
        if not labels.empty:
            # value_counts() sorts by frequency, so the first abnormal label
            # is the most frequent abnormal one.
            abnormal_labels = [label for label in labels.index if label != 'normal']

            if LABEL_STRATEGY == 'majority':
                features['label'] = labels.idxmax()
            elif LABEL_STRATEGY == 'conservative':
                features['label'] = abnormal_labels[0] if abnormal_labels else 'normal'
            elif LABEL_STRATEGY == 'weighted':
                abnormal_weight = sum(
                    count for label, count in labels.items() if label != 'normal'
                ) / labels.sum()
                if abnormal_weight > 0.4 and abnormal_labels:
                    features['label'] = abnormal_labels[0]
                else:
                    features['label'] = 'normal'
            else:
                # Fail loudly instead of emitting rows without a usable label.
                raise ValueError(
                    f"Unknown LABEL_STRATEGY {LABEL_STRATEGY!r}; expected "
                    "'majority', 'conservative' or 'weighted'"
                )

    # Incident name
    if 'Incident' in df_window.columns and not df_window.empty:
        features['Incident'] = df_window['Incident'].iloc[0]

    return features

print("Feature extraction function defined!")

## 3. Process Single File

In [None]:
def process_single_file(input_path, output_path):
    """
    Turn one labeled BGP CSV file into a per-window feature table.

    The file is read, a ``Timestamp`` column is built (from ``Time`` when no
    ``Timestamp`` column exists), the rows are bucketed into windows of
    ``WINDOW_SIZE`` and ``extract_features`` is run on every non-empty
    window. The resulting table is written to ``output_path``.

    Args:
        input_path: path of the CSV file to read (a ``Path``; ``.name`` is used).
        output_path: destination of the feature CSV.

    Returns:
        The feature DataFrame, or None when the file has neither an
        ``Entry_Type`` nor a ``Subtype`` column or no window produced features.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Processing: {input_path.name}")
    print(f"{banner}")

    df = pd.read_csv(input_path, low_memory=False)
    print(f"Loaded {len(df):,} records")

    # 'Time' is only used as a fallback when 'Timestamp' is absent.
    use_time_column = 'Time' in df.columns and 'Timestamp' not in df.columns
    time_source = 'Time' if use_time_column else 'Timestamp'
    df['Timestamp'] = pd.to_datetime(df[time_source])

    # The message type lives in 'Entry_Type' or, failing that, 'Subtype'.
    entry_type_col = next(
        (name for name in ('Entry_Type', 'Subtype') if name in df.columns),
        None,
    )
    if entry_type_col is None:
        print("ERROR: No Entry_Type or Subtype column found!")
        return None

    print(f"Entry type column: {entry_type_col}")
    print(f"Entry types: {df[entry_type_col].value_counts().to_dict()}")

    if 'Origin' in df.columns:
        print(f"Origin values: {df['Origin'].value_counts().to_dict()}")

    df = df.sort_values('Timestamp')

    start_time, end_time = df['Timestamp'].min(), df['Timestamp'].max()
    print(f"Time range: {start_time} to {end_time}")

    # Grouper(freq=...) buckets rows by the datetime index.
    df = df.set_index('Timestamp')
    windows = df.groupby(pd.Grouper(freq=WINDOW_SIZE))
    total_windows = len(windows)
    window_span = pd.Timedelta(WINDOW_SIZE)

    all_features = []
    for window_start, window_df in windows:
        if window_df.empty:
            continue

        features = extract_features(window_df.reset_index(), entry_type_col)
        if not features:
            continue

        features['window_start'] = window_start
        features['window_end'] = window_start + window_span
        all_features.append(features)

        if len(all_features) % 500 == 0:
            print(f"  Processed {len(all_features)}/{total_windows} windows...")

    window_count = len(all_features)
    print(f"Total windows: {window_count}")

    if not all_features:
        print("No features extracted!")
        return None

    features_df = pd.DataFrame(all_features)
    features_df.to_csv(output_path, index=False)
    print(f"Saved to: {output_path}")
    print(f"Shape: {features_df.shape}")

    # Quick sanity peek: first positive value of a few key features.
    print(f"\nSample feature values (first non-zero):")
    key_columns = ['origin_0', 'origin_2', 'origin_changes', 'edit_distance_avg', 'imp_wd']
    for col in key_columns:
        if col not in features_df.columns:
            continue
        positive = features_df.loc[features_df[col] > 0, col]
        if positive.empty:
            print(f"  {col}: all zeros")
        else:
            print(f"  {col}: {positive.iloc[0]:.2f} (first non-zero)")

    return features_df

print("File processing function defined!")

## 4. Process All Incidents

In [None]:
import traceback

print("="*70)
print("PROCESSING INCIDENT DATA")
print("="*70)

# Every sub-directory of the base directory is one incident, except the
# 'temp_mrt' folder, which is excluded by name.
incident_dirs = [d for d in INCIDENT_BASE_DIR.iterdir() if d.is_dir() and d.name != 'temp_mrt']
print(f"Found {len(incident_dirs)} incident directories")

processed = 0
skipped = 0
failed = 0
all_features_dfs = []

for incident_dir in sorted(incident_dirs):
    # Sort so the chosen file is deterministic when several files match.
    labeled_files = sorted(incident_dir.glob("*_labeled.csv"))

    if not labeled_files:
        print(f"\n[SKIP] No labeled file in {incident_dir.name}")
        continue

    csv_path = labeled_files[0]
    if len(labeled_files) > 1:
        print(f"\n[NOTE] {len(labeled_files)} labeled files in {incident_dir.name}; using {csv_path.name}")
    out_path = incident_dir / (csv_path.stem + "_features.csv")

    if SKIP_EXISTING and out_path.exists():
        print(f"\n[SKIP] Already exists: {out_path.name}")
        skipped += 1
        try:
            all_features_dfs.append(pd.read_csv(out_path))
        except Exception as e:
            # Reloading is best-effort, but report it instead of hiding it:
            # an unreadable file will be missing from all_features_dfs.
            print(f"  [WARN] Could not reload {out_path.name}: {e}")
        continue

    # Top-level boundary: one broken incident must not stop the whole run.
    try:
        features_df = process_single_file(csv_path, out_path)
    except Exception as e:
        print(f"\n[ERROR] {incident_dir.name}: {e}")
        traceback.print_exc()
        failed += 1
    else:
        if features_df is not None:
            all_features_dfs.append(features_df)
            processed += 1
        else:
            failed += 1

print("\n" + "="*70)
print("PROCESSING COMPLETE")
print("="*70)
print(f"Processed: {processed}")
print(f"Skipped: {skipped}")
print(f"Failed: {failed}")

## 5. Merge Anomaly Features

In [None]:
print("\n" + "="*70)
print("MERGING ANOMALY FEATURES")
print("="*70)

# One feature file per incident directory, as written by process_single_file.
feature_files = list(INCIDENT_BASE_DIR.glob("*/*_features.csv"))
print(f"Found {len(feature_files)} feature files")

all_anomalies = []

for feature_path in sorted(feature_files):
    try:
        window_features = pd.read_csv(feature_path)
        # Keep every window whose label is anything other than 'normal'.
        anomalous = window_features.loc[window_features['label'] != 'normal']
        if anomalous.empty:
            continue
        incident_name = feature_path.parent.name
        # assign() returns a new frame, so the loaded table is left untouched.
        all_anomalies.append(anomalous.assign(source_file=incident_name))
        print(f"  {incident_name}: {len(anomalous)} anomalies")
    except Exception as e:
        print(f"  Error reading {feature_path.name}: {e}")

if not all_anomalies:
    print("\nNo anomalies found to merge!")
else:
    merged_anomalies = pd.concat(all_anomalies, ignore_index=True)

    output_file = INCIDENT_BASE_DIR / "all_incidents_anomalies_only.csv"
    merged_anomalies.to_csv(output_file, index=False)

    rule = '=' * 70
    print(f"\n{rule}")
    print("MERGED ANOMALIES SUMMARY")
    print(f"{rule}")
    print(f"Total anomaly samples: {len(merged_anomalies):,}")
    print(f"\nLabel distribution:")
    print(merged_anomalies['label'].value_counts())
    print(f"\nSaved to: {output_file}")

## 6. Verify Features

In [None]:
# Inspect the first feature table collected in this session and run two
# sanity checks on it.
if all_features_dfs:
    sample_df = all_features_dfs[0]
    print("Sample feature file columns:")
    print(list(sample_df.columns))
    print(f"\nShape: {sample_df.shape}")
    print(f"\nFirst few rows:")
    display(sample_df.head())

    print("\n" + "="*70)
    print("LOGICAL CONSISTENCY CHECK")
    print("="*70)

    # Origin check: flag windows reporting origin changes while both the
    # origin_0 and origin_2 counters are zero.
    no_origin_counts = (sample_df['origin_0'] == 0) & (sample_df['origin_2'] == 0)
    has_origin_change = sample_df['origin_changes'] > 0
    origin_issues = sample_df.loc[no_origin_counts & has_origin_change]
    if origin_issues.empty:
        print("Origin features: OK")
    else:
        print(f"WARNING: {len(origin_issues)} rows have origin_changes > 0 but origin_0=origin_2=0")

    # Edit-distance check: a positive average requires a positive maximum.
    positive_avg = sample_df['edit_distance_avg'] > 0
    zero_max = sample_df['edit_distance_max'] == 0
    ed_issues = sample_df.loc[positive_avg & zero_max]
    if ed_issues.empty:
        print("Edit distance features: OK")
    else:
        print(f"WARNING: {len(ed_issues)} rows have edit_distance_avg > 0 but max=0")