In [2]:
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict, Counter

# Configuration variables for Jupyter notebook
INPUT_FILE = "/home/smotaali/BGP_Traffic_Generation/results/bgp_updates_analysis_20251208_223239.csv"
OUTPUT_FILE = "/home/smotaali/BGP_Traffic_Generation/results/20251208_223239_exctracted_1s.csv"
WINDOW_SIZE = '1s'  # You can change to '30s', '5min', etc.
LABEL_STRATEGY = 'majority'  # Options: 'majority', 'conservative', 'weighted'

def calculate_edit_distance(as_path1, as_path2):
    """
    Calculate edit distance between two AS paths
    """
    if not as_path1 or not as_path2:
        return 0
    
    # Handle integer AS paths by converting them to single-item lists
    if isinstance(as_path1, int):
        as_path1 = [as_path1]
    
    if isinstance(as_path2, int):
        as_path2 = [as_path2]
    
    # Convert paths to lists if they are strings
    if isinstance(as_path1, str):
        # Handle cases with special characters
        as_path1 = as_path1.replace('{', '').replace('}', '')
        as_path1 = [int(as_num) for as_num in as_path1.split() if as_num.isdigit()]
    
    if isinstance(as_path2, str):
        # Handle cases with special characters
        as_path2 = as_path2.replace('{', '').replace('}', '')
        as_path2 = [int(as_num) for as_num in as_path2.split() if as_num.isdigit()]
    
    if not as_path1 or not as_path2:
        return 0
    
    m, n = len(as_path1), len(as_path2)
    
    # Initialize the distance matrix
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    
    # Fill the first row and column
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    
    # Calculate edit distance
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if as_path1[i-1] == as_path2[j-1]:
                dp[i][j] = dp[i-1][j-1]
            else:
                dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])
    
    return dp[m][n]
    

def extract_features(df_window):
    """
    Extract BGP features from a dataframe within a specific time window
    """
    features = {}
    

    # 1. ANNOUNCEMENTS
    # Count number of updates by type
    announcements = df_window[df_window['Subtype'] == 'ANNOUNCE']
    
    # 2. WITHDRAWALS - including all withdrawal types
    withdrawal_types = ['WITHDRAW', 'WITHDRAW_MP_UNREACH_NLRI_AFI2']
    withdrawals = df_window[df_window['Subtype'].isin(withdrawal_types)]
    
    # Store counts
    features['announcements'] = len(announcements)
    features['withdrawals'] = len(withdrawals)
    
    # 3. NLRI_ANN (Network Layer Reachability Information announcements)
    features['nlri_ann'] = features['announcements']
    
    # 4. DUPLICATES
    # Proper implementation for duplicates: same prefix announced repeatedly with the same attributes
    if not announcements.empty:
        # Group by all relevant attributes and count instances > 1
        dup_cols = ['Peer_IP', 'Peer_ASN', 'Prefix', 'AS_Path', 'Origin', 'Next_Hop', 'MED', 'Local_Pref', 'Communities']
        dup_cols = [col for col in dup_cols if col in announcements.columns]  # Only include columns that exist
        
        # Count announcements for same prefix/path/attributes
        announcement_counts = announcements.groupby(dup_cols).size()
        duplicates = sum(count - 1 for count in announcement_counts if count > 1)
        features['dups'] = duplicates
    else:
        features['dups'] = 0
    
    # 5-7. ORIGIN ATTRIBUTES
    # Only count origin attributes that are present in your data
    if not announcements.empty and 'Origin' in announcements.columns:
        origin_counts = announcements['Origin'].value_counts()
        features['origin_0'] = origin_counts.get('IGP', 0)  
      
        features['origin_2'] = origin_counts.get('INCOMPLETE', 0)  
        
        # 8. ORIGIN CHANGES
        # Count prefixes with more than one unique origin attribute
        if not announcements.empty:
            unique_prefix_origins = announcements.groupby('Prefix')['Origin'].nunique()
            features['origin_changes'] = (unique_prefix_origins > 1).sum()
        else:
            features['origin_changes'] = 0
    else:
        features['origin_0'] = 0
        features['origin_2'] = 0
        features['origin_changes'] = 0
    
    # 9-10. AS PATH METRICS
    if not announcements.empty and 'AS_Path' in announcements.columns:
        # Filter out empty AS paths
        valid_as_paths = announcements[announcements['AS_Path'].notna() & (announcements['AS_Path'] != '')]
        
        if not valid_as_paths.empty:
            # Calculate AS path lengths
            as_path_lengths = valid_as_paths['AS_Path'].apply(
                lambda path: len([p for p in path.split() if p.isdigit()]) if isinstance(path, str) else 0
            )
            
            # 9. AS_PATH_MAX: Maximum AS path length
            features['as_path_max'] = as_path_lengths.max() if not as_path_lengths.empty else 0
            
            # 10. UNIQUE_AS_PATH_MAX: Maximum number of unique AS paths per prefix
            unique_paths_per_prefix = valid_as_paths.groupby('Prefix')['AS_Path'].nunique()
            features['unique_as_path_max'] = unique_paths_per_prefix.max() if not unique_paths_per_prefix.empty else 0
            
            # 11-20. EDIT DISTANCE FEATURES
            # Calculate edit distances between consecutive AS paths for each prefix
            edit_distances = []
            edit_distance_dict = defaultdict(list)
            
            for prefix, group in valid_as_paths.groupby('Prefix'):
                if len(group) >= 2:
                    sorted_group = group.sort_values('Timestamp')
                    prev_path = None
                    
                    for _, row in sorted_group.iterrows():
                        current_path = row['AS_Path']
                        
                        if prev_path is not None:
                            dist = calculate_edit_distance(prev_path, current_path)
                            edit_distances.append(dist)
                            edit_distance_dict[prefix].append(dist)
                        
                        prev_path = current_path
            
            if edit_distances:
                # 11. EDIT_DISTANCE_AVG
                features['edit_distance_avg'] = np.mean(edit_distances)
                
                # 12. EDIT_DISTANCE_MAX
                features['edit_distance_max'] = max(edit_distances)
                
                # 13-19. EDIT_DISTANCE_DICT_X (Distribution of edit distances)
                edit_dist_counter = Counter(edit_distances)
                for i in range(7):  # 0 to 6
                    features[f'edit_distance_dict_{i}'] = edit_dist_counter.get(i, 0)
                
                # 20-21. EDIT_DISTANCE_UNIQUE_DICT_X (Distribution of unique edit distances per prefix)
                unique_edit_dists = {}
                for prefix, dists in edit_distance_dict.items():
                    unique_dists = set(dists)
                    for dist in unique_dists:
                        if dist in unique_edit_dists:
                            unique_edit_dists[dist] += 1
                        else:
                            unique_edit_dists[dist] = 1
                
                for i in range(2):  # 0 to 1 for unique distributions
                    features[f'edit_distance_unique_dict_{i}'] = unique_edit_dists.get(i, 0)
            else:
                features['edit_distance_avg'] = 0
                features['edit_distance_max'] = 0
                for i in range(7):
                    features[f'edit_distance_dict_{i}'] = 0
                for i in range(2):
                    features[f'edit_distance_unique_dict_{i}'] = 0
        else:
            features['as_path_max'] = 0
            features['unique_as_path_max'] = 0
            features['edit_distance_avg'] = 0
            features['edit_distance_max'] = 0
            for i in range(7):
                features[f'edit_distance_dict_{i}'] = 0
            for i in range(2):
                features[f'edit_distance_unique_dict_{i}'] = 0
    else:
        features['as_path_max'] = 0
        features['unique_as_path_max'] = 0
        features['edit_distance_avg'] = 0
        features['edit_distance_max'] = 0
        for i in range(7):
            features[f'edit_distance_dict_{i}'] = 0
        for i in range(2):
            features[f'edit_distance_unique_dict_{i}'] = 0
    
    # 22-24. IMPLICIT WITHDRAWAL FEATURES
    if not announcements.empty:
    # Group by prefix and peer to count prefixes with multiple announcements
        prefix_peer_groups = announcements.groupby(['Prefix', 'Peer_IP'])
    
    # Count prefixes with multiple announcements
        imp_wd_prefixes = 0
        imp_wd_spath_prefixes = 0
        imp_wd_dpath_prefixes = 0
    
        for (prefix, peer), group in prefix_peer_groups:
            if len(group) > 1:
            # This prefix has implicit withdrawals
                imp_wd_prefixes += 1
            
            # Check if all AS paths are the same or different
                if 'AS_Path' in group.columns:
                    unique_paths = group['AS_Path'].nunique()
                    if unique_paths == 1:
                    # All announcements have the same AS path
                        imp_wd_spath_prefixes += 1
                    else:
                    # Announcements have different AS paths
                        imp_wd_dpath_prefixes += 1
    
        features['imp_wd'] = imp_wd_prefixes
        features['imp_wd_spath'] = imp_wd_spath_prefixes
        features['imp_wd_dpath'] = imp_wd_dpath_prefixes
    else:
        features['imp_wd'] = 0
        features['imp_wd_spath'] = 0
        features['imp_wd_dpath'] = 0


    # 25-26. RARE AND NEW AS NUMBERS FEATURES
    if not announcements.empty and 'AS_Path' in announcements.columns:
        all_asns = []
    
        # Process each AS_Path
        for as_path in announcements['AS_Path']:
            # Skip null/NaN values
            if pd.isnull(as_path) or as_path == '':
                continue
            
            # Convert to string if it isn't already (just to be safe)
            as_path_str = str(as_path)
        
            # Simple case: Single ASN (e.g. "43289")
            if as_path_str.isdigit():
                all_asns.append(as_path_str)
                continue
            
            # Complex case: Path with multiple ASNs
            # Remove special characters and split by whitespace
            as_path_str = as_path_str.replace('{', '').replace('}', '')
            path_asns = [asn for asn in as_path_str.split() if asn.isdigit()]
            all_asns.extend(path_asns)
    
        # Count the occurrence of each ASN
        asn_counts = Counter(all_asns)
    
        # Define rare ASNs (those appearing less than the threshold)
        rare_threshold = 3
        rare_asns = [asn for asn, count in asn_counts.items() if count < rare_threshold]
    
        # Calculate features
        features['number_rare_ases'] = len(rare_asns)
        features['rare_ases_avg'] = len(rare_asns) / len(all_asns) if all_asns else 0
    else:
        # Default values if no data or AS_Path column
        features['number_rare_ases'] = 0
        features['rare_ases_avg'] = 0
    

        
    # 27-28. FLAP AND NADA FEATURES
    if not df_window.empty:
        # FLAPS: A flap is when a prefix is withdrawn and then announced again
        # Get all prefixes that were both withdrawn and announced in this window
        if not withdrawals.empty and not announcements.empty:
            withdrawn_prefixes = set(withdrawals['Prefix'].dropna())
            announced_prefixes = set(announcements['Prefix'].dropna())
            flapped_prefixes = withdrawn_prefixes.intersection(announced_prefixes)
            features['flaps'] = len(flapped_prefixes)
        else:
            features['flaps'] = 0
        
        # NADAS (Network Attacks and Defenses Assessment)
        # This requires a more complex heuristic to detect potential attack patterns
        # For now, we'll implement a basic heuristic looking at suspicious patterns:
        # 1. Multiple very specific prefixes (e.g., /32) announced in a short window
        # 2. Rapid withdrawal of previously announced prefixes
        
        # Count very specific prefixes (potential DOS indicators)
        very_specific_prefixes = 0
        if 'Prefix' in df_window.columns:
            # Count /32 prefixes which are typical in some DoS attacks
            very_specific_prefixes = sum(1 for prefix in df_window['Prefix'].dropna() 
                                       if isinstance(prefix, str) and prefix.endswith('/32'))
        
        # Ratio of withdrawals to announcements
        wd_ann_ratio = features['withdrawals'] / features['announcements'] if features['announcements'] > 0 else 0
        
        # Basic NADAS score - a simple heuristic
        nadas_score = very_specific_prefixes + (wd_ann_ratio > 0.5) * 10
        features['nadas'] = nadas_score
    else:
        features['flaps'] = 0
        features['nadas'] = 0
    
    # Determine label based on window data
    if 'Label' in df_window.columns:
        labels = df_window['Label'].value_counts()
        if not labels.empty:
            # Choose labeling strategy
            if LABEL_STRATEGY == 'majority':
                # Majority vote
                features['label'] = labels.idxmax()
            elif LABEL_STRATEGY == 'conservative':
                # If any abnormal, label as abnormal
                if any(label != 'normal' for label in labels.index):
                    abnormal_labels = [label for label in labels.index if label != 'normal']
                    features['label'] = abnormal_labels[0]
                else:
                    features['label'] = 'normal'
            elif LABEL_STRATEGY == 'weighted':
                # Weight by count of each label
                total = labels.sum()
                abnormal_weight = sum(count for label, count in labels.items() if label != 'normal') / total
                if abnormal_weight > 0.4:  # Threshold for abnormal classification
                    abnormal_labels = [label for label in labels.index if label != 'normal']
                    features['label'] = abnormal_labels[0] if abnormal_labels else 'normal'
                else:
                    features['label'] = 'normal'
        else:
            features['label'] = 'unknown'
    else:
        features['label'] = 'unknown'
    
    return features

def process_bgp_data():
    """
    Process BGP data file and extract features with specified time window
    """
    print(f"Reading input file: {INPUT_FILE}")
    df = pd.read_csv(INPUT_FILE)
    
    # Convert timestamp to datetime
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    
    # Sort by timestamp
    df = df.sort_values('Timestamp')
    
    # Get start and end times
    start_time = df['Timestamp'].min()
    end_time = df['Timestamp'].max()
    
    print(f"Time range: {start_time} to {end_time}")
    
    # Create time windows based on the specified frequency
    window_freq = WINDOW_SIZE
    
    # Use pandas resample to create time windows
    df.set_index('Timestamp', inplace=True)
    
    # Create empty list to store features
    all_features = []
    
    # Group data by time windows
    grouped = df.groupby(pd.Grouper(freq=window_freq))
    
    window_count = 0
    total_windows = len(grouped)
    
    # Process each window
    for window_start, window_df in grouped:
        if not window_df.empty:
            window_df = window_df.reset_index()  # Reset index to get Timestamp as column
            features = extract_features(window_df)
            
            if features:
                window_end = window_start + pd.Timedelta(window_freq)
                features['window_start'] = window_start
                features['window_end'] = window_end
                all_features.append(features)
                window_count += 1
                
                # Print progress
                if window_count % 50 == 0:
                    print(f"Processed {window_count}/{total_windows} windows ({window_count/total_windows:.1%})...")
    
    print(f"Total windows processed: {window_count}")
    
    # Create features dataframe
    if all_features:
        features_df = pd.DataFrame(all_features)
        
        # Ensure all 25 required features are present
        required_features = [
            'dups', 'edit_distance_avg', 'edit_distance_dict_0', 'edit_distance_dict_1',
            'nlri_ann', 'origin_0', 'origin_2', 'imp_wd', 'rare_ases_avg', 'imp_wd_spath',
            'unique_as_path_max', 'edit_distance_dict_2', 'edit_distance_dict_4', 
            'edit_distance_dict_6', 'edit_distance_max', 'edit_distance_unique_dict_0',
            'edit_distance_unique_dict_1', 'announcements', 'origin_changes', 'flaps',
            'nadas', 'number_rare_ases', 'withdrawals', 'as_path_max', 'imp_wd_dpath'
        ]
        
        # Check if all required features are present
        missing_features = [feature for feature in required_features if feature not in features_df.columns]
        if missing_features:
            print(f"Warning: Missing features: {missing_features}")
            # Add missing features with default value 0
            for feature in missing_features:
                features_df[feature] = 0
        
        # Write to output file
        features_df.to_csv(OUTPUT_FILE, index=False)
        print(f"Features written to {OUTPUT_FILE}")
        print(f"Features extracted: {', '.join(features_df.columns)}")
        
        return features_df
    else:
        print("No features extracted. Check input data.")
        return None

# Main execution for Jupyter - uncomment when ready
features_df = process_bgp_data()

Reading input file: /home/smotaali/BGP_Traffic_Generation/results/bgp_updates_analysis_20251208_223239.csv


  df = pd.read_csv(INPUT_FILE)


Time range: 2025-12-08 17:26:21.348996 to 2025-12-09 17:26:35.109632
Processed 50/86415 windows (0.1%)...
Processed 100/86415 windows (0.1%)...
Processed 150/86415 windows (0.2%)...
Processed 200/86415 windows (0.2%)...
Processed 250/86415 windows (0.3%)...
Processed 300/86415 windows (0.3%)...
Processed 350/86415 windows (0.4%)...
Processed 400/86415 windows (0.5%)...
Processed 450/86415 windows (0.5%)...
Processed 500/86415 windows (0.6%)...
Processed 550/86415 windows (0.6%)...
Processed 600/86415 windows (0.7%)...
Processed 650/86415 windows (0.8%)...
Processed 700/86415 windows (0.8%)...
Processed 750/86415 windows (0.9%)...
Processed 800/86415 windows (0.9%)...
Processed 850/86415 windows (1.0%)...
Processed 900/86415 windows (1.0%)...
Processed 950/86415 windows (1.1%)...
Processed 1000/86415 windows (1.2%)...
Processed 1050/86415 windows (1.2%)...
Processed 1100/86415 windows (1.3%)...
Processed 1150/86415 windows (1.3%)...
Processed 1200/86415 windows (1.4%)...
Processed 1250/