In [1]:
#!/usr/bin/env python3
"""
CORRECTED BGP Feature Extraction Code
"""

import pandas as pd
import numpy as np
import datetime
from collections import defaultdict, Counter

# Configuration variables for Jupyter notebook
INPUT_FILE = "/home/smotaali/BGP_Traffic_Generation/results/bgp_updates_analysis_20251214_182312.csv"
OUTPUT_FILE = "/home/smotaali/BGP_Traffic_Generation/results/20251214_182312_exctracted_1s_FIXED.csv"
WINDOW_SIZE = '1s'
LABEL_STRATEGY = 'majority'


def calculate_edit_distance(as_path1, as_path2):
    """Calculate edit distance between two AS paths"""
    if not as_path1 or not as_path2:
        return 0
    
    # Handle integer AS paths
    if isinstance(as_path1, int):
        as_path1 = [as_path1]
    if isinstance(as_path2, int):
        as_path2 = [as_path2]
    
    # Convert paths to lists if they are strings
    if isinstance(as_path1, str):
        as_path1 = as_path1.replace('{', '').replace('}', '')
        as_path1 = [int(as_num) for as_num in as_path1.split() if as_num.isdigit()]
    if isinstance(as_path2, str):
        as_path2 = as_path2.replace('{', '').replace('}', '')
        as_path2 = [int(as_num) for as_num in as_path2.split() if as_num.isdigit()]
    
    if not as_path1 or not as_path2:
        return 0
    
    m, n = len(as_path1), len(as_path2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if as_path1[i-1] == as_path2[j-1]:
                dp[i][j] = dp[i-1][j-1]
            else:
                dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])
    
    return dp[m][n]


def get_path_length(as_path):
    """Get the length of an AS path"""
    if pd.isnull(as_path) or as_path == '':
        return 0
    if isinstance(as_path, int):
        return 1
    if isinstance(as_path, str):
        as_path = as_path.replace('{', '').replace('}', '')
        return len([asn for asn in as_path.split() if asn.isdigit()])
    return 0


# =========================================================================
# MOVE THESE FUNCTIONS OUTSIDE extract_features_fixed
# =========================================================================
def attributes_are_same(row1, row2):
    """Compare BGP attributes between two announcements"""
    attrs_to_compare = ['AS_Path', 'Origin', 'Next_Hop', 'MED', 'Local_Pref', 'Communities']
    
    for attr in attrs_to_compare:
        if attr in row1.index and attr in row2.index:
            val1 = row1[attr]
            val2 = row2[attr]
            
            if pd.isna(val1) and pd.isna(val2):
                continue
            if pd.isna(val1) or pd.isna(val2):
                return False
            if val1 != val2:
                return False
    
    return True


def calculate_nadas_and_flaps(df_window):
    """
    Calculate NADAS and FLAPS using only UPDATE packets within window.
    
    NADAS: Withdrawal → Announcement with DIFFERENT attributes
    FLAP:  Withdrawal → Announcement with SAME attributes
    """
    nadas_count = 0
    flap_count = 0
    
    withdrawal_types = ['WITHDRAW', 'WITHDRAW_MP_UNREACH_NLRI_AFI2']
    
    # Sort by timestamp
    df_sorted = df_window.sort_values('Timestamp')
    
    # Track state per (prefix, peer)
    prefix_state = {}
    
    for _, row in df_sorted.iterrows():
        key = (row['Prefix'], row['Peer_IP'])
        
        if row['Subtype'] == 'ANNOUNCE':
            if key in prefix_state and prefix_state[key]['withdrawn']:
                last_ann = prefix_state[key].get('last_ann')
                
                if last_ann is not None:
                    if attributes_are_same(last_ann, row):
                        flap_count += 1
                    else:
                        nadas_count += 1
                else:
                    nadas_count += 1
                
                prefix_state[key]['withdrawn'] = False
            
            if key not in prefix_state:
                prefix_state[key] = {}
            prefix_state[key]['last_ann'] = row
            prefix_state[key]['withdrawn'] = False
            
        elif row['Subtype'] in withdrawal_types:
            if key in prefix_state:
                prefix_state[key]['withdrawn'] = True
            else:
                prefix_state[key] = {'last_ann': None, 'withdrawn': True}
    
    return nadas_count, flap_count


def extract_features_fixed(df_window):
    """
    Extract BGP features with CORRECTED calculations
    """
    features = {}
    
    # Basic counts
    announcements = df_window[df_window['Subtype'] == 'ANNOUNCE']
    withdrawal_types = ['WITHDRAW', 'WITHDRAW_MP_UNREACH_NLRI_AFI2']
    withdrawals = df_window[df_window['Subtype'].isin(withdrawal_types)]
    
    features['announcements'] = len(announcements)
    features['withdrawals'] = len(withdrawals)
    features['nlri_ann'] = announcements['Prefix'].nunique()

    # =========================================================================
    # DUPLICATES
    # =========================================================================
    if not announcements.empty:
        dup_cols = ['Peer_IP', 'Peer_ASN', 'Prefix', 'AS_Path', 'Origin', 'Next_Hop', 'MED', 'Local_Pref', 'Communities']
        dup_cols = [col for col in dup_cols if col in announcements.columns]
        announcement_counts = announcements.groupby(dup_cols).size()
        duplicates = sum(count - 1 for count in announcement_counts if count > 1)
        features['dups'] = duplicates
    else:
        features['dups'] = 0
    
    # =========================================================================
    # ORIGIN ATTRIBUTES
    # =========================================================================
    if not announcements.empty and 'Origin' in announcements.columns:
        origin_counts = announcements['Origin'].value_counts()
        features['origin_0'] = origin_counts.get('IGP', 0)
        features['origin_2'] = origin_counts.get('INCOMPLETE', 0)
        
        unique_prefix_origins = announcements.groupby('Prefix')['Origin'].nunique()
        features['origin_changes'] = (unique_prefix_origins > 1).sum()
    else:
        features['origin_0'] = 0
        features['origin_2'] = 0
        features['origin_changes'] = 0
    
    # =========================================================================
    # IMPLICIT WITHDRAWALS
    # =========================================================================
    imp_wd_count = 0
    imp_wd_spath_count = 0
    imp_wd_dpath_count = 0

    edit_distances = []
    edit_distance_dict = defaultdict(list)

    attrs_to_compare = ['AS_Path', 'Origin', 'Next_Hop', 'MED', 'Local_Pref', 'Communities']

    if not announcements.empty:
        available_attrs = [col for col in attrs_to_compare if col in announcements.columns]
    
        for (prefix, peer), group in announcements.groupby(['Prefix', 'Peer_IP']):
            if len(group) >= 2:
                sorted_group = group.sort_values('Timestamp')
                prev_row = None
            
                for _, row in sorted_group.iterrows():
                    if prev_row is not None:
                        attributes_changed = False
                        as_path_changed = False
                    
                        for attr in available_attrs:
                            prev_val = prev_row[attr] if attr in prev_row.index else None
                            curr_val = row[attr] if attr in row.index else None
                        
                            prev_is_nan = pd.isna(prev_val)
                            curr_is_nan = pd.isna(curr_val)
                        
                            if prev_is_nan and curr_is_nan:
                                continue
                            if prev_is_nan or curr_is_nan:
                                attributes_changed = True
                                if attr == 'AS_Path':
                                    as_path_changed = True
                            elif prev_val != curr_val:
                                attributes_changed = True
                                if attr == 'AS_Path':
                                    as_path_changed = True
                    
                        if attributes_changed:
                            imp_wd_count += 1
                        
                            if as_path_changed:
                                imp_wd_dpath_count += 1
                            
                                prev_path = prev_row.get('AS_Path', '')
                                curr_path = row.get('AS_Path', '')
                                if prev_path and curr_path:
                                    dist = calculate_edit_distance(prev_path, curr_path)
                                    edit_distances.append(dist)
                                    edit_distance_dict[prefix].append(dist)
                            else:
                                imp_wd_spath_count += 1
                
                    prev_row = row

    features['imp_wd'] = imp_wd_count
    features['imp_wd_spath'] = imp_wd_spath_count
    features['imp_wd_dpath'] = imp_wd_dpath_count

    # =========================================================================
    # AS PATH METRICS
    # =========================================================================
    if not announcements.empty and 'AS_Path' in announcements.columns:
        valid_as_paths = announcements[announcements['AS_Path'].notna() & (announcements['AS_Path'] != '')]
        
        if not valid_as_paths.empty:
            as_path_lengths = valid_as_paths['AS_Path'].apply(get_path_length)
            features['as_path_max'] = as_path_lengths.max() if not as_path_lengths.empty else 0
            
            unique_paths_per_prefix = valid_as_paths.groupby('Prefix')['AS_Path'].nunique()
            features['unique_as_path_max'] = unique_paths_per_prefix.max() if not unique_paths_per_prefix.empty else 0
        else:
            features['as_path_max'] = 0
            features['unique_as_path_max'] = 0
    else:
        features['as_path_max'] = 0
        features['unique_as_path_max'] = 0
    
    # =========================================================================
    # EDIT DISTANCE FEATURES
    # =========================================================================
    if edit_distances:
        features['edit_distance_avg'] = np.mean(edit_distances)
        features['edit_distance_max'] = max(edit_distances)
        
        edit_dist_counter = Counter(edit_distances)
        for i in range(7):
            features[f'edit_distance_dict_{i}'] = edit_dist_counter.get(i, 0)
        
        unique_edit_dists = {}
        for prefix, dists in edit_distance_dict.items():
            unique_dists = set(dists)
            for dist in unique_dists:
                unique_edit_dists[dist] = unique_edit_dists.get(dist, 0) + 1
        
        for i in range(2):
            features[f'edit_distance_unique_dict_{i}'] = unique_edit_dists.get(i, 0)
    else:
        features['edit_distance_avg'] = 0
        features['edit_distance_max'] = 0
        for i in range(7):
            features[f'edit_distance_dict_{i}'] = 0
        for i in range(2):
            features[f'edit_distance_unique_dict_{i}'] = 0
    
    # =========================================================================
    # RARE AS FEATURES
    # =========================================================================
    if not announcements.empty and 'AS_Path' in announcements.columns:
        all_asns = []
        
        for as_path in announcements['AS_Path']:
            if pd.isnull(as_path) or as_path == '':
                continue
            
            as_path_str = str(as_path)
            
            if as_path_str.isdigit():
                all_asns.append(as_path_str)
                continue
            
            as_path_str = as_path_str.replace('{', '').replace('}', '')
            path_asns = [asn for asn in as_path_str.split() if asn.isdigit()]
            all_asns.extend(path_asns)
        
        asn_counts = Counter(all_asns)
        rare_threshold = 3
        rare_asns = [asn for asn, count in asn_counts.items() if count < rare_threshold]
        
        features['number_rare_ases'] = len(rare_asns)
        features['rare_ases_avg'] = len(rare_asns) / len(all_asns) if all_asns else 0
    else:
        features['number_rare_ases'] = 0
        features['rare_ases_avg'] = 0
    
    # =========================================================================
    # NADAS AND FLAPS - NOW ACTUALLY CALLED!
    # =========================================================================
    nadas_count, flap_count = calculate_nadas_and_flaps(df_window)
    features['nadas'] = nadas_count
    features['flaps'] = flap_count
    
    # =========================================================================
    # LABEL
    # =========================================================================
    if 'Label' in df_window.columns:
        labels = df_window['Label'].value_counts()
        if not labels.empty:
            if LABEL_STRATEGY == 'majority':
                features['label'] = labels.idxmax()
            elif LABEL_STRATEGY == 'conservative':
                if any(label != 'normal' for label in labels.index):
                    abnormal_labels = [label for label in labels.index if label != 'normal']
                    features['label'] = abnormal_labels[0]
                else:
                    features['label'] = 'normal'
            elif LABEL_STRATEGY == 'weighted':
                total = labels.sum()
                abnormal_weight = sum(count for label, count in labels.items() if label != 'normal') / total
                if abnormal_weight > 0.4:
                    abnormal_labels = [label for label in labels.index if label != 'normal']
                    features['label'] = abnormal_labels[0] if abnormal_labels else 'normal'
                else:
                    features['label'] = 'normal'
        else:
            features['label'] = 'unknown'
    else:
        features['label'] = 'unknown'
    
    return features


def process_bgp_data():
    """Process BGP data file and extract features with specified time window"""
    print(f"Reading input file: {INPUT_FILE}")
    df = pd.read_csv(INPUT_FILE)
    
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df = df.sort_values('Timestamp')
    
    start_time = df['Timestamp'].min()
    end_time = df['Timestamp'].max()
    
    print(f"Time range: {start_time} to {end_time}")
    
    df.set_index('Timestamp', inplace=True)
    
    all_features = []
    grouped = df.groupby(pd.Grouper(freq=WINDOW_SIZE))
    
    window_count = 0
    total_windows = len(grouped)
    
    for window_start, window_df in grouped:
        if not window_df.empty:
            window_df = window_df.reset_index()
            features = extract_features_fixed(window_df)
            
            if features:
                window_end = window_start + pd.Timedelta(WINDOW_SIZE)
                features['window_start'] = window_start
                features['window_end'] = window_end
                all_features.append(features)
                window_count += 1
                
                if window_count % 50 == 0:
                    print(f"Processed {window_count}/{total_windows} windows ({window_count/total_windows:.1%})...")
    
    print(f"Total windows processed: {window_count}")
    
    if all_features:
        features_df = pd.DataFrame(all_features)
        
        features_df.to_csv(OUTPUT_FILE, index=False)
        print(f"Features written to {OUTPUT_FILE}")
        
        # Print diagnostic
        print("\n" + "="*60)
        print("DIAGNOSTIC")
        print("="*60)
        print(f"Total withdrawals: {features_df['withdrawals'].sum()}")
        print(f"Total flaps: {features_df['flaps'].sum()}")
        print(f"Total nadas: {features_df['nadas'].sum()}")
        print(f"Total imp_wd: {features_df['imp_wd'].sum()}")
        print(f"Total imp_wd_spath: {features_df['imp_wd_spath'].sum()}")
        print(f"Total imp_wd_dpath: {features_df['imp_wd_dpath'].sum()}")
        print("="*60)
        
        return features_df
    else:
        print("No features extracted. Check input data.")
        return None


# Main execution
if __name__ == "__main__":
    features_df = process_bgp_data()

Reading input file: /home/smotaali/BGP_Traffic_Generation/results/bgp_updates_analysis_20251214_182312.csv


  df = pd.read_csv(INPUT_FILE)


Time range: 2025-12-14 12:25:41.125479 to 2025-12-15 12:25:55.831214
Processed 50/86415 windows (0.1%)...
Processed 100/86415 windows (0.1%)...
Processed 150/86415 windows (0.2%)...
Processed 200/86415 windows (0.2%)...
Processed 250/86415 windows (0.3%)...
Processed 300/86415 windows (0.3%)...
Processed 350/86415 windows (0.4%)...
Processed 400/86415 windows (0.5%)...
Processed 450/86415 windows (0.5%)...
Processed 500/86415 windows (0.6%)...
Processed 550/86415 windows (0.6%)...
Processed 600/86415 windows (0.7%)...
Processed 650/86415 windows (0.8%)...
Processed 700/86415 windows (0.8%)...
Processed 750/86415 windows (0.9%)...
Processed 800/86415 windows (0.9%)...
Processed 850/86415 windows (1.0%)...
Processed 900/86415 windows (1.0%)...
Processed 950/86415 windows (1.1%)...
Processed 1000/86415 windows (1.2%)...
Processed 1050/86415 windows (1.2%)...
Processed 1100/86415 windows (1.3%)...
Processed 1150/86415 windows (1.3%)...
Processed 1200/86415 windows (1.4%)...
Processed 1250/