# Unified BGP Feature Extraction

This notebook extracts time-window based features from BGP update data for anomaly detection.

**Features extracted:**
- Basic counts: announcements, withdrawals, nlri_ann, duplicates
- Origin attributes: origin_0 (IGP), origin_2 (INCOMPLETE), origin_changes
- Implicit withdrawals: imp_wd, imp_wd_spath, imp_wd_dpath
- AS path metrics: as_path_max, unique_as_path_max
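- Edit distance: avg, max, distribution (`edit_distance_dict_0` to `edit_distance_dict_6`), unique per-prefix distribution (`edit_distance_unique_dict_0`, `edit_distance_unique_dict_1`)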
- Rare AS metrics: number_rare_ases, rare_ases_avg
- Flaps and NADAS

**Works with multiple schemas:**

| Schema | Columns | Notes |
|--------|---------|-------|
| **Standard (new)** | Timestamp, Subtype, Origin, Peer_ASN, Communities | Full BGP Origin attribute support |
| **RIPE Old** | Time, Entry_Type, Origin_AS, Peer_AS, Community | Missing BGP Origin attribute |

**Important:** Old RIPE data has `Origin_AS` (the originating ASN from AS_Path) but is **missing** the BGP `Origin` attribute (IGP/EGP/INCOMPLETE), so the origin-based features (`origin_0`, `origin_2`) are reported as 0 for such data. Re-collect data with the updated collector to get full feature support.

## Configuration

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from pathlib import Path
from typing import Dict, List, Tuple, Optional

# =============================================================================
# CONFIGURATION - Modify these settings as needed
# =============================================================================

# Input/Output paths used by the single-file workflow (process_file)
INPUT_FILE = "/home/smotaali/BGP_Traffic_Generation/results/bgp_updates_analysis.csv"
OUTPUT_FILE = "/home/smotaali/BGP_Traffic_Generation/results/extracted_features_1s.csv"

# Root directory scanned by process_all_incidents: one sub-directory per
# incident, each containing "*_labeled.csv" files
RIPE_INCIDENTS_DIR = "/home/smotaali/BGP_Traffic_Generation/RIPE/RIPE_INCIDENTS"

# Feature extraction settings
# WINDOW_SIZE is used both as the pandas Grouper frequency and as a
# pd.Timedelta, so it must be a string both of them accept.
WINDOW_SIZE = '1s'  # Options: '1s', '5s', '30s', '1min', '5min', etc.
# How the per-record labels of a window are collapsed into one window label:
#   'majority'     - most frequent label in the window
#   'conservative' - any non-'normal' label wins (the most frequent one)
#   'weighted'     - non-'normal' wins only if it exceeds 40% of the records
LABEL_STRATEGY = 'majority'  # Options: 'majority', 'conservative', 'weighted'

# Schema type: 'auto', 'ripe_old', or 'standard' (values tested by prepare_dataframe)
SCHEMA_TYPE = 'auto'

# Echo the active settings so they are visible in the notebook output
print(f"Window size: {WINDOW_SIZE}")
print(f"Label strategy: {LABEL_STRATEGY}")

## Helper Functions

In [None]:
def _normalize_as_path(as_path) -> List[int]:
    """
    Convert an AS path in any supported representation to a list of integers.

    Handles:
    - Integer (Python or numpy): 65001 -> [65001]
    - Integral float: 65001.0 -> [65001]  (pandas stores a numeric column
      that contains NaN as float64, so single-AS paths arrive as floats)
    - String: "65001 65002 65003" -> [65001, 65002, 65003]
    - AS_SET: "65001 {65002 65003}" or "65001 {65002,65003}"
      -> [65001, 65002, 65003]
    - List / tuple: [65001, 65002] -> [65001, 65002]

    Anything else (None, NaN, pd.NA, non-integral floats, ...) yields [].
    Non-numeric tokens inside strings and sequences are skipped.
    """
    if isinstance(as_path, (int, np.integer)):
        return [int(as_path)]

    if isinstance(as_path, (float, np.floating)):
        # NaN/inf mean "no path"; only whole, non-negative values are ASNs.
        value = float(as_path)
        if np.isfinite(value) and value >= 0 and value.is_integer():
            return [int(value)]
        return []

    if isinstance(as_path, (list, tuple)):
        return [int(x) for x in as_path if str(x).isdigit()]

    if isinstance(as_path, str):
        # Turn AS_SET delimiters into whitespace so that both the
        # space-separated and the comma-separated set notation tokenize.
        for delimiter in '{},':
            as_path = as_path.replace(delimiter, ' ')
        return [int(asn) for asn in as_path.split() if asn.isdigit()]

    return []


def get_path_length(as_path) -> int:
    """
    Return the number of ASNs in an AS path (0 for missing or empty paths).

    The path is delegated to _normalize_as_path, which already maps None,
    NaN and '' to an empty list. Calling pd.isnull() here first would raise
    "truth value of an array is ambiguous" for list inputs.
    """
    return len(_normalize_as_path(as_path))

In [None]:
def calculate_edit_distance(as_path1, as_path2) -> int:
    """
    Levenshtein distance between two AS paths, measured in ASNs.

    Each insertion, deletion or substitution of a single ASN costs 1.

    Args:
        as_path1: First AS path (string, int, or list)
        as_path2: Second AS path (string, int, or list)

    Returns:
        The edit distance. 0 is also returned when either path is falsy or
        normalizes to an empty list.
    """
    if not (as_path1 and as_path2):
        return 0

    source = _normalize_as_path(as_path1)
    target = _normalize_as_path(as_path2)
    if not (source and target):
        return 0

    # Only the previous DP row is needed to compute the next one.
    # previous[j] = distance between the processed prefix of `source`
    # and the first j ASNs of `target`.
    previous = list(range(len(target) + 1))

    for i, src_asn in enumerate(source, start=1):
        current = [i]  # turning i ASNs into an empty path takes i deletions
        for j, dst_asn in enumerate(target, start=1):
            if src_asn == dst_asn:
                cost = previous[j - 1]
            else:
                cost = 1 + min(previous[j], current[j - 1], previous[j - 1])
            current.append(cost)
        previous = current

    return previous[-1]

In [None]:
def attributes_are_same(row1: pd.Series, row2: pd.Series) -> bool:
    """
    Tell whether two announcements carry identical BGP attributes.

    The attributes checked are AS_Path, Origin, Next_Hop, MED, Local_Pref
    and Communities. An attribute absent from either row is skipped, and
    two missing (NaN) values count as equal.

    Returns True when no compared attribute differs.
    """
    for name in ('AS_Path', 'Origin', 'Next_Hop', 'MED', 'Local_Pref', 'Communities'):
        # Only attributes present on both rows can be compared.
        if name not in row1.index or name not in row2.index:
            continue

        left = row1[name]
        right = row2[name]
        left_missing = pd.isna(left)
        right_missing = pd.isna(right)

        if left_missing and right_missing:
            # Missing on both sides -> treated as unchanged
            continue
        if left_missing or right_missing:
            # Present on one side only -> changed
            return False
        if left != right:
            return False

    return True

## NADAS and Flaps Calculation

**Definitions:**
- **FLAP**: A prefix is withdrawn and then re-announced with the **same** attributes
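- **NADAS**: A prefix is withdrawn and then re-announced with **different** attributes (a re-announcement after a withdrawal with no earlier announcement in the same window to compare against is also counted as NADAS)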

This requires tracking the state of each (prefix, peer) pair through the time window.

In [None]:
def calculate_nadas_and_flaps(df_window: pd.DataFrame) -> Tuple[int, int]:
    """
    Calculate NADAS and FLAPS from a BGP update sequence.

    NADAS: Withdrawal followed by Announcement with DIFFERENT attributes
    FLAP:  Withdrawal followed by Announcement with SAME attributes

    State is tracked per (Prefix, Peer_IP) pair. A re-announcement after a
    withdrawal for which the window holds no earlier announcement cannot be
    compared and is counted as NADAS.

    Args:
        df_window: DataFrame with BGP updates in a time window. Must contain
            the columns 'Timestamp', 'Subtype', 'Prefix' and 'Peer_IP'.

    Returns:
        Tuple of (nadas_count, flap_count)
    """
    nadas_count = 0
    flap_count = 0

    withdrawal_types = {'WITHDRAW', 'WITHDRAW_MP_UNREACH_NLRI_AFI2'}

    # A stable sort is essential: the default quicksort may reorder updates
    # that share a timestamp, turning a withdraw -> announce pair into
    # announce -> withdraw and losing the event.
    df_sorted = df_window.sort_values('Timestamp', kind='mergesort')

    # Per (prefix, peer) state:
    #   'withdrawn' - True while the latest update for the key is a withdrawal
    #   'last_ann'  - most recent announcement row seen for the key, or None
    prefix_state: Dict[Tuple, Dict] = {}

    for _, row in df_sorted.iterrows():
        subtype = row['Subtype']
        key = (row['Prefix'], row['Peer_IP'])

        if subtype == 'ANNOUNCE':
            state = prefix_state.setdefault(
                key, {'withdrawn': False, 'last_ann': None}
            )

            if state['withdrawn']:
                last_ann = state['last_ann']
                if last_ann is not None and attributes_are_same(last_ann, row):
                    flap_count += 1
                else:
                    # Different attributes, or nothing to compare against
                    nadas_count += 1

            state['last_ann'] = row
            state['withdrawn'] = False

        elif subtype in withdrawal_types:
            state = prefix_state.setdefault(
                key, {'withdrawn': False, 'last_ann': None}
            )
            state['withdrawn'] = True

    return nadas_count, flap_count

## Main Feature Extraction Function

This function extracts all 25+ features from a single time window of BGP updates.

In [None]:
def _count_duplicate_announcements(announcements: pd.DataFrame) -> int:
    """Count announcements that exactly repeat an earlier one in the window."""
    if announcements.empty:
        return 0

    dup_cols = ['Peer_IP', 'Peer_ASN', 'Prefix', 'AS_Path', 'Origin',
                'Next_Hop', 'MED', 'Local_Pref', 'Communities']
    dup_cols = [c for c in dup_cols if c in announcements.columns]
    if not dup_cols:
        return 0

    # duplicated() flags every repeat after the first occurrence, i.e.
    # sum(count - 1) per identical group. Unlike groupby() with its default
    # dropna=True, it does not discard rows whose MED / Communities / ...
    # are NaN, so duplicates with missing attributes are counted too.
    return int(announcements.duplicated(subset=dup_cols).sum())


def _origin_features(announcements: pd.DataFrame) -> Dict:
    """Count IGP / INCOMPLETE origins and prefixes seen with several origins."""
    if announcements.empty or 'Origin' not in announcements.columns:
        return {'origin_0': 0, 'origin_2': 0, 'origin_changes': 0}

    origin_counts = announcements['Origin'].value_counts()
    origins_per_prefix = announcements.groupby('Prefix')['Origin'].nunique()
    return {
        'origin_0': int(origin_counts.get('IGP', 0)),
        'origin_2': int(origin_counts.get('INCOMPLETE', 0)),
        # Prefixes announced with more than one distinct Origin value
        'origin_changes': int((origins_per_prefix > 1).sum()),
    }


def _attr_changed(prev_val, curr_val) -> bool:
    """Return True when an attribute value differs (two NaNs count as equal)."""
    prev_missing = pd.isna(prev_val)
    curr_missing = pd.isna(curr_val)
    if prev_missing and curr_missing:
        return False
    return bool(prev_missing or curr_missing or prev_val != curr_val)


def _implicit_withdrawal_stats(announcements: pd.DataFrame):
    """
    Walk consecutive announcements of each (Prefix, Peer_IP) pair.

    Returns a tuple (counts, edit_distances, distances_by_prefix) where
    counts holds imp_wd / imp_wd_spath / imp_wd_dpath, edit_distances lists
    the AS-path edit distance of every path change, and distances_by_prefix
    groups those distances by prefix.
    """
    counts = {'imp_wd': 0, 'imp_wd_spath': 0, 'imp_wd_dpath': 0}
    edit_distances: List[int] = []
    distances_by_prefix = defaultdict(list)

    if announcements.empty:
        return counts, edit_distances, distances_by_prefix

    compared = [c for c in ('AS_Path', 'Origin', 'Next_Hop', 'MED',
                            'Local_Pref', 'Communities')
                if c in announcements.columns]

    for (prefix, _peer), group in announcements.groupby(['Prefix', 'Peer_IP']):
        if len(group) < 2:
            continue

        # Stable sort keeps the original order of same-timestamp updates.
        ordered = group.sort_values('Timestamp', kind='mergesort')
        prev_row = None

        for _, row in ordered.iterrows():
            if prev_row is not None:
                changed = {attr for attr in compared
                           if _attr_changed(prev_row[attr], row[attr])}

                if changed:
                    counts['imp_wd'] += 1

                    if 'AS_Path' in changed:
                        counts['imp_wd_dpath'] += 1

                        # A missing path is passed as '' so the distance
                        # helper reports 0 for it.
                        prev_path = prev_row['AS_Path']
                        curr_path = row['AS_Path']
                        dist = calculate_edit_distance(
                            '' if pd.isna(prev_path) else prev_path,
                            '' if pd.isna(curr_path) else curr_path,
                        )
                        edit_distances.append(dist)
                        distances_by_prefix[prefix].append(dist)
                    else:
                        # Same AS_Path, some other attribute changed
                        counts['imp_wd_spath'] += 1

            prev_row = row

    return counts, edit_distances, distances_by_prefix


def _as_path_features(announcements: pd.DataFrame) -> Dict:
    """Longest AS path and the maximum number of distinct paths per prefix."""
    result = {'as_path_max': 0, 'unique_as_path_max': 0}
    if announcements.empty or 'AS_Path' not in announcements.columns:
        return result

    paths = announcements['AS_Path']
    valid_paths = announcements[paths.notna() & (paths != '')]
    if valid_paths.empty:
        return result

    result['as_path_max'] = valid_paths['AS_Path'].apply(get_path_length).max()

    unique_paths_per_prefix = valid_paths.groupby('Prefix')['AS_Path'].nunique()
    if not unique_paths_per_prefix.empty:
        result['unique_as_path_max'] = unique_paths_per_prefix.max()
    return result


def _edit_distance_features(edit_distances, distances_by_prefix) -> Dict:
    """Average, maximum and histograms of the collected edit distances."""
    result = {}
    if edit_distances:
        result['edit_distance_avg'] = float(np.mean(edit_distances))
        result['edit_distance_max'] = max(edit_distances)
    else:
        result['edit_distance_avg'] = 0
        result['edit_distance_max'] = 0

    # Histogram over all path changes, distances 0..6
    overall = Counter(edit_distances)
    for i in range(7):
        result[f'edit_distance_dict_{i}'] = overall.get(i, 0)

    # Number of prefixes that showed each distance at least once, 0..1
    per_prefix = Counter()
    for dists in distances_by_prefix.values():
        per_prefix.update(set(dists))
    for i in range(2):
        result[f'edit_distance_unique_dict_{i}'] = per_prefix.get(i, 0)

    return result


def _rare_as_features(announcements: pd.DataFrame, rare_threshold: int = 3) -> Dict:
    """Count ASNs appearing fewer than rare_threshold times in the window."""
    result = {'number_rare_ases': 0, 'rare_ases_avg': 0}
    if announcements.empty or 'AS_Path' not in announcements.columns:
        return result

    asn_counts = Counter()
    for as_path in announcements['AS_Path']:
        # Missing / empty paths normalize to [] and contribute nothing.
        asn_counts.update(_normalize_as_path(as_path))

    total_asns = sum(asn_counts.values())
    if not total_asns:
        return result

    rare = sum(1 for count in asn_counts.values() if count < rare_threshold)
    result['number_rare_ases'] = rare
    # Ratio of distinct rare ASNs to all ASN occurrences in the window
    result['rare_ases_avg'] = rare / total_asns
    return result


def _aggregate_label(label_column: pd.Series, strategy: str):
    """Collapse the per-record labels of a window into a single label."""
    labels = label_column.value_counts()
    if labels.empty:
        return 'unknown'

    # value_counts() is ordered by frequency, so abnormal[0] is the most
    # frequent non-normal label.
    abnormal = [l for l in labels.index if l != 'normal']

    if strategy == 'conservative':
        return abnormal[0] if abnormal else 'normal'

    if strategy == 'weighted':
        abnormal_weight = sum(c for l, c in labels.items() if l != 'normal') / labels.sum()
        if abnormal_weight > 0.4 and abnormal:
            return abnormal[0]
        return 'normal'

    # 'majority' and any unrecognized strategy
    return labels.idxmax()


def extract_features(df_window: pd.DataFrame) -> Dict:
    """
    Extract BGP features from a time window.

    Args:
        df_window: DataFrame containing BGP updates within a time window.
            Must contain 'Subtype', 'Prefix', 'Peer_IP' and 'Timestamp';
            attribute columns (AS_Path, Origin, Next_Hop, MED, Local_Pref,
            Communities), 'Label' and 'Incident' are used when present.

    Returns:
        Dictionary of feature name -> value. Keys are inserted in a fixed
        order (counts, dups, origin, implicit withdrawals, AS path, edit
        distance, rare AS, nadas/flaps, label, Incident) so that the column
        order of the resulting feature table is stable.
    """
    features = {}

    # Separate announcements and withdrawals
    withdrawal_types = ['WITHDRAW', 'WITHDRAW_MP_UNREACH_NLRI_AFI2']
    announcements = df_window[df_window['Subtype'] == 'ANNOUNCE']
    withdrawals = df_window[df_window['Subtype'].isin(withdrawal_types)]

    # Basic counts
    features['announcements'] = len(announcements)
    features['withdrawals'] = len(withdrawals)
    # NLRI_ANN: number of unique prefixes announced (NOT total announcements)
    features['nlri_ann'] = announcements['Prefix'].nunique()

    features['dups'] = _count_duplicate_announcements(announcements)
    features.update(_origin_features(announcements))

    # Implicit withdrawals: a prefix re-announced by the same peer with
    # different attributes (replacing the previous announcement)
    imp_counts, edit_distances, distances_by_prefix = \
        _implicit_withdrawal_stats(announcements)
    features.update(imp_counts)

    features.update(_as_path_features(announcements))
    features.update(_edit_distance_features(edit_distances, distances_by_prefix))
    features.update(_rare_as_features(announcements))

    # NADAS and flaps need withdrawals too, hence the full window
    nadas_count, flap_count = calculate_nadas_and_flaps(df_window)
    features['nadas'] = nadas_count
    features['flaps'] = flap_count

    # Label aggregation
    if 'Label' in df_window.columns:
        features['label'] = _aggregate_label(df_window['Label'], LABEL_STRATEGY)
    else:
        features['label'] = 'unknown'

    # Keep incident name if present (guard against an empty window)
    if 'Incident' in df_window.columns and not df_window.empty:
        features['Incident'] = df_window['Incident'].iloc[0]

    return features

## Data Processing Functions

In [None]:
def prepare_dataframe(df: pd.DataFrame, schema_type: str = 'auto') -> pd.DataFrame:
    """
    Prepare DataFrame by normalizing column names and data types.

    Supported schema types:
    1. 'standard' - New unified schema (Timestamp, Subtype, Origin, Peer_ASN, Communities)
    2. 'ripe_old' - Old RIPE schema (Time, Entry_Type, Origin_AS, Peer_AS, Community);
       'ripe' is accepted as an alias
    3. 'auto' - Auto-detect based on column names

    An unrecognized schema_type falls back to auto-detection. If the old
    RIPE schema is requested but the frame has no 'Time' column while it
    does have 'Timestamp', the standard schema is used instead.

    Args:
        df: Input DataFrame (not modified; a copy is returned)
        schema_type: 'ripe_old' (or 'ripe'), 'standard', or 'auto'

    Returns:
        Prepared DataFrame with standardized columns. Missing required
        columns are reported with a printed warning, not an exception.
    """
    df = df.copy()
    columns = set(df.columns)

    # 'ripe' is the spelling used by callers and docs elsewhere in this
    # notebook; without the alias it would silently be handled as standard.
    if schema_type == 'ripe':
        schema_type = 'ripe_old'
    elif schema_type not in ('auto', 'ripe_old', 'standard'):
        print(f"WARNING: Unknown schema_type {schema_type!r}, using auto-detection")
        schema_type = 'auto'

    # Auto-detect schema type
    if schema_type == 'auto':
        if 'Time' in columns and 'Entry_Type' in columns:
            schema_type = 'ripe_old'
            print("Detected OLD RIPE schema (Time, Entry_Type, Origin_AS)")
        elif 'Timestamp' in columns and 'Subtype' in columns:
            schema_type = 'standard'
            print("Detected STANDARD schema (Timestamp, Subtype, Origin)")
        elif 'Time' in columns:
            schema_type = 'ripe_old'
            print("Detected OLD RIPE schema (fallback)")
        else:
            schema_type = 'standard'
            print("Detected STANDARD schema (fallback)")
    elif schema_type == 'ripe_old' and 'Time' not in columns and 'Timestamp' in columns:
        # Data was already collected in the new format.
        print("WARNING: RIPE schema requested but no 'Time' column found; "
              "using STANDARD schema")
        schema_type = 'standard'

    # =========================================================================
    # OLD RIPE SCHEMA MAPPING
    # Columns: Time, Entry_Type (A/W), Peer_AS, Origin_AS, Community
    # Note: Origin_AS is the originating ASN, NOT the BGP Origin attribute!
    # =========================================================================
    if schema_type == 'ripe_old':
        # Time -> Timestamp
        df['Timestamp'] = pd.to_datetime(df['Time'])

        # Entry_Type (A/W) -> Subtype (ANNOUNCE/WITHDRAW); anything else,
        # including missing values, becomes 'UNKNOWN'
        if 'Entry_Type' in df.columns:
            df['Subtype'] = (
                df['Entry_Type']
                .map({'A': 'ANNOUNCE', 'W': 'WITHDRAW'})
                .fillna('UNKNOWN')
            )

        # Column renames for old RIPE schema (absent columns are ignored)
        df = df.rename(columns={'Peer_AS': 'Peer_ASN', 'Community': 'Communities'})

        # IMPORTANT: Old RIPE schema has Origin_AS (originating ASN),
        # but NOT the BGP Origin attribute (IGP/EGP/INCOMPLETE).
        # Mark the attribute as missing with an empty string.
        if 'Origin_AS' in df.columns and 'Origin' not in df.columns:
            df['Origin'] = ''  # Set to empty - this data is missing
            print("WARNING: Old RIPE schema missing BGP Origin attribute (IGP/EGP/INCOMPLETE)")
            print("         Origin-based features (origin_0, origin_2) will be 0")

    # =========================================================================
    # STANDARD SCHEMA (new collectors output this format)
    # Columns: Timestamp, Subtype, Peer_ASN, Origin, Origin_ASN, Communities
    # =========================================================================
    else:
        # All columns should already have correct names
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])

    # Final validation - ensure required columns exist
    required_cols = ['Timestamp', 'Subtype', 'Prefix', 'Peer_IP']
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        print(f"WARNING: Missing required columns: {missing}")

    return df

In [None]:
def process_file(input_path: str, output_path: Optional[str] = None,
                 schema_type: str = 'auto') -> Optional[pd.DataFrame]:
    """
    Process a single BGP data file and extract features per time window.

    The file is read as CSV, normalized with prepare_dataframe, split into
    consecutive windows of WINDOW_SIZE and each non-empty window is passed
    to extract_features. Progress and summary diagnostics are printed.

    Args:
        input_path: Path to input CSV file
        output_path: Path to output features CSV (optional; nothing is
            written when omitted)
        schema_type: Schema hint forwarded to prepare_dataframe
            ('auto', 'ripe_old' or 'standard')

    Returns:
        DataFrame with one row of features per non-empty window (plus
        'window_start' / 'window_end'), or None if no window produced
        features.
    """
    print(f"Reading {input_path}...")
    df = pd.read_csv(input_path, low_memory=False)
    print(f"Loaded {len(df)} records")

    # Prepare data
    df = prepare_dataframe(df, schema_type)

    # Stable sort: records sharing a timestamp keep their file order, which
    # the order-dependent features (flaps, NADAS, implicit withdrawals)
    # rely on. The default quicksort gives no such guarantee.
    df = df.sort_values('Timestamp', kind='mergesort')

    start_time = df['Timestamp'].min()
    end_time = df['Timestamp'].max()
    print(f"Time range: {start_time} to {end_time}")

    df = df.set_index('Timestamp')

    # Process windows
    features_list = []
    count = 0

    grouped = df.groupby(pd.Grouper(freq=WINDOW_SIZE))
    total_windows = len(grouped)  # includes empty windows
    window_span = pd.Timedelta(WINDOW_SIZE)  # loop-invariant

    for window_start, window_df in grouped:
        if window_df.empty:
            continue

        # extract_features expects 'Timestamp' as a regular column
        window_features = extract_features(window_df.reset_index())

        if window_features:
            window_features['window_start'] = window_start
            window_features['window_end'] = window_start + window_span
            features_list.append(window_features)
            count += 1

            if count % 500 == 0:
                print(f"  Processed {count} windows ({count/total_windows*100:.1f}%)...")

    print(f"Total windows processed: {count}")

    if not features_list:
        print("No features extracted!")
        return None

    out_df = pd.DataFrame(features_list)

    if output_path:
        out_df.to_csv(output_path, index=False)
        print(f"Saved features to {output_path}")

    # Print diagnostics
    print("\n" + "="*60)
    print("DIAGNOSTICS")
    print("="*60)
    print(f"Total announcements: {out_df['announcements'].sum()}")
    print(f"Total withdrawals:   {out_df['withdrawals'].sum()}")
    print(f"Total flaps:         {out_df['flaps'].sum()}")
    print(f"Total nadas:         {out_df['nadas'].sum()}")
    print(f"Total imp_wd:        {out_df['imp_wd'].sum()}")
    print(f"  - imp_wd_spath:    {out_df['imp_wd_spath'].sum()}")
    print(f"  - imp_wd_dpath:    {out_df['imp_wd_dpath'].sum()}")
    print("="*60)

    return out_df

In [None]:
def process_all_incidents(base_dir: str) -> Dict[str, pd.DataFrame]:
    """
    Process all RIPE incident directories.

    Every sub-directory of base_dir is treated as one incident. Each
    "*_labeled.csv" file inside it is run through process_file and its
    features are written next to it as "<stem>_features.csv".

    Args:
        base_dir: Path to RIPE_INCIDENTS directory

    Returns:
        Dictionary mapping incident name to features DataFrame. If an
        incident directory holds several labeled files, the entry refers to
        the last one processed (files are handled in sorted order and a
        warning is printed); all feature CSVs are still written to disk.
    """
    base_path = Path(base_dir)
    results = {}

    for incident_dir in sorted(base_path.iterdir()):
        if not incident_dir.is_dir():
            continue

        print(f"\n{'='*60}")
        print(f"Processing incident: {incident_dir.name}")
        print(f"{'='*60}")

        # Sorted for a deterministic processing order across file systems
        for csv_path in sorted(incident_dir.glob("*_labeled.csv")):
            out_path = incident_dir / (csv_path.stem + "_features.csv")

            # Uncomment to skip existing files
            # if out_path.exists():
            #     print(f"Skipping (exists): {out_path}")
            #     continue

            # 'auto' lets prepare_dataframe pick the old RIPE or the standard
            # schema from the columns. The previous value 'ripe' is not a
            # schema name prepare_dataframe tests for, so old RIPE files were
            # handled as standard data.
            features_df = process_file(str(csv_path), str(out_path), schema_type='auto')
            if features_df is not None:
                if incident_dir.name in results:
                    print(f"WARNING: multiple labeled files for incident "
                          f"{incident_dir.name}; keeping features of {csv_path.name}")
                results[incident_dir.name] = features_df

    return results

## Option 1: Process a Single File

Use this to process a single BGP data file (either normal or incident data).

In [None]:
# Process single file
# Uncomment and modify the paths as needed

# features_df = process_file(INPUT_FILE, OUTPUT_FILE, schema_type=SCHEMA_TYPE)
# features_df.head()

## Option 2: Process All RIPE Incidents

Use this to batch process all incident directories.

In [None]:
# Process all RIPE incidents
# Uncomment to run

# all_results = process_all_incidents(RIPE_INCIDENTS_DIR)
# print(f"\nProcessed {len(all_results)} incidents")

## Feature Analysis (Optional)

After extraction, you can analyze the features.

In [None]:
def analyze_features(features_df: pd.DataFrame):
    """
    Print a statistical overview of an extracted feature table.

    For every numeric column the mean, standard deviation, minimum, maximum
    and the percentage of non-zero rows are printed. If a 'label' column is
    present, the count and share of each label follow. Nothing is returned.
    """
    row_count = len(features_df)

    print("Feature Summary Statistics")
    print("="*60)

    # Restrict the statistics to numeric columns
    numeric_part = features_df.select_dtypes(include=[np.number])

    stats = numeric_part.describe().transpose()
    stats['non_zero'] = numeric_part.ne(0).sum()
    stats['non_zero_pct'] = stats['non_zero'] / row_count * 100

    shown = stats[['mean', 'std', 'min', 'max', 'non_zero_pct']]
    print(shown.round(2).to_string())

    if 'label' not in features_df.columns:
        return

    # Label distribution
    print("\nLabel Distribution")
    print("-"*40)
    for label, count in features_df['label'].value_counts().items():
        share = count / row_count * 100
        print(f"  {label}: {count} ({share:.1f}%)")

# Example usage:
# analyze_features(features_df)

## Quick Test

Test the feature extraction with sample data.

In [None]:
# Five hand-written updates that all fall into one 1-second window:
# an announcement, a re-announcement with a different AS path, a withdrawal,
# a re-announcement after the withdrawal, and an unrelated second prefix.
sample_columns = ['Timestamp', 'Subtype', 'Prefix', 'Peer_IP', 'Peer_ASN',
                  'AS_Path', 'Origin', 'Next_Hop', 'Label']
sample_rows = [
    ('2025-01-01 00:00:00.100', 'ANNOUNCE', '10.0.0.0/24', '192.168.1.1', 65001,
     '65001 65002', 'IGP', '192.168.1.1', 'normal'),
    ('2025-01-01 00:00:00.200', 'ANNOUNCE', '10.0.0.0/24', '192.168.1.1', 65001,
     '65001 65003', 'IGP', '192.168.1.1', 'normal'),
    ('2025-01-01 00:00:00.300', 'WITHDRAW', '10.0.0.0/24', '192.168.1.1', 65001,
     '', '', '', 'normal'),
    ('2025-01-01 00:00:00.400', 'ANNOUNCE', '10.0.0.0/24', '192.168.1.1', 65001,
     '65001 65002', 'IGP', '192.168.1.1', 'normal'),
    ('2025-01-01 00:00:00.500', 'ANNOUNCE', '10.0.1.0/24', '192.168.1.1', 65001,
     '65001 65004', 'INCOMPLETE', '192.168.1.1', 'normal'),
]

test_data = pd.DataFrame(sample_rows, columns=sample_columns)
test_data['Timestamp'] = pd.to_datetime(test_data['Timestamp'])

print("Test Data:")
print(test_data.to_string(index=False))
print()

# Run the extractor on the whole sample as a single window
test_features = extract_features(test_data)

print("Extracted Features:")
for feature_name in sorted(test_features):
    print(f"  {feature_name}: {test_features[feature_name]}")