# Unified BGP Feature Extraction

This notebook extracts consistent features from BGP update data for both:
- **Normal traffic data** (from RIPE RRC collectors)
- **Anomaly/Incident data** (from labeled incident datasets)

## Features Extracted (27 total)

| Category | Features |
|----------|----------|
| Volume | announcements, withdrawals, nlri_ann, dups |
| Origin | origin_0 (IGP), origin_2 (INCOMPLETE), origin_changes |
| Implicit Withdrawals | imp_wd, imp_wd_spath, imp_wd_dpath |
| AS Path | as_path_max, unique_as_path_max |
| Edit Distance | edit_distance_avg, edit_distance_max, edit_distance_dict_0 … edit_distance_dict_6 (7 buckets), edit_distance_unique_dict_0 … edit_distance_unique_dict_1 (2 buckets) |
| Rare AS | number_rare_ases, rare_ases_avg |
| Stability | nadas, flaps |

## 1. Setup and Configuration

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas/matplotlib deprecation and
# chained-assignment warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

# Set style: matplotlib style sheet plus seaborn colour palette used by the
# histogram cell further down.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("Libraries loaded successfully!")

In [None]:
# =============================================================================
# CONFIGURATION - MODIFY THESE PATHS FOR YOUR ENVIRONMENT
# =============================================================================

# Mode selection: 'incident' or 'normal'
# The processing cells below compare this value against exactly these two
# strings to decide which branch runs.
PROCESSING_MODE = 'incident'  # Change to 'normal' for normal traffic

# For incident data processing
# Every sub-directory of this path (except one named 'temp_mrt') is scanned for
# a '*_labeled.csv' file; the merged anomaly CSV is also written here.
INCIDENT_BASE_DIR = Path("/home/smotaali/BGP_Traffic_Generation/RIPE/RIPE_INCIDENTS")

# For normal traffic processing
# Input CSV that is read, and output CSV that the extracted features are written to.
NORMAL_INPUT_FILE = Path("/home/smotaali/BGP_Traffic_Generation/RIPE/normal_traffic.csv")
NORMAL_OUTPUT_FILE = Path("/home/smotaali/BGP_Traffic_Generation/RIPE/normal_traffic_features.csv")

# Feature extraction settings
# WINDOW_SIZE is handed to both pd.Grouper(freq=...) and pd.Timedelta(...) in
# process_single_file, so it must be a string both of them accept.
WINDOW_SIZE = '1s'  # Time window for feature aggregation
# Compared with a strict '<' against per-window ASN occurrence counts.
RARE_AS_THRESHOLD = 3  # ASNs appearing less than this are considered "rare"
# When True, an incident whose '<stem>_features.csv' already exists is not
# re-processed; the existing file is loaded for the summary instead.
SKIP_EXISTING = True  # Skip incidents that already have features extracted

print(f"Processing mode: {PROCESSING_MODE}")
print(f"Window size: {WINDOW_SIZE}")
print(f"Rare AS threshold: {RARE_AS_THRESHOLD}")

## 2. Helper Functions

In [None]:
def calculate_edit_distance(as_path1, as_path2):
    """
    Return the Levenshtein distance between two AS paths, counted in ASNs.

    Each path may be an int (single ASN), a list (used as-is) or a
    whitespace-separated string; curly braces are stripped from strings and
    non-numeric tokens are dropped. Any other type is treated as an empty
    path. If either path is falsy or normalises to an empty list, the
    distance is reported as 0.
    """
    if not (as_path1 and as_path2):
        return 0

    def _as_hops(raw):
        # Normalise the supported representations to a list of hops.
        if isinstance(raw, int):
            return [raw]
        if isinstance(raw, list):
            return raw
        if isinstance(raw, str):
            tokens = raw.replace('{', '').replace('}', '').split()
            return [int(tok) for tok in tokens if tok.isdigit()]
        return []

    hops_a = _as_hops(as_path1)
    hops_b = _as_hops(as_path2)

    if not hops_a or not hops_b:
        return 0

    # Classic dynamic programme, keeping only the previous row of the table.
    previous_row = list(range(len(hops_b) + 1))
    for row_no, hop_a in enumerate(hops_a, start=1):
        current_row = [row_no]
        for col_no, hop_b in enumerate(hops_b, start=1):
            if hop_a == hop_b:
                cost = previous_row[col_no - 1]
            else:
                cost = 1 + min(
                    previous_row[col_no],       # deletion
                    current_row[col_no - 1],    # insertion
                    previous_row[col_no - 1],   # substitution
                )
            current_row.append(cost)
        previous_row = current_row

    return previous_row[len(hops_b)]


def get_path_length(as_path):
    """
    Get the length of an AS path, counted in ASNs.

    Accepted representations (the same ones calculate_edit_distance handles):
      - list / tuple of hops  -> number of elements
      - int or NumPy integer  -> 1 (a single-ASN path)
      - str                   -> number of numeric, whitespace-separated
                                 tokens after curly braces are stripped

    Anything else (None, NaN, other types) and the empty string yield 0.
    """
    # Sequences are handled before any scalar test: calling pd.isnull() on a
    # list returns an array, whose truth value is ambiguous and raises.
    if isinstance(as_path, (list, tuple)):
        return len(as_path)

    # A column holding only single-ASN paths is parsed by pandas as an integer
    # dtype, so rows carry NumPy integers rather than Python ints.
    if isinstance(as_path, (int, np.integer)):
        return 1

    if isinstance(as_path, str):
        cleaned = as_path.replace('{', '').replace('}', '')
        return sum(1 for token in cleaned.split() if token.isdigit())

    # None, NaN and unsupported types carry no path information.
    return 0


def attributes_are_same(row1, row2, attrs=None):
    """
    Tell whether two announcement rows carry identical BGP attributes.

    Only attributes present in the index of both rows are compared. Two
    missing values count as equal; a missing value on one side only, or two
    differing values, make the rows different. When `attrs` is None the
    default attribute list below is used.
    """
    names = attrs
    if names is None:
        names = ['AS_Path', 'Origin', 'Next_Hop', 'MED', 'Local_Pref', 'Communities']

    for name in names:
        # Attributes absent from either row are ignored.
        if name not in row1.index or name not in row2.index:
            continue

        left, right = row1[name], row2[name]
        left_missing = pd.isna(left)
        right_missing = pd.isna(right)

        if left_missing and right_missing:
            continue
        if left_missing or right_missing:
            return False
        if left != right:
            return False

    return True

print("Helper functions defined!")

In [None]:
def calculate_nadas_and_flaps(df_window, entry_type_col='Entry_Type'):
    """
    Count NADAS and FLAPS events in one window with per-route state tracking.

    Rows are walked in 'Timestamp' order and state is kept per
    (Prefix, Peer_IP) pair. An announcement that arrives while the pair is
    marked withdrawn is counted as:

      FLAPS: the attributes equal those of the pair's previous announcement
      NADAS: the attributes differ, or no previous announcement was seen
             in this window

    Returns the tuple (nadas_count, flap_count).
    """
    announce_codes = ('A', 'ANNOUNCE')
    withdraw_codes = ('W', 'WITHDRAW', 'WITHDRAW_MP_UNREACH_NLRI_AFI2')

    nadas_total = 0
    flaps_total = 0
    tracker = {}  # (prefix, peer) -> {'last_ann': row or None, 'withdrawn': bool}

    for _, update in df_window.sort_values('Timestamp').iterrows():
        route = (update.get('Prefix', ''), update.get('Peer_IP', ''))
        kind = update.get(entry_type_col, '')

        if kind in announce_codes:
            state = tracker.get(route)
            if state is not None and state.get('withdrawn', False):
                previous = state.get('last_ann')
                if previous is not None and attributes_are_same(previous, update):
                    flaps_total += 1
                else:
                    nadas_total += 1

            # Every announcement becomes the new reference and clears the
            # withdrawn flag for this route.
            tracker[route] = {'last_ann': update, 'withdrawn': False}

        elif kind in withdraw_codes:
            state = tracker.setdefault(route, {'last_ann': None, 'withdrawn': False})
            state['withdrawn'] = True

    return nadas_total, flaps_total

print("NADAS/FLAPS function defined!")

## 3. Main Feature Extraction Function

In [None]:
def extract_features(df_window, entry_type_col='Entry_Type'):
    """
    Extract all BGP features from a time window.

    Parameters
    ----------
    df_window : DataFrame holding the update records of one window. The
        column named by `entry_type_col` must exist. When the window contains
        announcements, 'Prefix' is required, and 'Peer_IP' and 'Timestamp'
        are required as soon as at least one compared attribute column is
        present. calculate_nadas_and_flaps additionally sorts on 'Timestamp'.
        All other attribute columns are optional and skipped when absent.
    entry_type_col : name of the column holding the update type; values in
        ('A', 'ANNOUNCE') are announcements and values in
        ('W', 'WITHDRAW', 'WITHDRAW_MP_UNREACH_NLRI_AFI2') are withdrawals.

    Returns
    -------
    dict with the 27 numeric features (volume, origin, implicit withdrawals,
    AS-path metrics, edit-distance statistics, rare-AS statistics, nadas and
    flaps), a 'label' entry, and an 'Incident' entry when the window has an
    'Incident' column.

    Reads the module-level RARE_AS_THRESHOLD and calls
    calculate_edit_distance, get_path_length and calculate_nadas_and_flaps.
    """
    features = {}

    announce_types = ['A', 'ANNOUNCE']
    withdrawal_types = ['W', 'WITHDRAW', 'WITHDRAW_MP_UNREACH_NLRI_AFI2']

    announcements = df_window[df_window[entry_type_col].isin(announce_types)]
    withdrawals = df_window[df_window[entry_type_col].isin(withdrawal_types)]

    # -------------------------------------------------------------------------
    # 1-3. VOLUME FEATURES
    # -------------------------------------------------------------------------
    features['announcements'] = len(announcements)
    features['withdrawals'] = len(withdrawals)
    # Number of distinct prefixes announced in the window.
    features['nlri_ann'] = announcements['Prefix'].nunique() if not announcements.empty else 0

    # -------------------------------------------------------------------------
    # 4. DUPLICATES
    # -------------------------------------------------------------------------
    if not announcements.empty:
        dup_cols = ['Peer_IP', 'Peer_ASN', 'Prefix', 'AS_Path', 'Origin',
                    'Next_Hop', 'MED', 'Local_Pref', 'Communities']
        # Only compare on the columns this dataset actually provides.
        dup_cols = [c for c in dup_cols if c in announcements.columns]

        if dup_cols:
            # dropna=False keeps rows with missing attributes in the grouping,
            # so two announcements that are both missing e.g. MED still match.
            counts = announcements.groupby(dup_cols, dropna=False).size()
            # Every repetition beyond the first occurrence counts as one duplicate.
            features['dups'] = sum(c - 1 for c in counts if c > 1)
        else:
            features['dups'] = 0
    else:
        features['dups'] = 0

    # -------------------------------------------------------------------------
    # 5-7. ORIGIN ATTRIBUTES
    # -------------------------------------------------------------------------
    if not announcements.empty and 'Origin' in announcements.columns:
        origin_counts = announcements['Origin'].value_counts()
        # NOTE(review): these lookups expect the literal strings 'IGP' and
        # 'INCOMPLETE' in the 'Origin' column; any other encoding yields 0.
        features['origin_0'] = origin_counts.get('IGP', 0)
        features['origin_2'] = origin_counts.get('INCOMPLETE', 0)
        # Prefixes announced with more than one distinct Origin value in this window.
        unique_origins = announcements.groupby('Prefix')['Origin'].nunique()
        features['origin_changes'] = (unique_origins > 1).sum()
    else:
        features['origin_0'] = 0
        features['origin_2'] = 0
        features['origin_changes'] = 0

    # -------------------------------------------------------------------------
    # 8-10. IMPLICIT WITHDRAWALS
    # -------------------------------------------------------------------------
    imp_wd = 0
    imp_wd_spath = 0
    imp_wd_dpath = 0
    # Edit distances are collected here and summarised in the edit-distance
    # section below: a flat list, and a per-prefix grouping.
    edit_distances = []
    edit_distance_dict = defaultdict(list)

    attrs_to_check = ['AS_Path', 'Origin', 'Next_Hop', 'MED', 'Local_Pref', 'Communities']
    attrs_available = [a for a in attrs_to_check if a in announcements.columns]

    if not announcements.empty and len(attrs_available) > 0:
        # Consecutive announcements of the same prefix from the same peer are
        # compared pairwise in time order.
        for (prefix, peer), grp in announcements.groupby(['Prefix', 'Peer_IP']):
            if len(grp) < 2:
                continue

            grp = grp.sort_values('Timestamp')
            prev = None

            for _, row in grp.iterrows():
                if prev is not None:
                    changed = False
                    as_path_changed = False

                    for attr in attrs_available:
                        pv, cv = prev.get(attr), row.get(attr)
                        pv_nan, cv_nan = pd.isna(pv), pd.isna(cv)

                        # Missing on both sides is treated as "unchanged".
                        if pv_nan and cv_nan:
                            continue
                        if pv_nan or cv_nan or pv != cv:
                            changed = True
                            if attr == 'AS_Path':
                                as_path_changed = True

                    if changed:
                        imp_wd += 1
                        if as_path_changed:
                            # Different path: also record how far the path moved.
                            imp_wd_dpath += 1
                            d = calculate_edit_distance(
                                prev.get('AS_Path', ''),
                                row.get('AS_Path', '')
                            )
                            edit_distances.append(d)
                            edit_distance_dict[prefix].append(d)
                        else:
                            # Same path, some other attribute changed.
                            imp_wd_spath += 1

                prev = row

    features['imp_wd'] = imp_wd
    features['imp_wd_spath'] = imp_wd_spath
    features['imp_wd_dpath'] = imp_wd_dpath

    # -------------------------------------------------------------------------
    # 11-12. AS PATH METRICS
    # -------------------------------------------------------------------------
    if not announcements.empty and 'AS_Path' in announcements.columns:
        valid_paths = announcements[
            announcements['AS_Path'].notna() &
            (announcements['AS_Path'] != '')
        ]

        if not valid_paths.empty:
            lengths = valid_paths['AS_Path'].apply(get_path_length)
            features['as_path_max'] = int(lengths.max()) if not lengths.empty else 0
            # Largest number of distinct AS paths seen for any single prefix.
            unique_paths = valid_paths.groupby('Prefix')['AS_Path'].nunique()
            features['unique_as_path_max'] = int(unique_paths.max()) if not unique_paths.empty else 0
        else:
            features['as_path_max'] = 0
            features['unique_as_path_max'] = 0
    else:
        features['as_path_max'] = 0
        features['unique_as_path_max'] = 0

    # -------------------------------------------------------------------------
    # 13-23. EDIT DISTANCE FEATURES
    # -------------------------------------------------------------------------
    if edit_distances:
        features['edit_distance_avg'] = float(np.mean(edit_distances))
        features['edit_distance_max'] = int(max(edit_distances))

        # Histogram of distances 0..6; larger distances are not bucketed.
        dist_counter = Counter(edit_distances)
        for i in range(7):
            features[f'edit_distance_dict_{i}'] = dist_counter.get(i, 0)

        # Per distance value: number of prefixes that showed it at least once.
        unique_ed = defaultdict(int)
        for prefix, dists in edit_distance_dict.items():
            for d in set(dists):
                unique_ed[d] += 1

        for i in range(2):
            features[f'edit_distance_unique_dict_{i}'] = unique_ed.get(i, 0)
    else:
        features['edit_distance_avg'] = 0.0
        features['edit_distance_max'] = 0
        for i in range(7):
            features[f'edit_distance_dict_{i}'] = 0
        for i in range(2):
            features[f'edit_distance_unique_dict_{i}'] = 0

    # -------------------------------------------------------------------------
    # 24-25. RARE AS FEATURES
    # -------------------------------------------------------------------------
    if not announcements.empty and 'AS_Path' in announcements.columns:
        all_asns = []

        for as_path in announcements['AS_Path']:
            if pd.isnull(as_path) or as_path == '':
                continue
            path_str = str(as_path).replace('{', '').replace('}', '')
            all_asns.extend([a for a in path_str.split() if a.isdigit()])

        if all_asns:
            asn_counts = Counter(all_asns)
            # "Rare" is relative to this window only: fewer than
            # RARE_AS_THRESHOLD occurrences across all announced paths.
            rare_asns = [a for a, c in asn_counts.items() if c < RARE_AS_THRESHOLD]
            features['number_rare_ases'] = len(rare_asns)
            # Distinct rare ASNs divided by the total number of ASN occurrences.
            features['rare_ases_avg'] = len(rare_asns) / len(all_asns)
        else:
            features['number_rare_ases'] = 0
            features['rare_ases_avg'] = 0.0
    else:
        features['number_rare_ases'] = 0
        features['rare_ases_avg'] = 0.0

    # -------------------------------------------------------------------------
    # 26-27. NADAS AND FLAPS (State-based)
    # -------------------------------------------------------------------------
    nadas, flaps = calculate_nadas_and_flaps(df_window, entry_type_col)
    features['nadas'] = nadas
    features['flaps'] = flaps

    # -------------------------------------------------------------------------
    # LABEL AGGREGATION (for incident data)
    # -------------------------------------------------------------------------
    if 'Label' in df_window.columns:
        labels = df_window['Label'].value_counts()
        if not labels.empty:
            # Any non-'normal' label wins, even when 'normal' rows are the
            # majority. value_counts() orders by frequency, so abnormal[0] is
            # the most frequent non-'normal' label in the window.
            abnormal = [l for l in labels.index if l != 'normal']
            if abnormal:
                features['label'] = abnormal[0]
            else:
                features['label'] = 'normal'
        else:
            # Column exists but holds no countable (non-null) values.
            features['label'] = 'unknown'
    else:
        features['label'] = 'unknown'

    if 'Incident' in df_window.columns:
        # Taken from the first row of the window.
        features['Incident'] = df_window['Incident'].iloc[0]

    return features

print("Feature extraction function defined!")

## 4. Data Standardization

In [None]:
def standardize_dataframe(df):
    """
    Standardize DataFrame column names and types for consistent processing.
    Handles both incident data (Entry_Type='A'/'W') and normal data
    (Subtype='ANNOUNCE'/'WITHDRAW').

    Steps, applied to a copy so the caller's frame is left untouched:
      1. Build a datetime 'Timestamp' column from 'Timestamp' or, failing
         that, 'Time'. Numeric values are interpreted as Unix epoch seconds.
      2. Rename Peer_AS -> Peer_ASN, Origin_AS -> Origin,
         Community -> Communities and Subtype -> Entry_Type, each only when
         the target name is not already present.
      3. Map long entry-type names to the short codes 'A' / 'W'; values that
         are not in the map are left unchanged.

    Returns
    -------
    (DataFrame, str): the standardized copy and the entry-type column name.

    Raises
    ------
    ValueError: if no timestamp column or no entry-type column is found.
    """
    df = df.copy()

    # Standardize timestamp
    if 'Timestamp' in df.columns:
        time_source = 'Timestamp'
    elif 'Time' in df.columns:
        time_source = 'Time'
    else:
        raise ValueError("No timestamp column found (expected 'Time' or 'Timestamp')")

    raw_times = df[time_source]
    if pd.api.types.is_numeric_dtype(raw_times):
        # pd.to_datetime reads bare numbers as nanoseconds by default, which
        # would squeeze a whole capture into the first instants of 1970 and
        # therefore into a single window.
        # NOTE(review): assumes numeric timestamps are epoch seconds - confirm
        # against the data source.
        df['Timestamp'] = pd.to_datetime(raw_times, unit='s')
    else:
        df['Timestamp'] = pd.to_datetime(raw_times)

    # Standardize column names
    # NOTE(review): 'Origin_AS' -> 'Origin' feeds the origin_0 / origin_2
    # features, which look for the strings 'IGP' / 'INCOMPLETE'; confirm that
    # the 'Origin_AS' column really holds the ORIGIN attribute and not the
    # originating AS number.
    column_mapping = {
        'Peer_AS': 'Peer_ASN',
        'Origin_AS': 'Origin',
        'Community': 'Communities',
        'Subtype': 'Entry_Type'
    }
    renames = {
        old_name: new_name
        for old_name, new_name in column_mapping.items()
        if old_name in df.columns and new_name not in df.columns
    }
    df = df.rename(columns=renames)

    entry_type_col = 'Entry_Type'

    # Fail here with a clear message, in the same way as for a missing
    # timestamp, instead of letting the caller hit a bare KeyError later.
    if entry_type_col not in df.columns:
        raise ValueError("No entry type column found (expected 'Entry_Type' or 'Subtype')")

    # Map entry types (ANNOUNCE/WITHDRAW -> A/W). This is done unconditionally
    # so that a file containing only withdrawals is normalised as well.
    type_map = {
        'ANNOUNCE': 'A',
        'WITHDRAW': 'W',
        'WITHDRAW_MP_UNREACH_NLRI_AFI2': 'W'
    }
    df[entry_type_col] = df[entry_type_col].map(lambda x: type_map.get(x, x))

    return df, entry_type_col

print("Standardization function defined!")

## 5. File Processing Functions

In [None]:
def process_single_file(input_path, output_path, window_size=WINDOW_SIZE, show_progress=True):
    """
    Read one BGP CSV file, compute per-window features and write them out.

    The file is loaded with pandas, passed through standardize_dataframe,
    sorted by 'Timestamp' and grouped into consecutive windows of
    `window_size`. Every non-empty window goes through extract_features and
    gains 'window_start' / 'window_end' entries. The resulting table is saved
    to `output_path` as CSV and returned; None is returned (and nothing is
    written) when no window produced features. With `show_progress`, a
    progress line is printed every 1000 windows.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Processing: {input_path.name}")
    print(f"{banner}")

    # Load and normalise the raw records.
    raw = pd.read_csv(input_path, low_memory=False)
    print(f"Loaded {len(raw):,} records")

    frame, entry_type_col = standardize_dataframe(raw)
    frame = frame.sort_values('Timestamp')

    first_seen = frame['Timestamp'].min()
    last_seen = frame['Timestamp'].max()
    print(f"Time range: {first_seen} to {last_seen}")

    print(f"\nEntry type distribution:")
    print(frame[entry_type_col].value_counts())

    # pd.Grouper bins on the index, so the timestamp has to live there.
    frame = frame.set_index('Timestamp')

    rows = []
    for window_start, chunk in frame.groupby(pd.Grouper(freq=window_size)):
        if chunk.empty:
            continue

        # extract_features expects 'Timestamp' as an ordinary column again.
        features = extract_features(chunk.reset_index(), entry_type_col)
        if not features:
            continue

        features['window_start'] = window_start
        features['window_end'] = window_start + pd.Timedelta(window_size)
        rows.append(features)

        if show_progress and len(rows) % 1000 == 0:
            print(f"  Processed {len(rows)} windows...")

    print(f"\nTotal windows: {len(rows)}")

    if not rows:
        print(f"No features extracted!")
        return None

    table = pd.DataFrame(rows)

    # Preferred column order; anything not listed here is appended afterwards.
    preferred = [
        'announcements', 'withdrawals', 'nlri_ann', 'dups',
        'origin_0', 'origin_2', 'origin_changes',
        'imp_wd', 'imp_wd_spath', 'imp_wd_dpath',
        'as_path_max', 'unique_as_path_max',
        'edit_distance_avg', 'edit_distance_max',
        'edit_distance_dict_0', 'edit_distance_dict_1', 'edit_distance_dict_2',
        'edit_distance_dict_3', 'edit_distance_dict_4', 'edit_distance_dict_5',
        'edit_distance_dict_6',
        'edit_distance_unique_dict_0', 'edit_distance_unique_dict_1',
        'number_rare_ases', 'rare_ases_avg',
        'nadas', 'flaps',
        'label', 'Incident', 'window_start', 'window_end'
    ]
    leading = [name for name in preferred if name in table.columns]
    trailing = [name for name in table.columns if name not in leading]

    table = table[leading + trailing]
    table.to_csv(output_path, index=False)

    print(f"\nSaved features to: {output_path}")
    print(f"Shape: {table.shape}")

    return table

print("File processing function defined!")

## 6. Process Incident Data

In [None]:
# Walk every incident directory, extract features from its labeled CSV and
# collect the per-incident feature tables in `all_features` for later cells.
if PROCESSING_MODE == 'incident':
    print("="*70)
    print("PROCESSING INCIDENT DATA")
    print("="*70)
    print(f"\nBase directory: {INCIDENT_BASE_DIR}")

    # Find all incident directories ('temp_mrt' is excluded by name)
    incident_dirs = [d for d in INCIDENT_BASE_DIR.iterdir() if d.is_dir() and d.name != 'temp_mrt']
    print(f"Found {len(incident_dirs)} incident directories")

    # Summary counters
    processed = 0
    skipped = 0
    failed = 0
    all_features = []

    for incident_dir in sorted(incident_dirs):
        # Find labeled CSV file. The matches are sorted so that the choice of
        # the first file is deterministic when a directory holds several;
        # glob() itself guarantees no particular order.
        labeled_files = sorted(incident_dir.glob("*_labeled.csv"))

        if not labeled_files:
            print(f"\n[SKIP] No labeled file in {incident_dir.name}")
            continue

        csv_path = labeled_files[0]
        out_path = incident_dir / (csv_path.stem + "_features.csv")

        if SKIP_EXISTING and out_path.exists():
            print(f"\n[SKIP] Already exists: {out_path.name}")
            skipped += 1
            # Load existing features for summary. This stays best-effort, but
            # a file that cannot be read is reported instead of being dropped
            # silently, and KeyboardInterrupt/SystemExit are no longer caught.
            try:
                all_features.append(pd.read_csv(out_path))
            except Exception as e:
                print(f"  [WARN] Could not load existing features from {out_path.name}: {e}")
            continue

        try:
            features_df = process_single_file(csv_path, out_path)
            if features_df is not None:
                all_features.append(features_df)
                processed += 1
            else:
                failed += 1
        except Exception as e:
            # One broken incident must not abort the whole batch.
            print(f"\n[ERROR] {incident_dir.name}: {e}")
            failed += 1

    print("\n" + "="*70)
    print(f"INCIDENT PROCESSING COMPLETE")
    print("="*70)
    print(f"Processed: {processed}")
    print(f"Skipped: {skipped}")
    print(f"Failed: {failed}")
else:
    print("Skipping incident processing (PROCESSING_MODE != 'incident')")

## 7. Process Normal Traffic Data

In [None]:
# Normal-traffic branch: one input CSV in, one feature CSV out.
if PROCESSING_MODE != 'normal':
    print("Skipping normal traffic processing (PROCESSING_MODE != 'normal')")
else:
    print("="*70)
    print("PROCESSING NORMAL TRAFFIC DATA")
    print("="*70)

    if not NORMAL_INPUT_FILE.exists():
        # Nothing to do; features_df is deliberately left unassigned here.
        print(f"\nInput file not found: {NORMAL_INPUT_FILE}")
    else:
        features_df = process_single_file(NORMAL_INPUT_FILE, NORMAL_OUTPUT_FILE)
        print("\nNormal traffic processing complete!")

## 8. Feature Distribution Visualization

In [None]:
# Pick the feature table the remaining cells work on: the concatenation of all
# incident tables, the single normal-traffic table, or None when there is none.
if PROCESSING_MODE == 'incident':
    combined_df = pd.concat(all_features, ignore_index=True) if all_features else None
elif PROCESSING_MODE == 'normal' and 'features_df' in dir():
    # features_df only exists if the normal-traffic cell actually ran.
    combined_df = features_df
else:
    combined_df = None

if combined_df is None:
    print("No features available for visualization")
elif PROCESSING_MODE == 'incident':
    print(f"Combined features shape: {combined_df.shape}")
    print(f"\nLabel distribution:")
    print(combined_df['label'].value_counts())
else:
    print(f"Features shape: {combined_df.shape}")

In [None]:
# One histogram per feature, split by label when a 'label' column is present.
if combined_df is not None:
    # Feature columns to plot (metadata excluded), limited to those present.
    candidate_cols = (
        'announcements', 'withdrawals', 'nlri_ann', 'dups',
        'origin_0', 'origin_2', 'origin_changes',
        'imp_wd', 'imp_wd_spath', 'imp_wd_dpath',
        'as_path_max', 'unique_as_path_max',
        'nadas', 'flaps',
    )
    feature_cols = [name for name in candidate_cols if name in combined_df.columns]

    # Grid of four columns with as many rows as needed (ceiling division).
    n_cols = 4
    n_rows = -(-len(feature_cols) // n_cols)

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3*n_rows))
    axes = axes.flatten()

    split_by_label = 'label' in combined_df.columns

    for ax, col in zip(axes, feature_cols):
        if split_by_label:
            for label in combined_df['label'].unique():
                subset = combined_df.loc[combined_df['label'] == label, col]
                ax.hist(subset, bins=50, alpha=0.5, label=label)
            ax.legend()
        else:
            ax.hist(combined_df[col], bins=50, alpha=0.7)

        ax.set_title(col)
        ax.set_xlabel('Value')
        ax.set_ylabel('Count')

    # Hide the grid cells that received no feature.
    for spare_ax in axes[len(feature_cols):]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.savefig('feature_distributions.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("\nSaved: feature_distributions.png")

In [None]:
# Per-label mean / std / max for a handful of key features.
if combined_df is not None and 'label' in combined_df.columns:
    print("\n" + "="*70)
    print("FEATURE STATISTICS BY LABEL")
    print("="*70)

    numeric_cols = combined_df.select_dtypes(include=[np.number]).columns.tolist()
    by_label = combined_df.groupby('label')
    stats = by_label[numeric_cols].agg(['mean', 'std', 'max'])

    # Only report the key features that are actually numeric columns here.
    wanted = ('announcements', 'withdrawals', 'nlri_ann', 'imp_wd', 'nadas', 'flaps')
    key_features = [name for name in wanted if name in numeric_cols]

    for feature in key_features:
        print(f"\n{feature}:")
        print(stats[feature].round(2))

## 9. Merge Anomaly Features

In [None]:
# Collect the non-'normal' windows of every per-incident feature file into one
# CSV, tagging each row with the incident directory it came from.
if PROCESSING_MODE == 'incident':
    print("\n" + "="*70)
    print("MERGING ANOMALY FEATURES")
    print("="*70)

    # Collect all feature files (one directory level below the base dir).
    # Sorted so that the row order of the merged file is reproducible.
    feature_files = sorted(INCIDENT_BASE_DIR.glob("*/*_features.csv"))
    print(f"Found {len(feature_files)} feature files")

    all_anomalies = []

    for f in feature_files:
        try:
            df = pd.read_csv(f)
            # Filter for anomalies only. The explicit copy makes the column
            # assignment below operate on an independent frame rather than on
            # a filtered slice of `df` (chained assignment).
            anomaly_df = df[df['label'] != 'normal'].copy()
            if not anomaly_df.empty:
                anomaly_df['source_file'] = f.parent.name
                all_anomalies.append(anomaly_df)
                print(f"  {f.parent.name}: {len(anomaly_df)} anomalies")
        except Exception as e:
            # A single unreadable or malformed file must not stop the merge.
            print(f"  Error reading {f}: {e}")

    if all_anomalies:
        merged_anomalies = pd.concat(all_anomalies, ignore_index=True)

        # Save merged file
        output_file = INCIDENT_BASE_DIR / "all_incidents_anomalies_only.csv"
        merged_anomalies.to_csv(output_file, index=False)

        print(f"\n{'='*70}")
        print(f"MERGED ANOMALIES SUMMARY")
        print(f"{'='*70}")
        print(f"Total anomaly samples: {len(merged_anomalies):,}")
        print(f"\nLabel distribution:")
        print(merged_anomalies['label'].value_counts())
        print(f"\nSaved to: {output_file}")
    else:
        print("\nNo anomalies found to merge!")

## 10. Summary Report

In [None]:
# Final report: which mode ran, how many windows/features were produced and
# which extraction settings were used.
print("\n" + "="*70)
print("FEATURE EXTRACTION COMPLETE")
print("="*70)

if PROCESSING_MODE == 'incident':
    print(f"\nMode: Incident Data Processing")
    print(f"Base directory: {INCIDENT_BASE_DIR}")

    # combined_df is created by the visualization cell; look it up defensively
    # so the report still prints when that cell was not executed.
    summary_df = globals().get('combined_df')

    if summary_df is not None:
        # Count features by excluding the metadata columns that are actually
        # present, because 'Incident' is only written when the input data
        # carries it.
        metadata_cols = {'label', 'Incident', 'window_start', 'window_end'}
        feature_count = sum(1 for c in summary_df.columns if c not in metadata_cols)

        print(f"\nTotal windows processed: {len(summary_df):,}")
        print(f"Total features: {feature_count}")

        # The length check avoids a division by zero for an empty table.
        if 'label' in summary_df.columns and len(summary_df) > 0:
            anomaly_count = (summary_df['label'] != 'normal').sum()
            print(f"\nAnomaly windows: {anomaly_count:,} ({anomaly_count/len(summary_df)*100:.1f}%)")
            print(f"Normal windows: {len(summary_df) - anomaly_count:,}")
else:
    # Any mode other than 'incident' is reported as normal-traffic processing.
    print(f"\nMode: Normal Traffic Processing")
    print(f"Input file: {NORMAL_INPUT_FILE}")
    print(f"Output file: {NORMAL_OUTPUT_FILE}")

print(f"\nWindow size: {WINDOW_SIZE}")
print(f"Rare AS threshold: {RARE_AS_THRESHOLD}")