In [None]:
# Full code with greedy bundling

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np
from scipy import linalg as LA
import time
from BLSGreedy import BLS

# 数据预处理部分
def _one_hot_encode_pair(train_df, test_df, columns):
    """One-hot encode `columns` in a train/test pair of DataFrames.

    The encoder is fitted on the training frame only and then applied to the
    test frame; categories that appear only in the test frame are encoded as
    all-zero rows (`handle_unknown='ignore'`).

    Args:
        train_df: Training DataFrame containing `columns`.
        test_df: Test DataFrame containing `columns`.
        columns: Names of the categorical columns to encode.

    Returns:
        A `(train, test)` tuple of new DataFrames in which `columns` have
        been removed and the encoded indicator columns appended on the right.
    """
    columns = list(columns)
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    train_encoded = encoder.fit_transform(train_df[columns])
    test_encoded = encoder.transform(test_df[columns])
    encoded_names = encoder.get_feature_names_out(columns)

    def _replace_with_encoded(df, encoded):
        # Reuse the source frame's index so the column-wise concat lines rows
        # up exactly, even if the frame does not carry a default RangeIndex.
        encoded_df = pd.DataFrame(encoded, columns=encoded_names, index=df.index)
        return pd.concat([df.drop(columns, axis=1), encoded_df], axis=1)

    return (_replace_with_encoded(train_df, train_encoded),
            _replace_with_encoded(test_df, test_encoded))


# Data preprocessing
def load_and_preprocess(train_path='UNSW_NB15_training-set.csv',
                        test_path='UNSW_NB15_testing-set.csv'):
    """Load the two CSV files and turn them into scaled feature matrices.

    Steps: read both CSVs, drop the first and second-to-last columns,
    one-hot encode the categorical features, split off the `label` column,
    one-hot encode the labels, and standardize the features with statistics
    computed on the training set only. Dimensions are printed along the way.

    Args:
        train_path: Path of the training CSV. Defaults to the file name the
            original code used.
        test_path: Path of the test CSV. Defaults to the file name the
            original code used.

    Returns:
        A tuple `(X_train_scaled, y_train_onehot, X_test_scaled,
        y_test_onehot)` of NumPy arrays.
    """
    # Load the raw data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    print("\nOriginal dimensions:")
    print(f"Training set: {train_data.shape}, Test set: {test_data.shape}")

    def drop_columns(df):
        # Column 0 = 'id', Column -2 = 'attack_cat' (the column just before
        # the label). Removal is positional, so it assumes that layout.
        return df.drop(df.columns[[0, -2]], axis=1)

    train_data = drop_columns(train_data)
    test_data = drop_columns(test_data)

    print("\nAfter dropping columns:")
    print(f"Training set: {train_data.shape}, Test set: {test_data.shape}")

    # NOTE: an earlier revision sliced off the leading column a second time
    # here (`iloc[:, 1:]`). Since 'id' is already removed by drop_columns,
    # that slice threw away the first real feature (presumably `dur` in the
    # standard UNSW-NB15 layout - confirm), so it has been removed.

    # Encode the known categorical features
    categorical_cols = ['proto', 'service', 'state']
    train_data, test_data = _one_hot_encode_pair(train_data, test_data, categorical_cols)

    # Split features and labels
    X_train = train_data.drop('label', axis=1)
    y_train = train_data['label']
    X_test = test_data.drop('label', axis=1)
    y_test = test_data['label']

    # One-hot encode the labels (fitted on the training labels only)
    label_encoder = OneHotEncoder(sparse_output=False)
    y_train_onehot = label_encoder.fit_transform(y_train.values.reshape(-1, 1))
    y_test_onehot = label_encoder.transform(y_test.values.reshape(-1, 1))

    # Encode any remaining object-typed feature columns
    categorical_cols_x = X_train.select_dtypes(include=['object']).columns
    if not categorical_cols_x.empty:
        X_train, X_test = _one_hot_encode_pair(X_train, X_test, categorical_cols_x)

    # Standardize using training-set statistics only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Final dimensions before returning
    print("\nFinal preprocessed dimensions:")
    print(f"X_train: {X_train_scaled.shape}, y_train: {y_train_onehot.shape}")
    print(f"X_test: {X_test_scaled.shape}, y_test: {y_test_onehot.shape}")

    return X_train_scaled, y_train_onehot, X_test_scaled, y_test_onehot


def run_comparison():
    """Train BLS with and without greedy bundling and print a comparison.

    Loads the preprocessed data via `load_and_preprocess`, calls `BLS` once
    with `K=None` and once with `K=3` using the same hyperparameters, then
    prints a side-by-side table. The table reads result index 0 as testing
    accuracy, 1 as testing time, 2 as training accuracy and 3 as training
    time (assumes `BLS` returns its metrics in that order - confirm against
    BLSGreedy).
    """
    features_train, targets_train, features_test, targets_test = load_and_preprocess()

    # Report the shapes handed to BLS
    print("\nBLS Input Dimensions:")
    print(f"Training data: {features_train.shape} → {targets_train.shape}")
    print(f"Testing data: {features_test.shape} → {targets_test.shape}")

    hyperparams = {
        's': 0.5,    # Shrinkage coefficient
        'c': 1e-5,   # Regularization coefficient
        'N1': 10,    # Nodes per feature window
        'N2': 10,    # Number of feature windows
        'N3': 50,    # Enhancement nodes
    }
    data_args = (features_train, targets_train, features_test, targets_test)
    rule = "=" * 60

    print(rule + "\nRunning Original BLS:")
    baseline = BLS(*data_args, K=None, **hyperparams)

    print("\n" + rule + "\nRunning BLS with Greedy Bundling:")
    bundled = BLS(*data_args, K=3, **hyperparams)

    # Comparison table: one row per (label, result index) pair
    print("\n" + rule)
    print(f"{'Metric':<25} | {'Original BLS':<15} | {'Bundled BLS':<15}")
    print("-" * 60)
    table_rows = (
        ('Training Accuracy', 2),
        ('Training Time (s)', 3),
        ('Testing Accuracy', 0),
        ('Testing Time (s)', 1),
    )
    for metric_name, position in table_rows:
        print(f"{metric_name:<25} | {baseline[position]:<15.4f} | {bundled[position]:<15.4f}")

# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_comparison()


Original dimensions:
Training set: (175341, 45), Test set: (82332, 45)

After dropping columns:
Training set: (175341, 43), Test set: (82332, 43)

Final preprocessed dimensions:
X_train: (175341, 193), y_train: (175341, 2)
X_test: (82332, 193), y_test: (82332, 2)

BLS Input Dimensions:
Training data: (175341, 193) → (175341, 2)
Testing data: (82332, 193) → (82332, 2)
Running Original BLS:
Training Accuracy:   93.44%
Training Time:       10.51s
Testing Accuracy:    45.47%
Testing Time:        2.53s

Running BLS with Greedy Bundling:

Feature Bundle Diagnostics:
Original input features: 193
Feature mapping dimension: 10 groups x 10 nodes = 100
Reduced to 12 bundles with max conflict 3

Bundle Details (feature mapping columns grouped):
Bundle 1: [np.int64(6), np.int64(10), np.int64(18), np.int64(36), np.int64(37), np.int64(45), np.int64(46), np.int64(47), np.int64(50), np.int64(54), np.int64(59), np.int64(70), np.int64(75), np.int64(76), np.int64(80), np.int64(84), np.int64(90), np.int64