In [None]:
# Full code with merge exclusive features

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np
from scipy import linalg as LA
import time
from google.colab import runtime
from sklearn.preprocessing import RobustScaler
from BroadLearningSystemMergeExclusiveFeatures import BLS

def load_and_preprocess(train_path='UNSW_NB15_training-set.csv',
                        test_path='UNSW_NB15_testing-set.csv'):
    """Load the train/test CSV files and convert them into model-ready arrays.

    Pipeline:
      1. Read both CSV files and drop the same positional columns from each.
      2. One-hot encode the categorical columns ``proto``, ``service`` and
         ``state`` with an encoder fitted on the training split.
      3. Separate the ``label`` column from the features.
      4. Scale the features with ``RobustScaler`` (5th-95th percentile range)
         followed by ``StandardScaler``; both are fitted on the training
         split only, and the result is cast to ``float32``.
      5. One-hot encode the labels.

    Args:
        train_path: Path of the training CSV. Defaults to the file name the
            script has always used.
        test_path: Path of the testing CSV. Defaults to the file name the
            script has always used.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` items
        are ``float32`` NumPy arrays and the ``y_*`` items are dense one-hot
        label arrays.
    """
    # Load data
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Drop the first and the second-to-last column, then the first of the
    # remaining columns.
    # NOTE(review): the trailing ``.iloc[:, 1:]`` removes one more leading
    # column on top of ``columns[0]`` - confirm that this is intended.
    train = train.drop(train.columns[[0, -2]], axis=1).iloc[:, 1:]
    test = test.drop(test.columns[[0, -2]], axis=1).iloc[:, 1:]

    # Categorical encoding
    cat_cols = ['proto', 'service', 'state']
    encoder = OneHotEncoder(sparse_output=False,
                            handle_unknown='ignore',
                            categories='auto')

    # Keep the raw categorical columns around: the fallback below must be
    # able to refit on them, so they are captured before either frame is
    # rebuilt without them.
    train_cats = train[cat_cols]
    test_cats = test[cat_cols]

    train_encoded = encoder.fit_transform(train_cats)
    try:
        test_encoded = encoder.transform(test_cats)
    except ValueError as e:
        print(f"Validation error: {str(e)}")
        # Refit on the union of both splits and re-encode BOTH of them so
        # that train and test always share one column layout.
        encoder.fit(pd.concat([train_cats, test_cats]))
        train_encoded = encoder.transform(train_cats)
        test_encoded = encoder.transform(test_cats)

    # Resolve the output names only after the encoder has reached its final
    # fitted state, so both frames receive identical encoded columns.
    encoded_names = encoder.get_feature_names_out(cat_cols)

    train = pd.concat([
        train.drop(columns=cat_cols),
        pd.DataFrame(train_encoded, columns=encoded_names, index=train.index)
    ], axis=1)
    test = pd.concat([
        test.drop(columns=cat_cols),
        pd.DataFrame(test_encoded, columns=encoded_names, index=test.index)
    ], axis=1)

    # Split features and labels
    X_train = train.drop(columns=['label'])
    y_train = train['label']
    X_test = test.drop(columns=['label'])
    y_test = test['label']

    # Prescale with RobustScaler (statistics come from the training split)
    prescaler = RobustScaler(quantile_range=(5, 95))
    X_train = prescaler.fit_transform(X_train)
    X_test = prescaler.transform(X_test)

    # Standard scaling, again fitted on the training split only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train).astype(np.float32)
    X_test = scaler.transform(X_test).astype(np.float32)

    # Label encoding: one output column per distinct training label
    label_encoder = OneHotEncoder(sparse_output=False)
    y_train = label_encoder.fit_transform(y_train.values.reshape(-1, 1))
    y_test = label_encoder.transform(y_test.values.reshape(-1, 1))

    return X_train, y_train, X_test, y_test

def run_comparison():
    """Train BLS with and without feature bundling and print a comparison.

    The data comes from ``load_and_preprocess()``. ``BLS`` is called twice
    with identical hyper-parameters, once with ``K=None`` and once with
    ``K=3``. The two results are then printed side by side as a small table.

    NOTE(review): the indexing below assumes each ``BLS`` result is laid out
    as (test accuracy, test time, [train accuracy, ...], [train time, ...]);
    this is inferred from the row labels only - confirm against ``BLS``.
    """
    X_train, y_train, X_test, y_test = load_and_preprocess()

    print("\nData Dimensions:")
    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    # Hyper-parameters shared by both runs; only K differs between them.
    hyper = dict(s=0.5, c=1e-5, N1=10, N2=10, N3=50)

    print("\nRunning Original BLS:")
    baseline = BLS(X_train, y_train, X_test, y_test, K=None, **hyper)

    print("\nRunning BLS with Feature Bundling:")
    merged = BLS(X_train, y_train, X_test, y_test, K=3, **hyper)

    print("\nComparison Results:")
    print(f"{'Metric':<20} | {'Original':<10} | {'Bundled':<10}")
    print("-" * 45)

    # Each row pairs a label with an extractor that is applied to both
    # results at print time, so rows are evaluated one after another.
    table_rows = (
        ('Training Accuracy', lambda result: result[2][0]),
        ('Testing Accuracy', lambda result: result[0]),
        ('Training Time (s)', lambda result: result[3][0]),
        ('Testing Time (s)', lambda result: result[1]),
    )
    for metric, pick in table_rows:
        print(f"{metric:<20} | {pick(baseline):<10.4f} | {pick(merged):<10.4f}")

if __name__ == "__main__":
    # Colab-specific memory optimization (best effort).
    # ``except Exception`` keeps this optional without swallowing
    # KeyboardInterrupt / SystemExit the way a bare ``except:`` would.
    # NOTE(review): any failure in this block - including a missing
    # ``data_storage`` module while actually running inside Colab - prints
    # the "Not running in Colab" message; confirm that this API exists.
    try:
        from google.colab import data_storage
        data_storage.enable_garbage_collection(True)
        print("Colab garbage collection optimized")
    except Exception:
        print("Not running in Colab - proceeding without memory optimization")

    # GPU memory growth (if using TensorFlow). This is optional: when
    # TensorFlow is missing or the setting cannot be applied, the script
    # deliberately carries on silently.
    try:
        import tensorflow as tf
        physical_devices = tf.config.list_physical_devices('GPU')
        if physical_devices:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
            print("GPU memory growth enabled")
    except Exception:
        pass

    run_comparison()

Not running in Colab - proceeding without memory optimization

Data Dimensions:
X_train: (175341, 193), y_train: (175341, 2)
X_test: (82332, 193), y_test: (82332, 2)

Running Original BLS:
