In [1]:
import pandas as pd
train_data = pd.read_csv('UNSW_NB15_training-set.csv')
test_data = pd.read_csv('UNSW_NB15_testing-set.csv')

from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Define categorical columns
categorical_cols = ['proto', 'service', 'state']

# Initialize OneHotEncoder to ignore unknown categories
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit encoder on training data
encoder.fit(train_data[categorical_cols])

# Transform both train and test data
train_encoded = encoder.transform(train_data[categorical_cols])
test_encoded = encoder.transform(test_data[categorical_cols])

# Convert encoded arrays to DataFrames
train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(categorical_cols))
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(categorical_cols))

# Drop original categorical columns and merge encoded data
train_data = pd.concat([train_data.drop(categorical_cols, axis=1), train_encoded_df], axis=1)
test_data = pd.concat([test_data.drop(categorical_cols, axis=1), test_encoded_df], axis=1)

from sklearn.preprocessing import StandardScaler
X_train = train_data.drop('label', axis=1)
y_train = train_data['label']
X_test = test_data.drop('label', axis=1)
y_test = test_data['label']

# Convert labels to one-hot format
label_encoder = OneHotEncoder(sparse_output=False)
y_train_onehot = label_encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_onehot = label_encoder.transform(y_test.values.reshape(-1, 1))

# Handle remaining categorical features in X (if any)
categorical_cols_x = X_train.select_dtypes(include=['object']).columns
if not categorical_cols_x.empty:
    encoder_x = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    X_train_encoded = encoder_x.fit_transform(X_train[categorical_cols_x])
    X_test_encoded = encoder_x.transform(X_test[categorical_cols_x])
    X_train = pd.concat([X_train.drop(categorical_cols_x, axis=1),
                         pd.DataFrame(X_train_encoded, columns=encoder_x.get_feature_names_out(categorical_cols_x))], axis=1)
    X_test = pd.concat([X_test.drop(categorical_cols_x, axis=1),
                        pd.DataFrame(X_test_encoded, columns=encoder_x.get_feature_names_out(categorical_cols_x))], axis=1)

# Normalize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

from BroadLearningSystem import BLS

# BLS parameters
s = 0.8    # Convergence coefficient
c = 2**-30 # Regularization coefficient
N1 = 10    # Feature nodes per window
N2 = 10    # Number of windows
N3 = 100   # Enhancement nodes

# Running BLS
test_acc, test_time, train_acc, train_time = BLS(
    train_x=X_train_scaled,
    train_y=y_train_onehot,  # Use one-hot encoded labels
    test_x=X_test_scaled,
    test_y=y_test_onehot,     # Use one-hot encoded labels
    s=s,
    c=c,
    N1=N1,
    N2=N2,
    N3=N3
)

# print outcome
print("test accuracy:", test_acc)
print("training accuracy:", train_acc)
print("test time spent:", test_time)
print("training time spent:", train_time)

Training accurate is 99.913 %
Training time is  8.502995014190674 s
Testing accurate is 99.81700000000001 %
Testing time is  1.1878211498260498 s
test accuracy: [[0.99817]]
training accuracy: [[0.99913]]
test time spent: [[1.18782115]]
training time spent: [[8.50299501]]
