In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, balanced_accuracy_score, roc_auc_score, average_precision_score, fbeta_score, matthews_corrcoef
from imblearn.over_sampling import SMOTE
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda, Concatenate

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [4]:
# Custom function to build hidden combination (i.e. convolution) layers

def create_random_combination_layer(input_layer, combination_size, num_combinations, input_dim):
    outputs = []
    
    for _ in range(num_combinations):
        # First random feature selection
        indices_1 = np.random.choice(input_dim, combination_size, replace=False)
        indices_tensor_1 = tf.constant(indices_1, dtype=tf.int32)
        
        # First feature selection using Lambda layer
        slice_layer_1 = Lambda(
            lambda x: tf.gather(x, indices_tensor_1, axis=1),  # Gather selected features
            output_shape=(combination_size,)
        )(input_layer)
        
        # Second random feature selection (after the first random selection)
        indices_2 = np.random.choice(combination_size, combination_size, replace=False)
        indices_tensor_2 = tf.constant(indices_2, dtype=tf.int32)
        
        # Second feature selection using Lambda layer
        slice_layer_2 = Lambda(
            lambda x: tf.gather(x, indices_tensor_2, axis=1),  # Apply a second feature selection
            output_shape=(combination_size,)
        )(slice_layer_1)

        # Apply Dense layers on the final selected subset
        selected_features = Dense(16, activation='relu')(
            Dense(8, activation='relu')(
                Dense(4, activation='relu')(slice_layer_2)
            )
        )
        outputs.append(selected_features)
    
    # Concatenate the outputs from all the random feature combinations
    return Concatenate()(outputs)

In [5]:
def preprocess_largest_class(X, y):
    """
    Preprocess the dataset by reducing the size of the largest class based on its
    relative size to the second largest class.
    
    Args:
        X (pd.DataFrame or np.array): Features.
        y (pd.Series or np.array): Labels.
        
    Returns:
        tuple: Reduced X and y.
    """
    # Count class frequencies
    class_counts = Counter(y)
    sorted_classes = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)
    largest_class, largest_count = sorted_classes[0]
    second_largest_count = sorted_classes[1][1]
    
    # Determine the target size for the largest class
    if largest_count > 2 * second_largest_count:
        target_size = largest_count // 2
    else:
        target_size = second_largest_count
    
    # Split the largest class
    X_largest = X[y == largest_class]
    y_largest = y[y == largest_class]
    
    # Resample the largest class to the target size
    X_largest_reduced, y_largest_reduced = resample(
        X_largest, y_largest, replace=False, n_samples=target_size, random_state=42
    )
    
    # Combine reduced largest class with the rest
    X_rest = X[y != largest_class]
    y_rest = y[y != largest_class]
    X_final = pd.concat([X_rest, X_largest_reduced])
    y_final = pd.concat([y_rest, y_largest_reduced])
    
    return X_final, y_final


def apply_smote_dynamic_with_reduction(X, y):
    """
    Apply preprocessing to reduce the largest class, then apply SMOTE dynamically
    with adjusted `n_neighbors` for smallest classes.
    
    Args:
        X (pd.DataFrame or np.array): Features.
        y (pd.Series or np.array): Labels.
        
    Returns:
        tuple: Resampled X and y.
    """
    # Preprocess largest class
    X, y = preprocess_largest_class(X, y)
    
    # Dynamically determine n_neighbors based on smallest class
    class_counts = Counter(y)
    min_class_size = min(class_counts.values())
    n_neighbors = min(5, min_class_size - 1)  # Adjust neighbors to fit smallest class
    
    # Apply SMOTE
    smote = SMOTE(random_state=42, k_neighbors=n_neighbors)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    return X_resampled, y_resampled

In [6]:
def run_model(train_data_path, test_data_path, is_string_labels = False, label_mapping = None):

    # Initialize the one-hot encoder for the target
    encoder = OneHotEncoder(sparse_output=False)

    # Load Training Data
    train_data = import_data(train_data_path)
    train_data = train_data.sample(frac=1).reset_index(drop=True)  # Shuffle

    # Resample Training Data
    X = train_data.iloc[:, :-1]
    y = train_data.iloc[:, -1]
    X_resampled, y_resampled = apply_smote_dynamic_with_reduction(X, y)

    # Prepare Training Data
    if (is_string_labels):
        y_resampled = y_resampled.map(label_mapping)
    train_X = X_resampled.to_numpy()
    train_y = y_resampled.to_numpy()
    train_y = encoder.fit_transform(train_y.reshape(-1, 1))

    # Perform a stratified split into train and validation sets (80% train, 20% validation)
    X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.2, random_state=42, stratify=train_y)

    # Load and Prepare Test Data (this will not be used in training)
    test_data = import_data(test_data_path)
    test_data = test_data.sample(frac=1).reset_index(drop=True)  # Shuffle
    if (is_string_labels):
        test_data['label'] = test_data['label'].map(label_mapping)
    test_X = test_data.drop(columns=['label']).values
    test_y = test_data['label'].values
    test_y = encoder.transform(test_y.reshape(-1, 1))

    # Parameters for Random Feature Combination
    num_combinations = 20  # Number of random column combinations
    combination_size = 3   # Number of columns in each combination

    # EarlyStopping Callback (optional, to avoid overfitting)
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Number of runs for averaging results
    num_runs = 5

    # Initialize storage for metrics
    metrics_storage = defaultdict(list)

    # Train the Model with Validation Split N tines for more accurate metrics
    #print("Verbose output only for first run...")
    verbose_run = 0
    for run in range(num_runs):
        
        # Model is defined separately in each run, since the random combination layers
        # must be randomly initialized each time. Otherwise, the "random" indices stay the same
        # throughout all runs
        input_layer = Input(shape=(X_train.shape[1],))  # Input shape from the training data
        feature_layer = create_random_combination_layer(input_layer, combination_size, num_combinations, X_train.shape[1])
        hidden_layer = Dense(128, activation='relu')(feature_layer)
        hidden_layer = Dropout(0.5)(hidden_layer)
        output_layer = Dense(test_y.shape[1], activation='softmax')(hidden_layer)
        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

        print(f"Run {run + 1}/{num_runs} started...")
        history = model.fit(
            X_train, y_train, 
            epochs=1000, 
            batch_size=int(X_train.shape[0] * 0.01), 
            validation_data=(X_val, y_val),
            callbacks=[early_stopping],
            verbose=verbose_run
        )
        verbose_run = 0 # Suppress detailed output for multiple runs

        test_loss, test_acc = model.evaluate(test_X, test_y, verbose=0)
        y_pred = model.predict(test_X, verbose=0)
        y_pred_classes = y_pred.argmax(axis=1)
        y_true_classes = test_y.argmax(axis=1)

        # Compute metrics
        balanced_acc = balanced_accuracy_score(y_true_classes, y_pred_classes)
        roc_auc = roc_auc_score(test_y, y_pred, multi_class='ovr')  # `test_y` is fine here for AUC
        pr_auc = average_precision_score(test_y, y_pred, average='weighted')
        f2 = fbeta_score(y_true_classes, y_pred_classes, beta=2, average='weighted')
        mcc = matthews_corrcoef(y_true_classes, y_pred_classes)

        # Store metrics
        metrics_storage['test_loss'].append(test_loss)
        metrics_storage['test_accuracy'].append(test_acc)
        metrics_storage['balanced_accuracy'].append(balanced_acc)
        metrics_storage['roc_auc'].append(roc_auc)
        metrics_storage['pr_auc'].append(pr_auc)
        metrics_storage['f2'].append(f2)
        metrics_storage['mcc'].append(mcc)

        # Store classification report metrics
        report = classification_report(y_true_classes, y_pred_classes, output_dict=True)
        for label, values in report.items():
            # Check if the value is a dictionary (e.g., 'precision', 'recall', 'f1-score')
            if isinstance(values, dict):
                for metric, value in values.items():
                    metrics_storage[f"{label}_{metric}"].append(value)
            else:
                # Handle scalar values (like 'accuracy')
                metrics_storage[label].append(values)

    # Average the metrics over all runs
    print("\nAggregated Metrics:")
    for metric, values in metrics_storage.items():
        avg_value = np.mean(values)
        print(f"{metric}: {avg_value:.4f}")
        

In [7]:
run_model("/kaggle/input/ma-datasets/shuttle_train.csv", "/kaggle/input/ma-datasets/shuttle_test.csv", is_string_labels = False)

Memory usage of dataframe is 3.54 MB
Memory usage after optimization is: 0.84 MB
Decreased by 76.2%
Memory usage of dataframe is 0.89 MB
Memory usage after optimization is: 0.21 MB
Decreased by 76.2%
Run 1/5 started...
Run 2/5 started...
Run 3/5 started...
Run 4/5 started...
Run 5/5 started...

Aggregated Metrics:
test_loss: 0.6491
test_accuracy: 0.7138
balanced_accuracy: 0.7252
roc_auc: 0.9108
pr_auc: 0.9320
f2: 0.7198
mcc: 0.6710
0_precision: 0.9722
0_recall: 0.6823
0_f1-score: 0.7338
0_support: 9117.0000
1_precision: 0.4627
1_recall: 0.8600
1_f1-score: 0.5041
1_support: 10.0000
2_precision: 0.2560
2_recall: 0.6529
2_f1-score: 0.3227
2_support: 34.0000
3_precision: 0.8364
3_recall: 0.8031
3_f1-score: 0.7619
3_support: 1781.0000
4_precision: 0.8212
4_recall: 0.9115
4_f1-score: 0.8345
4_support: 653.0000
5_precision: 0.4133
5_recall: 0.5000
5_f1-score: 0.4076
5_support: 2.0000
6_precision: 0.1428
6_recall: 0.6667
6_f1-score: 0.2114
6_support: 3.0000
accuracy: 0.7138
macro avg_precision

In [8]:
run_model("/kaggle/input/ma-datasets/covtype_train.csv", "/kaggle/input/ma-datasets/covtype_test.csv", is_string_labels = False)

Memory usage of dataframe is 195.04 MB
Memory usage after optimization is: 48.32 MB
Decreased by 75.2%
Memory usage of dataframe is 48.76 MB
Memory usage after optimization is: 12.08 MB
Decreased by 75.2%
Run 1/5 started...
Run 2/5 started...
Run 3/5 started...
Run 4/5 started...
Run 5/5 started...

Aggregated Metrics:
test_loss: 1.6250
test_accuracy: 0.2804
balanced_accuracy: 0.3385
roc_auc: 0.6846
pr_auc: 0.4691
f2: 0.2765
mcc: 0.1305
0_precision: 0.3520
0_recall: 0.2186
0_f1-score: 0.2480
0_support: 42368.0000
1_precision: 0.6329
1_recall: 0.3116
1_f1-score: 0.3883
1_support: 56661.0000
2_precision: 0.1574
2_recall: 0.3891
2_f1-score: 0.1758
2_support: 7151.0000
3_precision: 0.0396
3_recall: 0.6601
3_f1-score: 0.0701
3_support: 549.0000
4_precision: 0.0402
4_recall: 0.2891
4_f1-score: 0.0657
4_support: 1899.0000
5_precision: 0.0773
5_recall: 0.1354
5_f1-score: 0.0900
5_support: 3473.0000
6_precision: 0.1826
6_recall: 0.3655
6_f1-score: 0.2120
6_support: 4102.0000
accuracy: 0.2804
ma

In [9]:
labels_map = {
    'normal.': 0, 'satan.': 1, 'ipsweep.': 2, 'portsweep.': 3, 'nmap.': 4,
    'back.': 5, 'warezclient.': 6, 'teardrop.': 7, 'pod.': 8, 'guess_passwd.': 9,
    'buffer_overflow.': 10, 'land.': 11, 'warezmaster.': 12, 'imap.': 13, 'rootkit.': 14,
    'loadmodule.': 15, 'multihop.': 16, 'ftp_write.': 17, 'phf.': 18, 'perl.': 19, 'spy.': 20
}

run_model("/kaggle/input/ma-datasets/kdd_train.csv", "/kaggle/input/ma-datasets/kdd_test.csv", is_string_labels = True, label_mapping = labels_map)

Memory usage of dataframe is 789.51 MB
Memory usage after optimization is: 196.60 MB
Decreased by 75.1%
Memory usage of dataframe is 197.38 MB
Memory usage after optimization is: 49.15 MB
Decreased by 75.1%
Run 1/5 started...
Run 2/5 started...
Run 3/5 started...
Run 4/5 started...
Run 5/5 started...

Aggregated Metrics:
test_loss: 2.8790
test_accuracy: 0.0132
balanced_accuracy: 0.1453
roc_auc: 0.5809
pr_auc: 0.9313
f2: 0.0095
mcc: 0.0863
0_precision: 0.3999
0_recall: 0.0022
0_f1-score: 0.0044
0_support: 194557.0000
1_precision: 0.2936
1_recall: 0.4213
1_f1-score: 0.3173
1_support: 3178.0000
2_precision: 0.0000
2_recall: 0.0000
2_f1-score: 0.0000
2_support: 2496.0000
3_precision: 0.0302
3_recall: 0.3530
3_f1-score: 0.0516
3_support: 2083.0000
4_precision: 0.0585
4_recall: 0.0834
4_f1-score: 0.0687
4_support: 463.0000
5_precision: 0.0253
5_recall: 0.1664
5_f1-score: 0.0429
5_support: 441.0000
6_precision: 0.0018
6_recall: 0.0245
6_f1-score: 0.0034
6_support: 204.0000
7_precision: 0.0002

In [10]:
labels_map = {
    'Normal': 0, 'Darknet_Audio-Streaming': 1, 'Darknet_Chat': 2, 'Darknet_File-Transfer': 3, 'Darknet_VOIP': 4,
    'Darknet_Video-Streaming': 5, 'Darknet_Email': 6, 'Darknet_Browsing': 7, 'Darknet_P2P': 8
}

run_model("/kaggle/input/ma-datasets/darknet_train.csv", "/kaggle/input/ma-datasets/darknet_test.csv", is_string_labels = True, label_mapping = labels_map)

Memory usage of dataframe is 100.68 MB
Memory usage after optimization is: 25.05 MB
Decreased by 75.1%
Memory usage of dataframe is 25.17 MB
Memory usage after optimization is: 6.26 MB
Decreased by 75.1%
Run 1/5 started...
Run 2/5 started...
Run 3/5 started...
Run 4/5 started...
Run 5/5 started...

Aggregated Metrics:
test_loss: 1.9212
test_accuracy: 0.0954
balanced_accuracy: 0.2912
roc_auc: 0.6964
pr_auc: 0.8042
f2: 0.0934
mcc: 0.1287
0_precision: 0.5846
0_recall: 0.0605
0_f1-score: 0.1094
0_support: 26862.0000
1_precision: 0.4006
1_recall: 0.1591
1_f1-score: 0.2089
1_support: 2657.0000
2_precision: 0.4373
2_recall: 0.7725
2_f1-score: 0.4162
2_support: 908.0000
3_precision: 0.0347
3_recall: 0.0747
3_f1-score: 0.0469
3_support: 522.0000
4_precision: 0.0215
4_recall: 0.1420
4_f1-score: 0.0279
4_support: 293.0000
5_precision: 0.0448
5_recall: 0.5100
5_f1-score: 0.0765
5_support: 269.0000
6_precision: 0.0173
6_recall: 0.2517
6_f1-score: 0.0266
6_support: 116.0000
7_precision: 0.0044
7_rec