#### Copyright Raymond Soto Jr. D.Eng(c).
#### From Edge to Enterprise: 
#### Federated Learning Threat Classification with Heterogeneous Devices in Converged Energy Sector Networks
#### Revised for GitHub on July 9th, 2025

# Load Modules

In [None]:
# Load Program Modules
import pandas as pd
import numpy as np
import seaborn as sns

import shutil
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

from xgboost import XGBClassifier 

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.ensemble import ExtraTreesClassifier

from collections import Counter

import pickle
import time
from datetime import datetime

# TensorFlow Keras
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Load Data

In [None]:
# Read the feature-description CSV; its 'Name' column supplies the column
# headers for the traffic CSVs, which are read without a header row.
unswnb15_features = pd.read_csv('UNSWB15_CSV_Files/UNSW-NB15_features.csv')
feature_header = unswnb15_features['Name'].tolist()

# The four parts of the UNSW-NB15 traffic dataset, in order.
traffic_paths = (
    'UNSWB15_CSV_Files/UNSW-NB15_1.csv',
    'UNSWB15_CSV_Files/UNSW-NB15_2.csv',
    'UNSWB15_CSV_Files/UNSW-NB15_3.csv',
    'UNSWB15_CSV_Files/UNSW-NB15_4.csv',
)
# low_memory=False reads each file in one pass, which silences the
# mixed-dtype warning pandas emits for chunked parsing.
df1, df2, df3, df4 = (
    pd.read_csv(path, header=None, names=feature_header, low_memory=False)
    for path in traffic_paths
)
traffic_parts = (df1, df2, df3, df4)

# Shape of each part.
for part in traffic_parts:
    print(part.shape)

# Class distribution of each part, in percent.
# Assumes 'label' is 1 for malicious and 0 for normal traffic.
label_col = 'label'
for part in traffic_parts:
    print(part[label_col].value_counts(normalize=True) * 100)

#### Combine CSV Files

In [None]:
# Stack df1, df2, df3 and df4 into one frame with a fresh 0..n-1 index,
# then report its shape and class distribution (in percent).
df_combined = pd.concat((df1, df2, df3, df4), ignore_index=True)
print("Combined DataFrame shape:", df_combined.shape)
combined_label_pct = df_combined[label_col].value_counts(normalize=True) * 100
print(combined_label_pct)

# Feature Engineering

#### Map Protocol

In [4]:
# Group individual network protocol names into broader protocol families.
def map_protocol(proto):
    """Return the protocol family label for a protocol name.

    The value is compared by exact equality against each family's members,
    in the order the families are listed; the first family that contains it
    wins. Anything that matches no family (including differently-cased
    names) is labelled "other". The labels keep their mixed capitalisation
    and are returned verbatim.
    """
    families = (
        ("transport", ("tcp", "udp", "sctp", "udt", "mux", "iso-tp4", "tp++",
                       "ddp", "xtp", "vmtp", "mtp", "crudp")),
        ('control', ("icmp", "igmp", "rsvp", "ptp")),
        ("Routing", ("ospf", "egp", "igp", "idrp", "ipv6-route", "gre",
                     "nsfnet-igp", "eigrp", "isis", "vrrp")),
        ("Internet", ("ip", "ipv6", "ipv6-frag", "ipv6-opts", "iso-ip",
                      "ipnip", "ggp", "ipip", "ipx-n-ip")),
        ("Multicast", ("pim", "rtp", "gmtp", "micp", "pgm")),
        ("Tunneling", ("etherip", "l2tp", "encap")),
        ("Link", ("arp", "stp", "ax.25", "fc", "ib")),
        ("Security", ("esp", "ipcomp", "secure-vmtp")),
    )
    for family, members in families:
        if proto in members:
            return family
    return "other"

# Derive the grouped 'protocol_category' column from the raw 'proto' values,
# then remove 'proto' so only the grouped column is left for one-hot encoding.
df_combined['protocol_category'] = df_combined['proto'].map(map_protocol)
df_combined.drop(columns=['proto'], inplace=True)

#### Map Connection State

In [5]:
# Collapse individual connection-state codes into broader groups.
def map_connection_state(state):
    """Return the connection-state group for a state code.

    Codes are matched by exact equality (upper-case, as listed below); a
    code that belongs to no group is labelled "other".
    """
    state_groups = (
        # states that indicate an established connection
        ("established", ("CON", "ACC", "REQ")),
        # states that indicate termination of the connection
        ("terminated", ("FIN", "RST", "CLO")),
        # states that might indicate the handshake phase (if applicable)
        ("handshaking", ("SYN", "SYN-ACK")),
    )
    for group, codes in state_groups:
        if state in codes:
            return group
    return "other"

# Derive the grouped 'state_category' column from the raw 'state' values,
# then remove 'state' so only the grouped column is left for one-hot encoding.
df_combined['state_category'] = df_combined['state'].map(map_connection_state)
df_combined.drop(columns=['state'], inplace=True)

# Data Preprocessing

#### Feature Reduction: Drop Columns

In [None]:
# Columns removed from the combined frame before modelling.
columns_to_drop = [
    'srcip', 'sport', 'dstip', 'service', 'stime', 'ltime', 'is_sm_ips_ports',
    'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_ftp_login', 'attack_cat',
]
df_combined.drop(columns=columns_to_drop, inplace=True)

# Report the resulting shape.
print(f'DF Shape Combined Set {df_combined.shape}')

#### Drop Rows with Hex or Non-Numeric dsport Values and Convert to int

In [None]:
# Work on a text view of dsport so the checks behave the same whether a value
# was parsed as a number or as a string.
dsport_text = df_combined['dsport'].astype(str)

# Rows whose dsport is written as a hex literal ("0x...").
mask_hex = dsport_text.str.lower().str.startswith('0x')
# Rows whose dsport consists solely of digits.
mask_digits = dsport_text.str.fullmatch(r'\d+')

# Keep only rows that are not hex AND are all digits; everything else is dropped.
df_combined = df_combined.loc[~mask_hex & mask_digits].copy()

# The surviving dsport values are digit strings/numbers, so the cast is safe.
df_combined['dsport'] = df_combined['dsport'].astype(int)

# Quick sanity check
print("Remaining dsport dtype:", df_combined['dsport'].dtype)
print("Any hex rows left?", df_combined['dsport'].astype(str).str.lower().str.startswith('0x').any())

#### Convert to Boolean

In [8]:
# Convert the 'label' column to a boolean dtype.
label_as_bool = df_combined['label'].astype(bool)
df_combined['label'] = label_as_bool

#### Categorical Encoding: One-Hot Encoding

In [None]:
# One-hot encode the two grouped categorical columns.
cols_to_encode = ['protocol_category', 'state_category']

# Dense boolean output; categories not seen during fit encode as all-False.
encoder = OneHotEncoder(dtype=bool, sparse_output=False, handle_unknown='ignore')

# Learn the categories from the combined set, then encode it (NumPy array out).
encoder.fit(df_combined[cols_to_encode])
encoded_array_combined = encoder.transform(df_combined[cols_to_encode])

# Names of the generated indicator columns.
encoded_columns_combined = encoder.get_feature_names_out(cols_to_encode)

# Wrap the array in a DataFrame that reuses df_combined's index so the
# column-wise concat below lines rows up correctly.
encoded_df_combined = pd.DataFrame(
    encoded_array_combined,
    index=df_combined.index,
    columns=encoded_columns_combined,
)

# Replace the original categorical columns with their indicator columns.
remaining_columns = df_combined.drop(columns=cols_to_encode)
df_encoded_combined = pd.concat([remaining_columns, encoded_df_combined], axis=1)

# Print to confirm
print(f'DF Combined Shape Combined Set {df_encoded_combined.shape}')

#### Split Dataset

In [None]:
# Class distribution (in percent) and size of the encoded frame.
label_col = 'label'
label_share_pct = df_encoded_combined[label_col].value_counts(normalize=True) * 100
print(label_share_pct)
print(f'DF Encoded Combined Shape: {df_encoded_combined.shape}')

In [None]:
# Hold out a stratified 20% test set before any training/validation work.
strat_col = 'label'

# Fixed seed so the same rows are held out on every run. The evaluation cell
# loads models from disk and scores them on this test set; with an unseeded
# split, re-running the notebook would pick different test rows, and models
# saved by an earlier run could be scored on rows they were trained on.
SPLIT_SEED = 42

# NOTE: this rebinds df_encoded_combined to the 80% train/validation portion,
# so running this cell a second time splits that portion again.
df_encoded_combined, df_encoded_4 = train_test_split(
    df_encoded_combined,
    test_size=0.20,
    stratify=df_encoded_combined[strat_col],
    random_state=SPLIT_SEED,
)

# Confirm sizes and that stratification kept the positive-class share equal.
print(f"Train & Validate shape: {df_encoded_combined.shape},  Test shape: {df_encoded_4.shape}")
print("Train True proportion:", df_encoded_combined[strat_col].mean())
print("Test  True proportion:", df_encoded_4[strat_col].mean())

# Train, Validation, Testing Split

In [None]:
# Separate the target from the predictors for the train/validation portion.
y_train0 = df_encoded_combined['label'].astype(bool)    # target variable
X_train0 = df_encoded_combined.drop(columns=['label'])  # every other column is a feature

# Feature-matrix shape and class distribution (in percent) of the target.
print(X_train0.shape)
print(y_train0.value_counts(normalize=True) * 100)

In [None]:
# Separate the target from the predictors for the held-out test portion.
y2 = df_encoded_4['label'].astype(bool)    # target variable
X2 = df_encoded_4.drop(columns=['label'])  # every other column is a feature

# Class distribution (in percent) and feature-matrix shape of the test set.
print(y2.value_counts(normalize=True) * 100)
print(X2.shape)

#### Feature Standardization: Standard Scaler

In [14]:
# Numeric columns of the training set; the boolean indicator columns are not
# selected by np.number and are therefore left unscaled.
numeric_cols = X_train0.select_dtypes(include=[np.number]).columns

# Learn the scaling statistics from the TRAINING data only, then apply them.
scaler = StandardScaler()
scaler.fit(X_train0[numeric_cols])
X_train0[numeric_cols] = scaler.transform(X_train0[numeric_cols])

In [16]:
# Scale the test features with the statistics learned from the training set
# (transform only - the scaler is not refit here).
scaled_test_values = scaler.transform(X2[numeric_cols])
X2[numeric_cols] = scaled_test_values

In [None]:
# Confirm StandardScaler: per-column mean and standard deviation after
# scaling, first for the training set, then for the test set.
scaled_train = X_train0[numeric_cols]
means0 = scaled_train.mean()
stds0 = scaled_train.std()
# Show the first 5 means and stds
print("Means:\n", means0.head(), "\n")
print("Stds:\n",  stds0.head(), "\n")

scaled_test = X2[numeric_cols]
means2 = scaled_test.mean()
stds2 = scaled_test.std()
# Show the first 5 means and stds
print("Means:\n", means2.head(), "\n")
print("Stds:\n",  stds2.head(), "\n")

# Handling Class Imbalance

#### SMOTE

In [None]:
# SMOTE synthesises additional minority-class samples.
# It is applied to the training data only; the test set is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train0, y_train0)
X_train_smote, y_train_smote = resampled

# Class counts before and after resampling.
print("Before SMOTE:", y_train0.value_counts())
print("After SMOTE:", y_train_smote.value_counts())

# Model Training

#### Convert to Numpy

In [19]:
# TensorFlow operations and the tf.data.Dataset API work most efficiently on
# NumPy arrays or tensors, so convert the resampled features and labels to
# float32 arrays.
X_train_smote_np, y_train_smote_np = (
    frame.to_numpy(dtype=np.float32)
    for frame in (X_train_smote, y_train_smote)
)

## Automated grid search with K-fold cross-validation for a Keras MLP model

In [None]:
# --- 0. Fold generator for the SMOTE-resampled data ---
# Five shuffled, stratified folds; the fixed seed keeps fold membership stable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- 1. Define your hyperparameter grid ---
# 4 learning rates x 2 activations x 2 layer layouts = 16 combinations.
param_grid = dict(
    learning_rate=[1e-4, 1e-3, 1e-2, 1e-1],
    activation=['relu', 'elu'],
    hidden_layer_sizes=[(128, 64), (64, 32)],
)

# --- 2. EarlyStopping callback factory ---
def make_early_stop():
    """Return a new EarlyStopping callback that tracks validation recall.

    A fresh instance is built on every call so each training run gets its
    own callback.

    ``mode='max'`` is set explicitly because recall improves as it rises.
    When the mode is left to be inferred, some Keras versions only treat
    monitor names that look like accuracy/AUC metrics as "higher is better"
    and would minimise ``val_recall`` instead - stopping on, and restoring,
    the worst-recall weights.

    Training stops after 20 epochs without improvement, and the weights of
    the best epoch are restored.
    """
    return EarlyStopping(
        monitor='val_recall',
        mode='max',
        patience=20,
        verbose=1,
        restore_best_weights=True
    )

# --- 3. Outer grid loop ---
# For every hyperparameter combination: train one model per CV fold, keep the
# model with the highest validation recall, and save it to an .h5 file.
#
# NOTE(review): the folds are cut from the SMOTE-resampled arrays, so a
# validation fold can contain synthetic samples interpolated from rows that
# sit in the training folds; val_recall may therefore be optimistic - confirm
# this is intended.

# Input-layer width is taken from the data rather than hard-coded (was 47), so
# the model stays valid if the encoded feature set changes.
n_features = X_train_smote_np.shape[1]

start_time = time.time()
for params in ParameterGrid(param_grid):
    lr  = params['learning_rate']
    act = params['activation']
    hl1, hl2 = params['hidden_layer_sizes']
    print(f"\n=== GRID SEARCH: lr={lr}, activation={act}, hidden=[{hl1},{hl2}] ===")

    # --- 3a. Model factory for this combination ---
    # The hyperparameters are bound as default arguments so the factory keeps
    # building this combination's model even if it is called after the loop
    # variables have moved on.
    def create_mlp_model(hl1=hl1, hl2=hl2, act=act, lr=lr, n_features=n_features):
        """Build and compile a two-hidden-layer MLP for binary classification.

        Called with no arguments it uses the current grid combination.
        Returns a compiled Keras model with a single sigmoid output, trained
        with binary cross-entropy and reporting accuracy, precision and recall.
        """
        m = models.Sequential([
            layers.Input(shape=(n_features,)),
            layers.Dense(hl1, activation=act),
            layers.Dense(hl2, activation=act),
            layers.Dense(1, activation='sigmoid')
        ])
        opt = optimizers.Adam(learning_rate=lr)
        m.compile(
            optimizer=opt,
            loss='binary_crossentropy',
            metrics=[
                'binary_accuracy',
                # Explicit names fix the history keys to 'recall'/'val_recall'
                # regardless of how many metric objects were created before.
                tf.keras.metrics.Precision(name='precision'),
                tf.keras.metrics.Recall(name='recall')
            ]
        )
        return m

    # --- 3b. 5-fold CV for this combination ---
    best_model = None
    best_val_recall = 0.0

    for fold_no, (train_idx, val_idx) in enumerate(
            skf.split(X_train_smote_np, y_train_smote_np), start=1):
        print(f"-- Fold {fold_no} --")
        X_tr, X_va = X_train_smote_np[train_idx], X_train_smote_np[val_idx]
        y_tr, y_va = y_train_smote_np[train_idx], y_train_smote_np[val_idx]

        # Training data is reshuffled over the whole fold; validation is not.
        train_ds = (
            tf.data.Dataset
              .from_tensor_slices((X_tr, y_tr))
              .shuffle(len(X_tr))
              .batch(64)
              .prefetch(tf.data.AUTOTUNE)
        )
        val_ds = (
            tf.data.Dataset
              .from_tensor_slices((X_va, y_va))
              .batch(64)
              .prefetch(tf.data.AUTOTUNE)
        )

        model = create_mlp_model()
        history = model.fit(
            train_ds,
            epochs=100,
            validation_data=val_ds,
            callbacks=[make_early_stop()],
            verbose=1
        )

        # best recall this fold
        fold_recall = max(history.history['val_recall'])
        print(f"Fold {fold_no} best val_recall: {fold_recall:.4f}")

        # Keep the single best fold model for this combination.
        if fold_recall > best_val_recall:
            best_val_recall = fold_recall
            best_model = model

    # --- 3c. Save the best model for *this* param combo ---
    # Nothing is saved when no fold achieved a validation recall above 0.
    if best_model is not None:
        fname = f"mlp_lr{lr}_act{act}_hl{hl1}-{hl2}.h5"
        best_model.save(fname)
        print(f"Saved best model (recall={best_val_recall:.4f}) → {fname}")

# --- 4. Summary timing ---
elapsed = time.time() - start_time
print("Grid search complete — total time: {:.2f}s".format(elapsed))
print("Finished at", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

## Copy models to new names

In [None]:
# Define the filenames for the source and destination lists
source_list_filename = "s_names.txt"
destination_list_filename = "d_names.txt"


def copy_listed_files(source_list_path, destination_list_path):
    """Copy every file named in one list file to the matching name in another.

    Both list files hold one path per line; line i of the source list is
    copied to line i of the destination list. Blank lines are ignored, so a
    trailing newline does not cause a spurious length mismatch.

    Problems are reported with print() rather than raised, and nothing here
    terminates the interpreter (the previous exit() call raised SystemExit,
    which `except Exception` does not catch):
      * a missing/unreadable list file or lists of different length -> nothing
        is copied;
      * a missing source file or a failed copy -> that pair is skipped and the
        remaining pairs are still processed.

    Args:
        source_list_path: path of the text file listing the files to copy.
        destination_list_path: path of the text file listing the target names.

    Returns:
        int: number of files copied successfully.
    """
    # --- 1. Read the lists of filenames ---
    try:
        with open(source_list_path, 'r') as f:
            source_files = [line.strip() for line in f if line.strip()]
        with open(destination_list_path, 'r') as f:
            destination_files = [line.strip() for line in f if line.strip()]
    except FileNotFoundError as e:
        # One of the two list files is missing
        print(f"❌ Error: A list file was not found. Please ensure '{source_list_path}' and '{destination_list_path}' exist.")
        print(f"   Details: {e}")
        return 0
    except (OSError, UnicodeError) as e:
        # Any other failure while reading the lists (permissions, bad encoding, ...)
        print(f"❌ An unexpected error occurred: {e}")
        return 0

    # --- 2. Validate that the lists are the same size ---
    if len(source_files) != len(destination_files):
        print("Error: The source and destination lists have a different number of entries.")
        return 0

    # --- 3. Loop through the files and copy them ---
    print(f"Found {len(source_files)} file(s) to copy.\n")
    copied = 0
    for source, dest in zip(source_files, destination_files):
        if not os.path.exists(source):
            print(f"❌ Error: Source file '{source}' not found.")
            continue
        try:
            shutil.copy(source, dest)
        except OSError as e:
            # shutil's own errors (e.g. copying a file onto itself) derive from OSError
            print(f"❌ An error occurred while processing '{source}': {e}")
        else:
            print(f"✅ Successfully copied '{source}' to '{dest}'")
            copied += 1
    return copied


copied_count = copy_listed_files(source_list_filename, destination_list_filename)

# Model Testing

In [None]:
# --- Configuration ---
MODEL_PATH_TEMPLATE = 'M{}.h5'  # Template for model filenames
NUM_MODELS = 16

# Test features/labels as plain arrays for Keras and scikit-learn.
X2_np = X2.to_numpy(dtype=np.float32)
y2_np = y2.to_numpy(dtype=np.int32)

# --- Initialize lists to store metrics (parallel lists, one entry per model) ---
model_names = []
recalls = []
accuracies = []
fprs = []  # False Positive Rates
fnrs = []  # False Negative Rates


def record_model_metrics(name, recall=float('nan'), accuracy=float('nan'),
                         fpr=float('nan'), fnr=float('nan')):
    """Append one model's row to the parallel metric lists.

    Metrics default to NaN so a model that could not be evaluated still gets
    a row and the lists stay the same length.
    """
    model_names.append(name)
    recalls.append(recall)
    accuracies.append(accuracy)
    fprs.append(fpr)
    fnrs.append(fnr)


def safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return float(numerator / denominator) if denominator > 0 else 0.0


# --- Loop through each model ---
for i in range(1, NUM_MODELS + 1):
    model_id = str(i).zfill(2)      # zero-padded: 01, 02, ..., 16
    model_name = f"M{model_id}"     # names like M01, M02, ..., M16
    model_file = MODEL_PATH_TEMPLATE.format(model_id)
    print(f"\n--- Processing Model: {model_name} ({model_file}) ---")

    # Check for the file up front: the exception raised for a missing model
    # file may not be FileNotFoundError in every Keras version, so catching
    # that type alone cannot be relied on to label these rows.
    if not os.path.isfile(model_file):
        print(f"Error: Model file {model_file} not found. Skipping.")
        record_model_metrics(model_name + " (Not Found)")
        continue

    try:
        # Load the saved model
        model = tf.keras.models.load_model(model_file)
        print(f"Model {model_name} loaded successfully.")

        # Predict probabilities, threshold at 0.5, and flatten the (n, 1)
        # output to a 1-D vector of 0/1 predictions.
        y_pred_prob = model.predict(X2_np)
        y_pred = (y_pred_prob > 0.5).astype(int).ravel()

        # labels=[0, 1] forces a 2x2 matrix even when only one class is
        # present or predicted, so the unpacking below is always valid.
        cm = confusion_matrix(y2_np, y_pred, labels=[0, 1])
        print("Confusion Matrix:")
        print(cm)
        tn, fp, fn, tp = cm.ravel()

        # Each ratio falls back to 0.0 when its denominator is zero
        # (class absent from the test labels).
        recall = safe_ratio(tp, tp + fn)                       # TP / (TP + FN)
        accuracy = safe_ratio(tp + tn, tp + tn + fp + fn)      # (TP + TN) / all
        fpr = safe_ratio(fp, fp + tn)                          # FP / (FP + TN)
        fnr = safe_ratio(fn, tp + fn)                          # FN / (TP + FN)

        print(f"Recall: {recall:.4f}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"False Positive Rate: {fpr:.4f}")
        print(f"False Negative Rate: {fnr:.4f}")

        record_model_metrics(model_name, recall, accuracy, fpr, fnr)

    except Exception as e:
        # Best-effort evaluation: report the failure, keep a NaN row so the
        # summary still lists this model, and move on to the next one.
        print(f"An error occurred while processing {model_name}: {e}")
        record_model_metrics(model_name + " (Error)")


# --- Create a DataFrame for plotting (one row per model) ---
metrics_df = pd.DataFrame({
    'Model': model_names,
    'Recall': recalls,
    'Accuracy': accuracies,
    'FPR': fprs,
    'FNR': fnrs
})

# --- Generate the labeled bar graph ---
if metrics_df.empty:
    print("\nNo metrics were calculated, skipping plot generation.")
else:
    # Long format (Model, Metric, Score) so each metric becomes a hue group.
    metrics_melted = metrics_df.melt(id_vars='Model', var_name='Metric', value_name='Score')

    plt.figure(figsize=(18, 10))  # Adjust figure size as needed
    sns.barplot(x='Model', y='Score', hue='Metric', data=metrics_melted, palette='viridis')

    # The title follows the model naming used on the x axis (M01..M<NUM_MODELS>);
    # it previously read "LR01-LR16", which matched no model in the plot.
    plt.title(f'Model Performance Metrics (M01-M{NUM_MODELS:02d})', fontsize=16)
    plt.xlabel('Model', fontsize=14)
    plt.ylabel('Score', fontsize=14)
    plt.xticks(rotation=45, ha='right', fontsize=10)  # Rotate x-axis labels for better readability
    plt.yticks(fontsize=10)
    plt.legend(title='Metric', fontsize=10, title_fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()  # Adjust layout to prevent labels from overlapping
    plt.show()
    print("\nBar graph generated.")

    # --- Optional: Print the metrics DataFrame ---
    print("\n--- Summary of Metrics ---")
    print(metrics_df.to_string())