#### Import Packages

In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Softmax, Multiply, Lambda, Flatten
from tensorflow.keras.callbacks import EarlyStopping


#### Read Data

In [2]:
def read_data(folder_path):
    """Load every ``.csv`` file found directly inside ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Directory to scan (non-recursive). Only names ending in ``.csv``
        are read.

    Returns
    -------
    dict
        Maps each file name without its extension to the DataFrame
        returned by ``pd.read_csv`` for that file.
    """
    return {
        os.path.splitext(file_name)[0]: pd.read_csv(os.path.join(folder_path, file_name))
        for file_name in os.listdir(folder_path)
        if file_name.endswith('.csv')
    }

#### Auxiliary Methods

In [3]:
def scale_df(data, features):
    """Min-max scale the ``features`` columns of ``data`` into [0, 1].

    A fresh ``MinMaxScaler`` is fitted on the rows of ``data`` itself, so the
    minimum/maximum used are those of the frame passed in.

    Note: the scaled values are written back into ``data`` (the argument is
    modified in place) and that same object is returned.
    """
    normalizer = MinMaxScaler(feature_range=(0, 1))
    scaled_values = normalizer.fit_transform(data[features])
    data[features] = scaled_values  # overwrite the original columns
    return data

In [4]:
def create_sequences_for_group(group, features, target, sequence_length):
    """Build sliding windows and next-step labels for a single group of rows.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of the
    ``features`` columns (positional order); its label is the ``target``
    value of the row immediately after the window.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group, already in the desired order.
    features : list of str
        Columns that make up each time step of a window.
    target : str
        Column from which the label is read.
    sequence_length : int
        Number of consecutive rows per window.

    Returns
    -------
    (list, list)
        ``sequences``: list of arrays of shape ``(sequence_length, len(features))``;
        ``labels``: list of the matching target values. Both are empty when
        the group has no more than ``sequence_length`` rows.
    """
    # Convert to NumPy once: selecting the columns through pandas inside the
    # loop repeated the same work for every window.
    feature_values = group[features].to_numpy()
    label_values = group[target].to_numpy()

    n_windows = len(group) - sequence_length  # negative -> empty range below
    # .copy() keeps every window independent of the shared backing array.
    sequences = [feature_values[start:start + sequence_length].copy() for start in range(n_windows)]
    labels = [label_values[start + sequence_length] for start in range(n_windows)]
    return sequences, labels

In [5]:
def lstm_model(input_shape, lstm_units=32, learning_rate=0.01):
    """Build and compile a single-layer LSTM binary classifier.

    Architecture: Input(``input_shape``) -> LSTM(``lstm_units``, last state
    only) -> Dense(1, sigmoid).

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed straight to ``Input(shape=...)``.
    lstm_units : int, default 32
        Number of units in the LSTM layer.
    learning_rate : float, default 0.01
        Learning rate given to the Adam optimizer.

    Returns
    -------
    A compiled Keras ``Model`` using binary cross-entropy loss and reporting
    accuracy, precision and recall (in that order).
    """
    sequence_input = Input(shape=input_shape, name="input_layer")
    # return_sequences=False: only the final LSTM output feeds the classifier
    encoded = LSTM(lstm_units, return_sequences=False, name="lstm_layer")(sequence_input)
    attack_probability = Dense(1, activation="sigmoid", name="output_layer")(encoded)

    network = Model(inputs=sequence_input, outputs=attack_probability)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
    )
    return network

#### Training

In [6]:
def precompute_sequences(dataframes, features, ws):
    """Turn each attack's DataFrame into sliding-window sequences and labels.

    For every ``(attack, frame)`` pair the function:
      1. drops the ``attack``, ``malicious`` and ``attack_number`` columns
         when present,
      2. min-max scales ``features`` via ``scale_df`` (fitted on the whole
         frame for that attack),
      3. parses ``_time`` as datetime and sorts by ``imeisv`` then ``_time``,
      4. groups by ``(imeisv, event)`` and, for groups with more than ``ws``
         rows, builds windows of length ``ws`` labelled with ``binary_label``.

    Parameters
    ----------
    dataframes : dict
        Attack name -> DataFrame.
    features : list of str
        Feature columns used for scaling and for the windows.
    ws : int
        Window size (number of rows per sequence).

    Returns
    -------
    dict
        Attack name -> ``(np.array(sequences), np.array(labels))``.
    """
    label_column = 'binary_label'
    unused_columns = ['attack', 'malicious', 'attack_number']
    sequences_by_attack = {}

    for attack, raw_frame in dataframes.items():
        print(f"🔄 Precomputing sequences for {attack}")

        # drop() hands back a new frame, so the scaling below does not
        # touch the caller's DataFrame.
        frame = scale_df(raw_frame.drop(columns=unused_columns, errors='ignore'), features)

        # Chronological order within each device before windowing
        frame['_time'] = pd.to_datetime(frame['_time'])
        frame = frame.sort_values(by=['imeisv', '_time'])

        window_batches = []
        label_batches = []
        for _, group_rows in frame.groupby(['imeisv', 'event']):
            if len(group_rows) <= ws:
                continue  # too short to yield a window plus a following label
            windows, window_labels = create_sequences_for_group(group_rows, features, label_column, ws)
            window_batches.extend(windows)
            label_batches.extend(window_labels)

        sequences_by_attack[attack] = (np.array(window_batches), np.array(label_batches))

        print(f"✅ Completed {attack}: {len(window_batches)} sequences stored.")

    return sequences_by_attack


In [7]:
def _evaluation_row(model, X, y, day, evaluation_type, ws):
    """Evaluate ``model`` on ``(X, y)`` and return one row for the results CSV.

    Row layout: [day, evaluation_type, ws, loss, accuracy, precision, recall,
    classification-report dict, confusion matrix as nested lists].
    Predictions are thresholded at 0.5.
    """
    loss, accuracy, precision, recall = model.evaluate(X, y, verbose=0)
    predictions = model.predict(X)
    predicted_labels = (predictions >= 0.5).astype(int).flatten()
    class_report = classification_report(y, predicted_labels, target_names=["Normal", "Attack"], output_dict=True)
    conf_matrix = confusion_matrix(y, predicted_labels)
    return [day, evaluation_type, ws, loss, accuracy, precision, recall, class_report, conf_matrix.tolist()]


def _append_results(rows, csv_path):
    """Append result rows to ``csv_path``, writing the header only for a new file."""
    columns = ["Day", "Evaluation Type", "Window Size", "Loss", "Accuracy", "Precision", "Recall",
               "Classification Report", "Confusion Matrix"]
    df_results = pd.DataFrame(rows, columns=columns)
    df_results.to_csv(csv_path, mode='a', header=not os.path.exists(csv_path), index=False)


def training(sequences, features, ws, previous_data_ratio, csv_path):
    """Train on the first attack, then fine-tune day by day, logging evaluations.

    Attacks are processed in the fixed order SYN, ICMP, UDP, DNS, GTPU; each
    one is treated as a "day". For every day the function:
      * splits that attack's sequences 80/20 (stratified, ``random_state=42``),
      * from day 2 on, mixes the replay sample kept from the previous day
        into the training data,
      * trains (day 1) or fine-tunes (later days) one LSTM model with early
        stopping on validation loss,
      * keeps ``previous_data_ratio`` of the normal and of the attack samples
        of the data just trained on as the replay sample for the next day,
      * evaluates on that day's test set, on all test sets so far, and on
        each earlier day's test set, appending the rows to ``csv_path``.

    Afterwards each day's model state is evaluated on the test sets of the
    days that followed it ("Future Day" rows).

    Parameters
    ----------
    sequences : dict
        Attack name -> ``(X, y)`` as produced by ``precompute_sequences``.
    features : list of str
        Not used by this function; kept so existing callers keep working.
    ws : int
        Window size; only recorded in the result rows.
    previous_data_ratio : float
        Fraction of each class kept as replay data for the next day.
    csv_path : str
        Results file. Rows are appended; the header is written only when the
        file does not exist yet.

    Returns
    -------
    None
    """
    attacks = ['SYN', 'ICMP', 'UDP', 'DNS', 'GTPU'] # Order of attacks: SYN - ICMP - UDP - DNS - GTPU
    # One weight snapshot per day. The same model object is fine-tuned in
    # place, so storing the object itself would leave every entry pointing at
    # the final weights and make the "future day" evaluation meaningless.
    weight_snapshots = []
    test_sets = {} # day index -> (X_test, y_test), for past and future evaluations
    cumulative_X_test, cumulative_y_test = None, None
    replay_normal, replay_attack = None, None # replay sample from the previous day
    model = None

    # Training and fine-tuning loop
    for i, attack in enumerate(attacks):

        all_sequences, all_labels = sequences[attack]
        X = np.array(all_sequences)
        y = np.array(all_labels)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        test_sets[i] = (X_test, y_test)
        if cumulative_X_test is None:
            cumulative_X_test, cumulative_y_test = X_test, y_test
        else:
            cumulative_X_test = np.concatenate([cumulative_X_test, X_test])
            cumulative_y_test = np.concatenate([cumulative_y_test, y_test])

        # Mix in the replay sample kept from the previous day
        if replay_normal is not None:
            print(f"Mixing {previous_data_ratio * 100:.0f}% of previous days' data with current day's data")

            sampled_previous_X = np.concatenate((replay_normal, replay_attack), axis=0)
            sampled_previous_y = np.concatenate((np.zeros(len(replay_normal)), np.ones(len(replay_attack))), axis=0)

            mixed_X_train = np.concatenate((X_train, sampled_previous_X), axis=0)
            mixed_y_train = np.concatenate((y_train, sampled_previous_y), axis=0)
            mixed_X_train, mixed_y_train = shuffle(mixed_X_train, mixed_y_train, random_state=42)
        else:
            mixed_X_train, mixed_y_train = X_train, y_train

        # Train (first day) or fine-tune (later days)
        early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
        if model is None:
            model = lstm_model(input_shape=X_train.shape[1:])
            model.summary()  # summary() prints by itself; wrapping it in print() emitted a stray "None"
        model.fit(mixed_X_train, mixed_y_train, epochs=100, batch_size=64, validation_split=0.2, callbacks=[early_stopping])
        weight_snapshots.append(model.get_weights())

        # Keep a per-class sample of the data just trained on for the next day
        print(f"Sampling data from current training set for future use...")
        shuffled_normal_X = shuffle(mixed_X_train[mixed_y_train == 0], random_state=42)
        replay_normal = shuffled_normal_X[:int(previous_data_ratio * len(shuffled_normal_X))]

        shuffled_attack_X = shuffle(mixed_X_train[mixed_y_train == 1], random_state=42)
        replay_attack = shuffled_attack_X[:int(previous_data_ratio * len(shuffled_attack_X))]

        print(f"Evaluating Model of Day {i + 1} on Day {i + 1} Test Set...")
        results_list = [
            _evaluation_row(model, X_test, y_test, i + 1, "Per-Day", ws),
            _evaluation_row(model, cumulative_X_test, cumulative_y_test, i + 1, "Cumulative", ws),
        ]
        for prev_day in range(i):
            prev_X_test, prev_y_test = test_sets[prev_day]
            results_list.append(
                _evaluation_row(model, prev_X_test, prev_y_test, i + 1, f"Previous Day {prev_day + 1}", ws)
            )

        _append_results(results_list, csv_path)

    print("\nPerforming Future Test Set Evaluations:")
    for i, day_weights in enumerate(weight_snapshots):
        future_days = range(i + 1, len(attacks))
        if not future_days:
            continue  # the last day has no later test sets
        model.set_weights(day_weights)  # restore the model as it was at the end of day i + 1
        future_rows = []
        for j in future_days:
            future_X_test, future_y_test = test_sets[j]
            future_rows.append(
                _evaluation_row(model, future_X_test, future_y_test, i + 1, f"Future Day {j + 1}", ws)
            )
        _append_results(future_rows, csv_path)

#### Main Script

In [8]:
# Load one DataFrame per attack CSV found in the dataset folder
folder_path = '../Datasets/Per_Attack_Datasets'
dataframes = read_data(folder_path)

features = ['epre','pusch_snr','p_ue','ul_mcs','cqi','ul_bitrate',
            'dl_mcs','dl_retx','ul_tx','dl_tx','ul_retx','dl_bitrate','dl_err','ul_err'] # features in the dataset
num_features = len(features) # number of features, derived so it cannot drift from the list above
previous_data_ratio = 0.3  # Ratio of previous training data to be used
ws = 5  # Window size


import os  # redundant with the import in the first cell; kept as-is

# Ensure the directory exists before calling training
output_dir = "Experiments_11_03_25"
os.makedirs(output_dir, exist_ok=True)  # Creates the directory if it doesn't exist



In [9]:
# Experiment grid: every window size is combined with every replay ratio.
# NOTE: `ws` is rebound here from the scalar set in the previous cell to a list.
ws = [10]
ratio = [0.5] 
for ws_i in ws:
    # Sequences depend only on the window size, so build them once per ws_i
    sequences = precompute_sequences(dataframes, features, ws_i)
    print(f"Window size {ws_i}")
    for r in ratio: 
        print(f"Ratio {r}")
        # range(9,10) runs a single repetition, index 9; the index is only
        # used in the log line and in the output file name.
        for i in range(9,10):
            print(f"Iteration {i}")
            # training() appends to the CSV, so re-running with the same
            # ws_i / r / i adds rows to the existing file.
            training(sequences, features, ws_i, r,f"{output_dir}/experiments_{ws_i}_{r}_iter_{i}.csv")

🔄 Precomputing sequences for DNS
✅ Completed DNS: 162599 sequences stored.
🔄 Precomputing sequences for GTPU
✅ Completed GTPU: 107063 sequences stored.
🔄 Precomputing sequences for ICMP
✅ Completed ICMP: 169966 sequences stored.
🔄 Precomputing sequences for SYN
✅ Completed SYN: 132194 sequences stored.
🔄 Precomputing sequences for UDP
✅ Completed UDP: 113607 sequences stored.
Window size 10
Ratio 0.5
Iteration 9


None
Epoch 1/100
[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9902 - loss: 0.0497 - precision: 0.5834 - recall: 0.1970 - val_accuracy: 0.9998 - val_loss: 0.0022 - val_precision: 1.0000 - val_recall: 0.9804
Epoch 2/100
[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9987 - loss: 0.0068 - precision: 0.9794 - recall: 0.8970 - val_accuracy: 0.9998 - val_loss: 0.0020 - val_precision: 1.0000 - val_recall: 0.9804
Epoch 3/100
[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9988 - loss: 0.0070 - precision: 0.9689 - recall: 0.9102 - val_accuracy: 0.9999 - val_loss: 7.4159e-04 - val_precision: 1.0000 - val_recall: 0.9902
Epoch 4/100
[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0015 - precision: 0.9727 - recall: 0.9776 - val_accuracy: 0.9999 - val_loss: 6.5123e-04 - val_precision: 1.0000 - val_recall: 0.9902