#### Import Packages

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle
from tensorflow.keras import layers, models, optimizers

#### Read data

In [None]:
# Load one DataFrame per attack; each CSV's base name becomes its dictionary key.
# Order of attacks: SYN - ICMP - UDP - DNS - GTPU
folder_path = '../Datasets/Per_Attack_Datasets'

# Names of every CSV file found directly inside the folder
csv_files = [name for name in os.listdir(folder_path) if name.endswith('.csv')]

# Map each file's base name (extension stripped) to the DataFrame read from it
dataframes = {
    os.path.splitext(csv_file)[0]: pd.read_csv(os.path.join(folder_path, csv_file))
    for csv_file in csv_files
}

# Preview the first rows of every loaded DataFrame
for attack, df in dataframes.items():
    print(f"DataFrame for {attack}:")
    print(df.head(), '\n')

### Auxiliary Methods

#### Normalize data

In [None]:
# Candidate input columns. The experiments below keep only the last N entries
# of this list, so its order decides which columns are dropped first.
features = [
    'bearer_0_dl_total_bytes',
    'bearer_0_ul_total_bytes',
    'bearer_1_dl_total_bytes',
    'bearer_1_ul_total_bytes',
    'ul_path_loss',
    'ul_phr',
    'turbo_decoder_avg',
    'epre',
    'pusch_snr',
    'p_ue',
    'ul_mcs',
    'cqi',
    'ul_bitrate',
    'dl_mcs',
    'dl_retx',
    'ul_tx',
    'dl_tx',
    'ul_retx',
    'dl_bitrate',
    'dl_err',
    'ul_err',
]
# Feature counts to try (how many trailing entries of `features` are kept)
num_features = [14, 11, 8, 5]

In [None]:
# Min-max normalisation of the selected feature columns
def scale_df(data, features):
    """Rescale the given columns of ``data`` to the range [0, 1].

    A new ``MinMaxScaler`` is fitted on ``data[features]`` at every call.
    The scaled values are written back into ``data`` itself (the frame is
    modified in place) and the same frame is returned.

    Args:
        data: DataFrame containing the columns listed in ``features``.
        features: list of column names to rescale.

    Returns:
        The input DataFrame, with the listed columns replaced by scaled values.
    """
    min_max = MinMaxScaler(feature_range=(0, 1))
    scaled_values = min_max.fit_transform(data[features])
    data[features] = scaled_values
    return data

#### Create sequences

In [None]:
# Function to create sequences for a single user-event group
def create_sequences_for_group(group, features, target, sequence_length):
    """Cut one group into sliding windows, each labelled by the row that follows it.

    Args:
        group: DataFrame with the rows of a single group, already in the
            order the windows should follow.
        features: list of column names that make up each window.
        target: name of the label column.
        sequence_length: number of consecutive rows per window.

    Returns:
        A ``(sequences, labels)`` pair of equally long lists. ``sequences[k]``
        is an array of shape ``(sequence_length, len(features))`` built from
        rows ``k .. k + sequence_length - 1``; ``labels[k]`` is the ``target``
        value of row ``k + sequence_length``. Both lists are empty when the
        group has no more than ``sequence_length`` rows.
    """
    # Convert to NumPy once: selecting the columns and slicing the DataFrame
    # again for every window repeats the same work on each iteration.
    feature_values = group[features].values
    target_values = group[target].values

    window_starts = range(len(group) - sequence_length)
    # Each window is copied so the returned arrays do not share memory.
    sequences = [
        feature_values[start:start + sequence_length].copy()
        for start in window_starts
    ]
    labels = [target_values[start + sequence_length] for start in window_starts]
    return sequences, labels

#### Create LSTM model

In [None]:
# Binary Classification with a One-Layer LSTM
def create_lstm_model_binary(input_shape):
    """Build and compile a single-layer LSTM binary classifier.

    The network is an LSTM with 32 units, a dropout of 0.3 and a single
    sigmoid output unit. It is compiled with Adam (learning rate 0.01),
    binary cross-entropy loss, and accuracy, precision and recall metrics.

    Args:
        input_shape: shape of one input sample, passed to the LSTM layer.

    Returns:
        The compiled Keras model.
    """
    model = Sequential()
    model.add(LSTM(32, input_shape=input_shape))  # recurrent layer, 32 units
    model.add(Dropout(0.3))  # dropout to reduce overfitting
    model.add(Dense(1, activation='sigmoid'))  # one sigmoid output for the binary label
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
    )
    return model


#### LSTM hyperparameter fine-tuning (window size = 5, features = 14, ratio = 0)

In [None]:
# Search space explored by the tuning loop
hyperparameter_grid = {
    "lstm_units": [16, 32, 64],      # Number of LSTM units
    "num_layers": [1],               # Number of layers
    "dropout_rate": [0.1, 0.2, 0.3], # Dropout rates
    "learning_rate": [0.001, 0.01],  # Learning rates
    "batch_size": [32, 64, 128, 256, 1024],  # Batch sizes
}

# Fixed settings for this experiment
ws = 5  # Window size
attacks = ['SYN', 'ICMP', 'UDP', 'DNS', 'GTPU']
output_dir = "lstm_fine_tune_results_new/"
os.makedirs(output_dir, exist_ok=True)

# Keep only the last num_features[0] columns of `features`
start_remove = len(features) - num_features[0]
curr_features = features[start_remove:]
print(f"Using features: {curr_features}")

# Build the (sequences, labels) arrays of every attack once, up front
target = 'binary_label'
preprocessed_data = {}
for attack in attacks:
    print(f"Preprocessing data for {attack}")
    data = dataframes[attack].drop(columns=['attack', 'malicious', 'attack_number'])
    data_norm = scale_df(data, curr_features)
    data_norm['_time'] = pd.to_datetime(data_norm['_time'])  # Parse time column
    data_norm.sort_values(by=['imeisv', '_time'], inplace=True)  # Sort by UE ID and time

    sequences = []
    labels = []
    for _, group in data_norm.groupby(['imeisv', 'event']):
        if len(group) <= ws:
            continue  # Not enough rows for a single window plus its label
        seqs, lbls = create_sequences_for_group(group, curr_features, target, ws)
        sequences += seqs
        labels += lbls
    preprocessed_data[attack] = (np.array(sequences), np.array(labels))
# Function to create LSTM model
def create_lstm_model(input_shape, lstm_units, num_layers, dropout_rate, learning_rate):
    """Build and compile a stacked-LSTM binary classifier.

    Each of the ``num_layers`` LSTM layers is followed by a dropout layer;
    a single sigmoid unit produces the output. The model is compiled with
    Adam, binary cross-entropy, and accuracy/precision/recall metrics.

    Args:
        input_shape: shape of one input sample, given to the first LSTM layer.
        lstm_units: number of units in every LSTM layer.
        num_layers: number of LSTM layers (at least 1).
        dropout_rate: rate of the dropout layer after each LSTM layer.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        The compiled Keras model.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError(f"num_layers must be at least 1, got {num_layers}")

    # Use the directly imported Sequential class: other cells of this notebook
    # rebind the name `models` to a plain list, which would break
    # `models.Sequential()` if this function is called after them.
    model = Sequential()
    for i in range(num_layers):
        # Intermediate layers must return sequences so the next LSTM gets 3D input
        layer_kwargs = {"return_sequences": i < num_layers - 1}
        if i == 0:
            # Only the first layer defines the model's input shape
            layer_kwargs["input_shape"] = input_shape
        model.add(layers.LSTM(lstm_units, **layer_kwargs))
        model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(1, activation="sigmoid"))  # Binary classification
    optimizer = optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy", "Precision", "Recall"])
    return model

# itertools is only needed by this tuning cell, so it is imported here.
import itertools

# Initialize results storage
all_results = []               # one row per (configuration, day)
cumulative_results_day_5 = []  # one row per configuration, evaluated on all days' test sets

# The split depends only on the preprocessed data and the fixed random_state,
# so it is identical for every configuration: compute it once per attack.
splits = {}
for attack in attacks:
    X, y = preprocessed_data[attack]
    splits[attack] = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Test sets of all days stacked together; used for the final-day cumulative evaluation
X_cumulative_test = np.concatenate([splits[attack][1] for attack in attacks], axis=0)
y_cumulative_test = np.concatenate([splits[attack][3] for attack in attacks], axis=0)
last_day = len(attacks) - 1  # index of the final day (Day 5 with the five attacks above)

# Every combination of the grid, iterated in the same order as nested loops
# over lstm_units, num_layers, dropout_rate, learning_rate and batch_size.
grid = itertools.product(
    hyperparameter_grid["lstm_units"],
    hyperparameter_grid["num_layers"],
    hyperparameter_grid["dropout_rate"],
    hyperparameter_grid["learning_rate"],
    hyperparameter_grid["batch_size"],
)

# Fine-tuning loop
for lstm_units, num_layers, dropout_rate, learning_rate, batch_size in grid:
    print(f"Training with LSTM Units: {lstm_units}, Layers: {num_layers}, Dropout: {dropout_rate}, "
          f"Learning Rate: {learning_rate}, Batch Size: {batch_size}")

    # Drop Keras global state left over from the previous configuration so
    # memory does not keep growing while many models are created in a loop.
    tf.keras.backend.clear_session()

    # Columns describing this configuration, shared by every result row
    config = {
        "lstm_units": lstm_units,
        "num_layers": num_layers,
        "dropout_rate": dropout_rate,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "window_size": ws,
    }
    per_day_results = []

    # Training and evaluation loop: train on the first attack, then keep
    # fine-tuning the same model on each following attack.
    for i, attack in enumerate(attacks):
        print(f"Processing {attack}")
        X_train, X_test, y_train, y_test = splits[attack]

        if i == 0:
            # Initial training
            model = create_lstm_model(
                input_shape=X_train.shape[1:],
                lstm_units=lstm_units,
                num_layers=num_layers,
                dropout_rate=dropout_rate,
                learning_rate=learning_rate,
            )
            epochs = 10
        else:
            # Fine-tune
            epochs = 5
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

        # Evaluate on current test set
        print(f"Evaluating Model on Day {i + 1}")
        loss, accuracy, precision, recall = model.evaluate(X_test, y_test, verbose=0)
        per_day_results.append({
            "day": i + 1,
            "loss": loss,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            **config,
        })

        # After the last day, also evaluate on the test sets of all days combined
        if i == last_day:
            cumulative_loss, cumulative_accuracy, cumulative_precision, cumulative_recall = model.evaluate(
                X_cumulative_test, y_cumulative_test, verbose=0)
            cumulative_results_day_5.append({
                **config,
                "loss": cumulative_loss,
                "accuracy": cumulative_accuracy,
                "precision": cumulative_precision,
                "recall": cumulative_recall,
            })

    # Save results to all_results
    all_results.extend(per_day_results)

# Export all results to a CSV
results_df = pd.DataFrame(all_results)
results_filename = os.path.join(output_dir, "lstm_hyperparameter_tuning_results.csv")
results_df.to_csv(results_filename, index=False)
print(f"Results saved to {results_filename}")

# Export cumulative results for Day 5 to a separate CSV
cumulative_results_df = pd.DataFrame(cumulative_results_day_5)
cumulative_results_filename = os.path.join(output_dir, "cumulative_results_day_5.csv")
cumulative_results_df.to_csv(cumulative_results_filename, index=False)
print(f"Cumulative results for Day 5 saved to {cumulative_results_filename}")


#### Fine-tuning the number of features used (ratio = 0.0, window size = 5)

In [None]:
# Feature-count experiment. For every entry of `num_features` a model is
# trained on the first attack ("day 1") and fine-tuned on each following
# attack. It is evaluated on the current day, on all days seen so far, on
# every previous day and, from per-day weight snapshots, on every future day.
previous_data_ratio = 0.0 # ratio of previous training data to be used
ws = 5 # window size
for n_features in num_features:
    print(f"\nExperiment with {len(features) - n_features} features removed...")

    # Remove features from the start
    start_remove = len(features) - n_features
    curr_features = features[start_remove:]
    print(f"Using features: {curr_features}")

    # Weight snapshot taken after each day's training. The model object itself
    # keeps being fine-tuned, so storing references to it would make every
    # "day" point at the final weights; snapshots keep the days distinct.
    day_weights = []
    results = []
    per_day_results = []  # Store per-day evaluations
    future_days_evaluation = {}  # Store future days' evaluations
    past_evaluation_results = {}  # Store evaluations on previous days' test sets
    attacks = ['SYN', 'ICMP', 'UDP', 'DNS', 'GTPU']

    # Initialize cumulative test sets
    cumulative_X_test = []
    cumulative_y_test = []

    # Initialize a dictionary to store test sets for future and past evaluations
    test_sets = {}

    # Buffers of training windows sampled from earlier days (only non-empty samples are stored)
    previous_normal_data = []
    previous_attack_data = []

    # Training and fine-tuning loop
    for i, attack in enumerate(attacks):
        print(f"Processing {attack}")

        # 0) Get Data
        data = dataframes[attack]
        data = data.drop(columns=['attack', 'malicious', 'attack_number'])

        # 1) Normalize Data
        print("Normalizing Data")
        data_norm = scale_df(data, curr_features)

        # 2) Create Sequences
        print("Creating Sequences")
        data_norm['_time'] = pd.to_datetime(data_norm['_time'])  # Parse time column
        data_norm.sort_values(by=['imeisv', '_time'], inplace=True)  # Sort by UE ID and time

        all_sequences = []
        all_labels = []
        target = 'binary_label'
        for (imeisv, event), group in data_norm.groupby(['imeisv', 'event']):
            if len(group) > ws:  # Only process groups with enough data
                sequences, labels = create_sequences_for_group(group, curr_features, target, ws)
                all_sequences.extend(sequences)
                all_labels.extend(labels)

        # 3) Split train/test
        print("Splitting Sequences")
        X = np.array(all_sequences)
        y = np.array(all_labels)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # Save current test set for evaluations
        test_sets[i] = (X_test, y_test)

        # Append current test set to cumulative test sets
        cumulative_X_test.append(X_test)
        cumulative_y_test.append(y_test)

        # Concatenate cumulative test sets for evaluation
        X_cumulative_test = np.concatenate(cumulative_X_test, axis=0)
        y_cumulative_test = np.concatenate(cumulative_y_test, axis=0)

        # Stratified mixing of previous data (skipped while the buffers are
        # empty, which is always the case when previous_data_ratio is 0)
        if i > 0 and (previous_normal_data or previous_attack_data):
            print(f"Mixing {previous_data_ratio * 100:.0f}% of previous days' data with current day's data")

            # Replayed windows: all normal samples first, then all attack
            # samples, labelled 0 and 1 respectively
            replay_X = previous_normal_data + previous_attack_data
            replay_y = ([np.zeros(len(chunk)) for chunk in previous_normal_data]
                        + [np.ones(len(chunk)) for chunk in previous_attack_data])

            # Combine sampled previous data with current day's training data
            mixed_X_train = np.concatenate([X_train] + replay_X, axis=0)
            mixed_y_train = np.concatenate([y_train] + replay_y, axis=0)

            # Shuffle the combined data
            mixed_X_train, mixed_y_train = shuffle(mixed_X_train, mixed_y_train, random_state=42)
        else:
            mixed_X_train = X_train
            mixed_y_train = y_train

        # Train or fine-tune the model
        if i == 0:
            # Initial training
            model = create_lstm_model_binary(input_shape=X_train.shape[1:])
            model.fit(mixed_X_train, mixed_y_train, epochs=10, batch_size=64, validation_split=0.2)
        else:
            # Fine-tune
            model.fit(mixed_X_train, mixed_y_train, epochs=5, batch_size=64, validation_split=0.2)

        # Snapshot this day's weights for the future-day evaluation below
        day_weights.append(model.get_weights())

        # Per-Day Evaluation
        print(f"Evaluating Model of Day {i + 1} on Day {i + 1} Test Set...")
        per_day_loss, per_day_accuracy, per_day_precision, per_day_recall = model.evaluate(X_test, y_test, verbose=0)
        per_day_results.append({
            "day": i + 1,
            "loss": per_day_loss,
            "accuracy": per_day_accuracy,
            "precision": per_day_precision,
            "recall": per_day_recall,
        })
        print(f"Day {i + 1} Per-Day Test Loss: {per_day_loss:.4f}, Accuracy: {per_day_accuracy:.4f}, "
              f"Precision: {per_day_precision:.4f}, Recall: {per_day_recall:.4f}")

        # Evaluate on cumulative test set
        cumulative_loss, cumulative_accuracy, cumulative_precision, cumulative_recall = model.evaluate(X_cumulative_test, y_cumulative_test, verbose=0)
        results.append({
            "day": i + 1,
            "loss": cumulative_loss,
            "accuracy": cumulative_accuracy,
            "precision": cumulative_precision,
            "recall": cumulative_recall
        })
        print(f"Day {i+1} Cumulative Test Loss: {cumulative_loss:.4f}, Accuracy: {cumulative_accuracy:.4f}")

        # Evaluate on previous days' test sets
        if i > 0:
            print(f"Evaluating Model of Day {i + 1} on Previous Days' Test Sets:")
            for prev_day in range(i):
                prev_X_test, prev_y_test = test_sets[prev_day]
                prev_loss, prev_accuracy, prev_precision, prev_recall = model.evaluate(prev_X_test, prev_y_test, verbose=0)
                # Save results
                past_evaluation_results[(i + 1, prev_day + 1)] = {
                    "model_day": i + 1,
                    "test_day": prev_day + 1,
                    "loss": prev_loss,
                    "accuracy": prev_accuracy,
                    "precision": prev_precision,
                    "recall": prev_recall,
                }
                print(f"  Model of Day {i + 1} -> Test Set Day {prev_day + 1}: Loss = {prev_loss:.4f}, "
                      f"Accuracy = {prev_accuracy:.4f}, Precision = {prev_precision:.4f}, Recall = {prev_recall:.4f}")

        # Sampling data from the current day's training set for future use.
        # This has to happen inside the training loop so that the next day's
        # training can actually mix the samples in.
        print(f"Sampling data from current training set for future use...")
        num_replay_samples = int(previous_data_ratio * len(y_train))
        if num_replay_samples > 0:
            # Half of the budget goes to normal windows, the rest to attack windows
            num_normal_samples = num_replay_samples // 2
            num_attack_samples = num_replay_samples - num_normal_samples

            # Shuffle and sample normal data
            shuffled_normal_X = shuffle(X_train[y_train == 0], random_state=42)
            sampled_normal_X = shuffled_normal_X[:num_normal_samples]
            if len(sampled_normal_X):
                previous_normal_data.append(sampled_normal_X)

            # Shuffle and sample attack data
            shuffled_attack_X = shuffle(X_train[y_train == 1], random_state=42)
            sampled_attack_X = shuffled_attack_X[:num_attack_samples]
            if len(sampled_attack_X):
                previous_attack_data.append(sampled_attack_X)

    # Perform evaluations on future test sets
    print("\nPerforming Future Test Set Evaluations:")
    for i, weights in enumerate(day_weights):
        print(f"Evaluating Model of Day {i + 1} on Future Test Sets:")
        model.set_weights(weights)  # put the model back into its state after day i + 1
        for j in range(i + 1, len(attacks)):  # Evaluate on test sets of future days
            future_X_test, future_y_test = test_sets[j]
            future_loss, future_accuracy, future_precision, future_recall = model.evaluate(future_X_test, future_y_test, verbose=0)
            # Save results for future evaluations
            future_days_evaluation[(i + 1, j + 1)] = {
                "model_day": i + 1,
                "future_day": j + 1,
                "loss": future_loss,
                "accuracy": future_accuracy,
                "precision": future_precision,
                "recall": future_recall,
            }
            print(f"  Model of Day {i + 1} -> Future Test Set Day {j + 1}: Loss = {future_loss:.4f}, "
                  f"Accuracy = {future_accuracy:.4f}, Precision = {future_precision:.4f}, Recall = {future_recall:.4f}")

    # Leave `model` holding the final day's weights
    if day_weights:
        model.set_weights(day_weights[-1])

    # Reset previous data buffers after processing all attacks for this feature configuration
    previous_normal_data = []
    previous_attack_data = []
    print("Reset previous data buffers for the next feature configuration.")

    # Export Results to CSV
    print("\nExporting Results to CSV...")
    output_dir = "ft/"
    os.makedirs(output_dir, exist_ok=True)

    per_day_filename = f"{output_dir}per_day_results_seq_{ws}_removed_{len(features) - n_features}.csv"
    cumulative_filename = f"{output_dir}cumulative_results_seq_{ws}_removed_{len(features) - n_features}.csv"
    past_filename = f"{output_dir}past_evaluation_results_seq_{ws}_removed_{len(features) - n_features}.csv"
    future_filename = f"{output_dir}future_days_evaluation_seq_{ws}_removed_{len(features) - n_features}.csv"

    # Saving is best-effort: a failure is reported and the next configuration still runs
    try:
        pd.DataFrame(per_day_results).to_csv(per_day_filename, index=False)
        pd.DataFrame(results).to_csv(cumulative_filename, index=False)
        pd.DataFrame(past_evaluation_results).T.reset_index(drop=True).to_csv(past_filename, index=False)
        pd.DataFrame(future_days_evaluation).T.reset_index(drop=True).to_csv(future_filename, index=False)
        print(f"Results saved: {output_dir}")
    except Exception as e:
        print(f"Error saving results: {e}")


#### Experiments to determine the best ratio of previous data to use (window size = 5, features = 14)

In [None]:

# Previous-data-ratio experiment. For every ratio a model is trained on the
# first attack ("day 1") and fine-tuned on each following attack, with a
# sample of earlier days' training windows mixed into the current day's data.

# Define the range of previous data ratios to experiment with
previous_data_ratios = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]

# Experiment parameters
fixed_window_size = 5  # Fixed window size
fixed_n_features = 14  # Constant number of features to keep
start_remove = len(features) - fixed_n_features
curr_features = features[start_remove:]
print(f"Using features: {curr_features}")

output_dir_base = "ft_experiment_results/"
os.makedirs(output_dir_base, exist_ok=True)

for previous_data_ratio in previous_data_ratios:
    print(f"\n--- Experiment with previous data ratio: {previous_data_ratio} ---")
    output_dir = os.path.join(output_dir_base, f"ratio_{int(previous_data_ratio * 100):02d}")
    os.makedirs(output_dir, exist_ok=True)

    results = []
    per_day_results = []  # Store per-day evaluations
    future_days_evaluation = {}  # Not populated by this experiment
    past_evaluation_results = {}  # Not populated by this experiment
    attacks = ['SYN', 'ICMP', 'UDP', 'DNS', 'GTPU']

    # Initialize cumulative test sets
    cumulative_X_test = []
    cumulative_y_test = []

    # Per-day test sets. Initialised here so the cell does not depend on a
    # `test_sets` dictionary left behind by another cell.
    test_sets = {}

    # Initialize buffers for sampled data
    previous_normal_data = []
    previous_attack_data = []

    # Training and fine-tuning loop
    for i, attack in enumerate(attacks):
        print(f"Processing {attack}")

        # 0) Get Data
        data = dataframes[attack]
        data = data.drop(columns=['attack', 'malicious', 'attack_number'])

        # 1) Normalize Data
        print("Normalizing Data")
        data_norm = scale_df(data, curr_features)

        # 2) Create Sequences
        print("Creating Sequences")
        data_norm['_time'] = pd.to_datetime(data_norm['_time'])  # Parse time column
        data_norm.sort_values(by=['imeisv', '_time'], inplace=True)  # Sort by UE ID and time

        all_sequences = []
        all_labels = []
        target = 'binary_label'
        for (imeisv, event), group in data_norm.groupby(['imeisv', 'event']):
            if len(group) > fixed_window_size:  # Only process groups with enough data
                sequences, labels = create_sequences_for_group(group, curr_features, target, fixed_window_size)
                all_sequences.extend(sequences)
                all_labels.extend(labels)

        # 3) Split train/test
        print("Splitting Sequences")
        X = np.array(all_sequences)
        y = np.array(all_labels)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

        # Save current test set for evaluations
        test_sets[i] = (X_test, y_test)

        # Append current test set to cumulative test sets
        cumulative_X_test.append(X_test)
        cumulative_y_test.append(y_test)

        # Concatenate cumulative test sets for evaluation
        X_cumulative_test = np.concatenate(cumulative_X_test, axis=0)
        y_cumulative_test = np.concatenate(cumulative_y_test, axis=0)

        # Stratified mixing of previous data
        if i > 0 and (previous_normal_data or previous_attack_data):
            print(f"Mixing {previous_data_ratio * 100:.0f}% of previous days' data with current day's data")

            # Combine all previous samples
            sampled_X_normal = np.concatenate(previous_normal_data, axis=0)
            sampled_X_attack = np.concatenate(previous_attack_data, axis=0)

            # Combine normal and attack samples (labelled 0 and 1 respectively)
            sampled_previous_X = np.concatenate((sampled_X_normal, sampled_X_attack), axis=0)
            sampled_previous_y = np.concatenate((np.zeros(len(sampled_X_normal)), np.ones(len(sampled_X_attack))), axis=0)

            # Combine sampled previous data with current day's training data
            mixed_X_train = np.concatenate((X_train, sampled_previous_X), axis=0)
            mixed_y_train = np.concatenate((y_train, sampled_previous_y), axis=0)

            # Shuffle the combined data
            mixed_X_train, mixed_y_train = shuffle(mixed_X_train, mixed_y_train, random_state=42)
        else:
            mixed_X_train = X_train
            mixed_y_train = y_train

        # Train or fine-tune the model
        if i == 0:
            # Initial training
            model = create_lstm_model_binary(input_shape=mixed_X_train.shape[1:])
            model.fit(mixed_X_train, mixed_y_train, epochs=10, batch_size=64, validation_split=0.2)
        else:
            # Fine-tune
            model.fit(mixed_X_train, mixed_y_train, epochs=5, batch_size=64, validation_split=0.2)

        # Per-Day Evaluation
        print(f"Evaluating Model of Day {i + 1} on Day {i + 1} Test Set...")
        per_day_loss, per_day_accuracy, per_day_precision, per_day_recall = model.evaluate(X_test, y_test, verbose=0)
        per_day_results.append({
            "day": i + 1,
            "loss": per_day_loss,
            "accuracy": per_day_accuracy,
            "precision": per_day_precision,
            "recall": per_day_recall,
        })
        print(f"Day {i + 1} Per-Day Test Loss: {per_day_loss:.4f}, Accuracy: {per_day_accuracy:.4f}, "
              f"Precision: {per_day_precision:.4f}, Recall: {per_day_recall:.4f}")

        # Evaluate on cumulative test set
        cumulative_loss, cumulative_accuracy, cumulative_precision, cumulative_recall = model.evaluate(X_cumulative_test, y_cumulative_test, verbose=0)
        results.append({
            "day": i + 1,
            "loss": cumulative_loss,
            "accuracy": cumulative_accuracy,
            "precision": cumulative_precision,
            "recall": cumulative_recall
        })
        print(f"Day {i+1} Cumulative Test Loss: {cumulative_loss:.4f}, Accuracy: {cumulative_accuracy:.4f}")

        # Sampling data from the current day's training set for future use
        print(f"Sampling data from current training set for future use...")
        normal_indices = y_train == 0
        attack_indices = y_train == 1

        # Shuffle and sample normal data (half of the sampling budget)
        shuffled_normal_X = shuffle(X_train[normal_indices], random_state=42)
        num_normal_samples = int(previous_data_ratio * len(y_train)) // 2
        previous_normal_data.append(shuffled_normal_X[:num_normal_samples])

        # Shuffle and sample attack data (the remainder of the budget)
        shuffled_attack_X = shuffle(X_train[attack_indices], random_state=42)
        num_attack_samples = int(previous_data_ratio * len(y_train)) - num_normal_samples
        previous_attack_data.append(shuffled_attack_X[:num_attack_samples])

    # Reset previous data buffers after processing all attacks
    previous_normal_data = []
    previous_attack_data = []
    print("Reset previous data buffers for the next previous_data_ratio configuration.")

    # Export Results to CSV
    print("\nExporting Results to CSV...")

    per_day_filename = f"{output_dir}/per_day_results_removed_{len(features) - fixed_n_features}.csv"
    cumulative_filename = f"{output_dir}/cumulative_results_removed_{len(features) - fixed_n_features}.csv"

    # Saving is best-effort: a failure is reported and the next ratio still runs
    try:
        pd.DataFrame(per_day_results).to_csv(per_day_filename, index=False)
        pd.DataFrame(results).to_csv(cumulative_filename, index=False)
        print(f"Results saved to {output_dir}")
    except Exception as e:
        print(f"Error saving results: {e}")


#### Experiments for ideal window size (ratio=0.3, features=14)

In [None]:
# Ratio for mixing previous training data with the current day
previous_data_ratio = 0.3

# Experiment parameters
window_size = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
n_features = 14  # Fixed number of features

for ws in window_size:
    print(f"\n--- Window size: {ws} ---")
    
    print(f"\nExperiment with {len(features) - n_features} features removed...")
    
    # Remove features from the start
    start_remove = len(features) - n_features
    curr_features = features[start_remove:]
    print(f"Using features: {curr_features}")
    
    models = []
    results = []
    per_day_results = []  # Store per-day evaluations
    future_days_evaluation = {}  # Store future days' evaluations
    past_evaluation_results = {}  # Store evaluations on previous days' test sets
    attacks = ['SYN', 'ICMP', 'UDP', 'DNS', 'GTPU']
    
    # Initialize cumulative test sets
    cumulative_X_test = []
    cumulative_y_test = []
    
    # Initialize a dictionary to store test sets for future and past evaluations
    test_sets = {}
    
    # Initialize buffers for sampled data 
    previous_normal_data = []
    previous_attack_data = []

    # Training and fine-tuning loop
    for i, attack in enumerate(attacks):
        print(f"Processing {attack}")
        
        # 0) Get Data
        data = dataframes[attack]
        data = data.drop(columns=['attack', 'malicious', 'attack_number'])
    
        # 1) Normalize Data
        print("Normalizing Data")
        data_norm = scale_df(data, curr_features)
    
        # 2) Create Sequences
        print("Creating Sequences")
        data_norm['_time'] = pd.to_datetime(data_norm['_time'])  # Parse time column
        data_norm.sort_values(by=['imeisv', '_time'], inplace=True)  # Sort by UE ID and time
    
        all_sequences = []
        all_labels = []
        target = 'binary_label'
        for (imeisv, event), group in data_norm.groupby(['imeisv', 'event']):
            if len(group) > ws:  # Only process groups with enough data
                sequences, labels = create_sequences_for_group(group, curr_features, target, ws)
                all_sequences.extend(sequences)
                all_labels.extend(labels)
    
        # 3) Split train/test
        print("Splitting Sequences")
        X = np.array(all_sequences)
        y = np.array(all_labels)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
        
        # Save current test set for evaluations
        test_sets[i] = (X_test, y_test)
        
        # Append current test set to cumulative test sets
        cumulative_X_test.append(X_test)
        cumulative_y_test.append(y_test)
        
        # Concatenate cumulative test sets for evaluation
        X_cumulative_test = np.concatenate(cumulative_X_test, axis=0)
        y_cumulative_test = np.concatenate(cumulative_y_test, axis=0)
    
        # Stratified mixing of previous data
        if i > 0 and (previous_normal_data or previous_attack_data):
            print(f"Mixing {previous_data_ratio * 100:.0f}% of previous days' data with current day's data")
            
            # Combine all previous samples
            sampled_X_normal = np.concatenate(previous_normal_data, axis=0)
            sampled_X_attack = np.concatenate(previous_attack_data, axis=0)

            # Combine normal and attack samples
            sampled_previous_X = np.concatenate((sampled_X_normal, sampled_X_attack), axis=0)
            sampled_previous_y = np.concatenate((np.zeros(len(sampled_X_normal)), np.ones(len(sampled_X_attack))), axis=0)

            # Combine sampled previous data with current day's training data
            mixed_X_train = np.concatenate((X_train, sampled_previous_X), axis=0)
            mixed_y_train = np.concatenate((y_train, sampled_previous_y), axis=0)

            # Shuffle the combined data
            mixed_X_train, mixed_y_train = shuffle(mixed_X_train, mixed_y_train, random_state=42)
        else:
            mixed_X_train = X_train
            mixed_y_train = y_train

        # Train or fine-tune the model
        if i == 0:
            # Initial training
            model = create_lstm_model_binary(input_shape=mixed_X_train.shape[1:])
            model.fit(mixed_X_train, mixed_y_train, epochs=10, batch_size=64, validation_split=0.2)
        else:
            # Fine-tune
            model.fit(mixed_X_train, mixed_y_train, epochs=5, batch_size=64, validation_split=0.2)
    
        # Save the model
        models.append(model)
    
        # Per-day evaluation: score the freshly trained model on today's test split only.
        # evaluate() is unpacked into four values; their order (loss, accuracy,
        # precision, recall) presumably mirrors the metrics the model was compiled
        # with - verify against create_lstm_model_binary.
        print(f"Evaluating Model of Day {i + 1} on Day {i + 1} Test Set...")
        per_day_loss, per_day_accuracy, per_day_precision, per_day_recall = model.evaluate(
            X_test, y_test, verbose=0
        )
        per_day_results.append(
            dict(
                day=i + 1,
                loss=per_day_loss,
                accuracy=per_day_accuracy,
                precision=per_day_precision,
                recall=per_day_recall,
            )
        )
        print(
            f"Day {i + 1} Per-Day Test Loss: {per_day_loss:.4f}, Accuracy: {per_day_accuracy:.4f}, "
            f"Precision: {per_day_precision:.4f}, Recall: {per_day_recall:.4f}"
        )

        # Cumulative evaluation: same model, scored on the test splits of all days seen so far
        cumulative_loss, cumulative_accuracy, cumulative_precision, cumulative_recall = model.evaluate(
            X_cumulative_test, y_cumulative_test, verbose=0
        )
        results.append(
            dict(
                day=i + 1,
                loss=cumulative_loss,
                accuracy=cumulative_accuracy,
                precision=cumulative_precision,
                recall=cumulative_recall,
            )
        )
        print(f"Day {i+1} Cumulative Test Loss: {cumulative_loss:.4f}, Accuracy: {cumulative_accuracy:.4f}")
    
        # Backward evaluation: score today's model on each earlier day's test split.
        # Results are keyed by (model_day, test_day), both 1-based.
        if i > 0:
            print(f"Evaluating Model of Day {i + 1} on Previous Days' Test Sets:")
            for prev_day in range(i):
                prev_X_test, prev_y_test = test_sets[prev_day]
                prev_loss, prev_accuracy, prev_precision, prev_recall = model.evaluate(prev_X_test, prev_y_test, verbose=0)
                # Save results
                past_evaluation_results[(i + 1, prev_day + 1)] = {
                    "model_day": i + 1,
                    "test_day": prev_day + 1,
                    "loss": prev_loss,
                    "accuracy": prev_accuracy,
                    "precision": prev_precision,
                    "recall": prev_recall,
                }
                print(f"  Model of Day {i + 1} -> Test Set Day {prev_day + 1}: Loss = {prev_loss:.4f}, "
                      f"Accuracy = {prev_accuracy:.4f}, Precision = {prev_precision:.4f}, Recall = {prev_recall:.4f}")

        # Sampling data from the current day's training set for future use.
        # This must run on EVERY day, including the first one. It used to be nested
        # under `if i > 0`, so the first day's data never reached the replay buffers
        # and the second day was fine-tuned without any previous data mixed in.
        print("Sampling data from current training set for future use...")
        normal_indices = y_train == 0  # label 0 is treated as normal traffic
        attack_indices = y_train == 1  # label 1 is treated as attack traffic

        # Total number of samples to keep from this day, split evenly between the
        # two classes (the attack class receives the odd remainder).
        replay_budget = int(previous_data_ratio * len(y_train))

        # Shuffle and sample normal data. Slicing yields fewer rows without error
        # when the class has fewer samples than requested.
        shuffled_normal_X = shuffle(X_train[normal_indices], random_state=42)
        num_normal_samples = replay_budget // 2
        previous_normal_data.append(shuffled_normal_X[:num_normal_samples])

        # Shuffle and sample attack data
        shuffled_attack_X = shuffle(X_train[attack_indices], random_state=42)
        num_attack_samples = replay_budget - num_normal_samples
        previous_attack_data.append(shuffled_attack_X[:num_attack_samples])
            
    # All days of this feature configuration are done: rebind both replay buffers
    # to fresh empty lists so the next configuration starts without carried-over samples.
    previous_normal_data, previous_attack_data = [], []
    print("Reset previous data buffers for the next feature configuration.")
  
    # Export the collected result tables to CSV, one file per table. File names
    # encode the window size (`ws`) and the number of features removed from the full list.
    print("\nExporting Results to CSV...")
    output_dir = "lstm_finetuning_window_size/"
    os.makedirs(output_dir, exist_ok=True)

    per_day_filename = f"{output_dir}per_day_results_seq_{ws}_removed_{len(features) - n_features}.csv"
    cumulative_filename = f"{output_dir}cumulative_results_seq_{ws}_removed_{len(features) - n_features}.csv"
    past_filename = f"{output_dir}past_evaluation_results_seq_{ws}_removed_{len(features) - n_features}.csv"
    future_filename = f"{output_dir}future_days_evaluation_seq_{ws}_removed_{len(features) - n_features}.csv"

    # Best-effort export: any failure is reported and the run continues.
    # The first failing step skips the remaining files.
    try:
        # Lists of per-day records map directly to rows
        per_day_frame = pd.DataFrame(per_day_results)
        per_day_frame.to_csv(per_day_filename, index=False)

        cumulative_frame = pd.DataFrame(results)
        cumulative_frame.to_csv(cumulative_filename, index=False)

        # Dict-based results are transposed so each entry becomes a row, and the
        # resulting index is dropped before writing
        past_frame = pd.DataFrame(past_evaluation_results).T.reset_index(drop=True)
        past_frame.to_csv(past_filename, index=False)

        future_frame = pd.DataFrame(future_days_evaluation).T.reset_index(drop=True)
        future_frame.to_csv(future_filename, index=False)

        print(f"Results saved: {output_dir}")
    except Exception as e:
        print(f"Error saving results: {e}")