### Moving Forward to Layer 2 Implementation  

The implementation of Layer 1 (Anomaly Detection & Feature Extraction) is now complete. However, for further modifications and validation, we require more dataset variations or additional traffic patterns. Constructing these datasets will take some time.  

In the meantime, I'm now proceeding with the **implementation of Layer 2 (Attack Classification & Adaptive Learning)**.  

### Key Next Steps:
- **Dataset Construction:** Since Layer 2 relies on anomalous samples detected by Layer 1, we will integrate the Layer 1 code with the Layer 2 pipeline.  
- **Feature Extraction:** Extracting CNN-enhanced features from Layer 1 to improve classification performance.  
- **Attack Classification Model:** Implementing a CNN-BiLSTM model with Knowledge Distillation for efficient and scalable attack classification.  

In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import joblib
import matplotlib.pyplot as plt

In [20]:
# Loading and preprocessing dataset
def preprocess_data(file_path, test_size=0.2, random_state=42):
    """
    Load and preprocess dataset for ANIDS.
    - Applies robust scaling
    - Removes outliers using IQR
    - Splits data into training & validation sets
    """
    df = pd.read_csv(file_path)
    X = df.drop(['Attack_label'], axis=1)
    
    # Outlier removal using IQR
    Q1, Q3 = X.quantile(0.25), X.quantile(0.75)
    IQR = Q3 - Q1
    X = X[~((X < (Q1 - 3 * IQR)) | (X > (Q3 + 3 * IQR))).any(axis=1)]
    
    # Scaling
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)
    joblib.dump(scaler, 'robust_scaler.pkl')

    return train_test_split(X_scaled, test_size=test_size, random_state=random_state)

# Load dataset
X_train, X_val = preprocess_data("/Users/siddhantgond/Desktop/6THSEM/Project_Elective/Adaptive-Network-Intrusion-Detection-System/Implementaiton/training_dataset.csv")


In [21]:
# Define Layer 1 of the Adaptive NIDS
class AdaptiveNIDSLayer1:
    def __init__(self, input_dim, latent_dim=16):
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.model = self._build_autoencoder()

    def _build_autoencoder(self):
        inputs = layers.Input(shape=(self.input_dim,))
        x = layers.BatchNormalization()(inputs)
        x = layers.Reshape((-1, 1))(x)

        # Feature extraction via Residual CNN
        x = layers.Conv1D(16, 3, activation='relu', padding='same')(x)
        x = layers.Conv1D(32, 3, activation='relu', padding='same')(x)
        x = layers.GlobalAveragePooling1D()(x)

        # Latent Representation
        x = layers.Dense(64, activation='mish', kernel_regularizer=regularizers.l1(0.0005))(x)
        x = layers.Dropout(0.3)(x)
        encoded = layers.Dense(self.latent_dim, activation='linear')(x)

        # Decoder
        x = layers.RepeatVector(self.input_dim)(encoded)
        x = layers.LSTM(self.latent_dim * 2, return_sequences=True, recurrent_dropout=0.25)(x)
        decoded = layers.TimeDistributed(layers.Dense(1, activation='linear'))(x)
        decoded = layers.Flatten()(decoded)

        autoencoder = keras.Model(inputs=inputs, outputs=decoded)
        autoencoder.compile(optimizer=keras.optimizers.Adam(1e-4), loss='mse')
        return autoencoder

    def train(self, X_train, X_val, epochs=50):
        self.model.fit(X_train, X_train, epochs=epochs, batch_size=64, validation_data=(X_val, X_val))

    def detect_anomalies(self, X_data, threshold=0.02):
        reconstructed = self.model.predict(X_data)
        errors = np.mean(np.square(X_data - reconstructed), axis=1)
        return X_data[errors > threshold], np.where(errors > threshold)[0]

# Train Layer 1
layer1 = AdaptiveNIDSLayer1(input_dim=X_train.shape[1])
layer1.train(X_train, X_val)

# Detect anomalies
anomalies, anomaly_indices = layer1.detect_anomalies(X_val)

Epoch 1/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - loss: 0.2371 - val_loss: 0.2149
Epoch 2/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - loss: 0.2108 - val_loss: 0.1954
Epoch 3/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - loss: 0.1945 - val_loss: 0.1790
Epoch 4/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - loss: 0.1813 - val_loss: 0.1649
Epoch 5/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - loss: 0.1703 - val_loss: 0.1525
Epoch 6/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - loss: 0.1555 - val_loss: 0.1412
Epoch 7/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - loss: 0.1443 - val_loss: 0.1316
Epoch 8/50
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - loss: 0.1368 - val_loss: 0.1233
Epoch 9/50
[1m189/189[0m [32m

In [23]:
# Extracting features from Layer 1 for Layer 2
def extract_features(model, X_anomalies):
    feature_extractor = keras.Model(inputs=model.input, outputs=model.get_layer("global_average_pooling1d").output)
    return feature_extractor.predict(X_anomalies)

X_layer2 = extract_features(layer1.model, anomalies)


ValueError: No such layer: global_average_pooling1d. Existing layers are: ['input_layer_3', 'batch_normalization_3', 'reshape_1', 'conv1d_4', 'conv1d_5', 'global_average_pooling1d_1', 'dense_7', 'dropout_3', 'dense_8', 'repeat_vector_1', 'lstm_1', 'time_distributed_1', 'flatten_1'].

In [24]:
# Preparing dataset for Layer 2
def attach_attack_labels(original_df, anomaly_indices):
    labeled_anomalies = original_df.iloc[anomaly_indices]
    return labeled_anomalies.drop(columns=['Attack_label']), labeled_anomalies['Attack_label']

# Load original dataset for labels
original_df = pd.read_csv("/Users/siddhantgond/Desktop/6THSEM/Project_Elective/Adaptive-Network-Intrusion-Detection-System/Implementaiton/training_dataset.csv")
X_layer2, y_layer2 = attach_attack_labels(original_df, anomaly_indices)

In [25]:
# Define Layer 2 of the Adaptive NIDS
class AdaptiveNIDSLayer2:
    def __init__(self, input_dim, num_classes, seq_length=10):
        """
        Initializes Layer 2 for attack classification using CNN-BiLSTM.
        
        Args:
            input_dim (int): Number of input features per time step.
            num_classes (int): Number of attack classes.
            seq_length (int): Number of time steps in sequence.
        """
        self.input_dim = input_dim
        self.num_classes = num_classes
        self.seq_length = seq_length
        self.model = self._build_model()

    def _build_model(self):
        """
        Builds the CNN-BiLSTM classification model.
        """
        inputs = layers.Input(shape=(self.seq_length, self.input_dim))
        
        # CNN Feature Extraction
        x = layers.Conv1D(64, kernel_size=3, activation='relu', padding='same')(inputs)
        x = layers.BatchNormalization()(x)
        
        # BiLSTM for Temporal Sequence Learning
        x = layers.Bidirectional(layers.GRU(48, return_sequences=False))(x)
        
        # Fully Connected Layers
        x = layers.Dense(64, activation='relu')(x)
        x = layers.Dropout(0.2)(x)
        
        # Output Layer (Softmax for Classification)
        outputs = layers.Dense(self.num_classes, activation='softmax')(x)

        # Compile Model
        model = keras.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=keras.optimizers.Adam(1e-3), 
                      loss='sparse_categorical_crossentropy', 
                      metrics=['accuracy'])
        return model

    def train(self, X_train, y_train, epochs=50, batch_size=64):
        """
        Trains the Layer 2 model.
        
        Args:
            X_train (np.array): Input sequences.
            y_train (np.array): Attack labels.
            epochs (int): Number of training epochs.
            batch_size (int): Batch size.
        """
        self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)


def create_sequences(data, labels, seq_length=10):
    """
    Converts feature data into time-series sequences for Layer 2.
    
    Args:
        data (np.array): 2D feature matrix.
        labels (pd.Series or np.array): Corresponding labels.
        seq_length (int): Number of time steps per sequence.
    
    Returns:
        Tuple of (sequential data, adjusted labels)
    """
    sequences, seq_labels = [], []
    
    # Convert labels to NumPy array to avoid indexing issues
    labels = np.array(labels)

    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])
        seq_labels.append(labels[i + seq_length - 1])  # ✅ Now works correctly

    return np.array(sequences), np.array(seq_labels)

# Generate sequences for Layer 2
seq_length = 10
X_layer2_reshaped, y_layer2_adjusted = create_sequences(X_layer2, y_layer2, seq_length=seq_length)

# Train Layer 2
layer2 = AdaptiveNIDSLayer2(input_dim=X_layer2_reshaped.shape[2], num_classes=5, seq_length=seq_length)
layer2.train(X_layer2_reshaped, y_layer2_adjusted)

Epoch 1/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5838 - loss: 1.0874 - val_accuracy: 0.8537 - val_loss: 0.3313
Epoch 2/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9504 - loss: 0.1208 - val_accuracy: 0.9099 - val_loss: 0.2789
Epoch 3/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9737 - loss: 0.0649 - val_accuracy: 0.9381 - val_loss: 0.1425
Epoch 4/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9821 - loss: 0.0488 - val_accuracy: 0.9587 - val_loss: 0.0755
Epoch 5/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9924 - loss: 0.0290 - val_accuracy: 0.9625 - val_loss: 0.0920
Epoch 6/50
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9856 - loss: 0.0318 - val_accuracy: 0.9831 - val_loss: 0.0437
Epoch 7/50
[1m34/34[0m [32m━━━━━━

In [36]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

def knowledge_distillation(teacher_model, student_model, X_train, y_train, temperature=3.0, alpha=0.5, epochs=50):
    """
    Implements knowledge distillation by training the student model with soft targets from the teacher.

    Args:
        teacher_model: Pre-trained teacher model.
        student_model: Student model to be trained.
        X_train (np.array): Training features.
        y_train (np.array): Training labels (should be a NumPy array).
        temperature (float): Softmax temperature for distillation.
        alpha (float): Weight balance between hard loss and soft loss.
        epochs (int): Number of training epochs.
    """
    # ✅ Ensure y_train is a NumPy array
    y_train = np.array(y_train)

    # ✅ Ensure labels are integers (for sparse categorical crossentropy)
    if len(y_train.shape) > 1 and y_train.shape[1] > 1:
        y_train = np.argmax(y_train, axis=1)  # Convert one-hot to integer labels

    # Get number of classes for one-hot encoding
    num_classes = student_model.output_shape[-1]
    if isinstance(num_classes, tf.TensorShape):
        num_classes = num_classes.as_list()[-1]  # Fix for unknown TensorShape
    
    # Step 1: Get teacher predictions first
    print("Getting teacher predictions...")
    teacher_logits = teacher_model.predict(X_train)
    teacher_probs = tf.nn.softmax(teacher_logits / temperature).numpy()
    
    # Step 2: Define custom loss function for distillation
    def distillation_loss(y_true, y_pred):
        """
        Computes the knowledge distillation loss:
        - Hard loss: Student's predictions vs. true labels (Sparse Categorical Crossentropy)
        - Soft loss: Student's predictions vs. Teacher's soft probabilities (KL Divergence)
        """
        # Get index of the batch in the dataset
        batch_indices = tf.range(tf.shape(y_true)[0])
        
        # Convert y_true to one-hot format (needed for loss calculation)
        y_true_one_hot = tf.one_hot(tf.cast(tf.squeeze(y_true), tf.int32), depth=num_classes)
        
        # Get teacher soft targets for this batch
        # We use a more robust approach that doesn't depend on keeping the teacher predictions in memory
        batch_teacher_probs = tf.nn.softmax(teacher_model(tf.cast(tf.gather(X_train, batch_indices), tf.float32), training=False) / temperature)
        
        # Get student soft predictions
        student_logits = y_pred
        student_soft_probs = tf.nn.softmax(student_logits / temperature)
        
        # Compute losses
        hard_loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, student_logits)
        soft_loss = tf.keras.losses.KLDivergence()(batch_teacher_probs, student_soft_probs)
        
        # Weighted combination of hard and soft loss
        return (1 - alpha) * hard_loss + alpha * soft_loss * (temperature ** 2)

    # Step 3: Compile the student model with distillation loss
    student_model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
                          loss=distillation_loss,
                          metrics=['accuracy'])

    # Step 4: Train the student model using distillation
    print("Training the student model with knowledge distillation...")
    student_model.fit(X_train, y_train, epochs=epochs, batch_size=64, verbose=1)

    return student_model