In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
file_paths = [
    "../data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "../data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "../data/Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "../data/Monday-WorkingHours.pcap_ISCX.csv",
    "../data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "../data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "../data/Tuesday-WorkingHours.pcap_ISCX.csv",
    "../data/Wednesday-workingHours.pcap_ISCX.csv"
]

In [3]:
# Step 1: Load and Concatenate Datasets
dataframes = []
for path in file_paths:
    try:
        df = pd.read_csv(path)
        dataframes.append(df)
    except Exception as e:
        print(f"Error loading {path}: {e}")

# Combine all data into a single dataframe
combined_df = pd.concat(dataframes, ignore_index=True)

In [4]:
# Step 2: Data Cleaning
# Drop any columns with all NaN values and drop rows with any NaN values
cleaned_df = combined_df.dropna(axis=1, how='all')  # Drop columns with all NaN
cleaned_df = cleaned_df.dropna()  # Drop rows with any NaN values

# Remove leading and trailing whitespaces from column names
cleaned_df.columns = cleaned_df.columns.str.strip()

In [5]:
# Step 3: Feature Selection
# Separate features and labels
features = cleaned_df.drop(columns=['Label'])
labels = cleaned_df['Label']

In [6]:
# Step 4: Handle Infinite Values
# Replace infinite values with NaN and then drop rows containing NaN values
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.dropna(inplace=True)

# Update labels to match the cleaned features
labels = labels[features.index]

In [7]:
# Step 5: Label Encoding
# Encode categorical labels into numerical values
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [8]:
# Step 6: Standardization
# Standardize the feature columns to have zero mean and unit variance
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [9]:
# Step 7: Train, Validation, Test Split
# Split data into training, validation, and test sets (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(scaled_features, encoded_labels, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display the size of each dataset
print(f"Training set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (1696725, 78)
Validation set size: (565575, 78)
Test set size: (565576, 78)


In [10]:
# Display the unique values and their range for the encoded labels
unique_labels = np.unique(encoded_labels)
label_range = (unique_labels.min(), unique_labels.max())

(unique_labels, label_range)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]), (0, 14))

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import random

In [12]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [13]:
class IncrementalLearningModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(IncrementalLearningModel, self).__init__()
        # Improved feedforward neural network with additional hidden layers and more neurons
        self.fc1 = nn.Linear(input_size, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(128, 64)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(64, 32)
        self.relu4 = nn.ReLU()
        self.fc5 = nn.Linear(32, num_classes)
        
        # Initialize weights
        self.init_weights()

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.fc4(x)
        x = self.relu4(x)
        x = self.fc5(x)
        
        # Check for NaN or infinite values in output
        assert torch.isfinite(x).all(), "Model output contains NaN or infinite values"
        return x

    def incremental_train(self, train_data, train_labels, val_data, val_labels, num_epochs=50, lr=1e-4):
        # Ensure labels are of type LongTensor
        train_labels = train_labels.long()
        val_labels = val_labels.long()
        
        # Set model to training mode
        self.train()
        
        # Define loss function and optimizer
        class_weights = torch.ones(self.fc5.out_features).to(device)  # Assuming balanced classes initially
        criterion = nn.CrossEntropyLoss(weight=class_weights)
        optimizer = optim.Adam(self.parameters(), lr=lr)  # Changed back to Adam for faster convergence
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)  # Learning rate scheduler
        
        # Create DataLoader for training and validation data
        train_dataset = TensorDataset(train_data, train_labels)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_dataset = TensorDataset(val_data, val_labels)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

        # Training loop
        for epoch in range(num_epochs):
            running_loss = 0.0
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = self(inputs)
                loss = criterion(outputs, targets)

                # Check for NaN or infinite values in loss
                assert torch.isfinite(loss).all(), "Loss contains NaN or infinite values"

                # Backward pass
                loss.backward()

                # Clip gradients to prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)

                # Check gradients for NaN or infinite values
                for name, param in self.named_parameters():
                    if param.grad is not None:
                        assert torch.isfinite(param.grad).all(), f"Gradient for {name} contains NaN or infinite values"

                # Optimize
                optimizer.step()

                running_loss += loss.item()

            # Step the learning rate scheduler
            scheduler.step()

            # Validation step
            val_loss = 0.0
            self.eval()
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    outputs = self(inputs)
                    loss = criterion(outputs, targets)
                    val_loss += loss.item()
            self.train()

            print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}, Learning Rate: {scheduler.get_last_lr()[0]:.6f}")

    def train_model(self, X_train, y_train, X_val, y_val, num_epochs=50, lr=1e-4, poisoning_rate=0.0, poisoning_strategy='label_flip'):
        # Convert numpy arrays to torch tensors
        train_data = torch.tensor(X_train, dtype=torch.float32).to(device)
        train_labels = torch.tensor(y_train, dtype=torch.long).to(device)
        val_data = torch.tensor(X_val, dtype=torch.float32).to(device)
        val_labels = torch.tensor(y_val, dtype=torch.long).to(device)
        
        # Apply data poisoning if specified
        if poisoning_rate > 0:
            train_data, train_labels = poison_data(train_data, train_labels, poisoning_rate, poisoning_strategy)
        
        # Train the model
        self.incremental_train(train_data, train_labels, val_data, val_labels, num_epochs, lr)

    def predict(self, data):
        # Set model to evaluation mode
        self.eval()
        with torch.no_grad():
            data = torch.tensor(data, dtype=torch.float32).to(device)
            outputs = self(data)
            _, predicted = torch.max(outputs, 1)
        return predicted

    def evaluate(self, X_test, y_test):
        # Convert numpy arrays to torch tensors
        test_data = torch.tensor(X_test, dtype=torch.float32).to(device)
        test_labels = torch.tensor(y_test, dtype=torch.long).to(device)
        
        # Predict the labels for the test data
        predictions = self.predict(test_data)
        # Calculate accuracy and F1 score
        accuracy = accuracy_score(test_labels.cpu().numpy(), predictions.cpu().numpy())
        f1 = f1_score(test_labels.cpu().numpy(), predictions.cpu().numpy(), average='weighted')
        print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
        return accuracy, f1

In [14]:
# Function to handle catastrophic forgetting using Replay mechanism
def replay_mechanism(model, previous_data, new_data, previous_labels, new_labels, num_epochs=5, lr=1e-4):
    # Convert numpy arrays to torch tensors
    previous_data = torch.tensor(previous_data, dtype=torch.float32).to(device)
    new_data = torch.tensor(new_data, dtype=torch.float32).to(device)
    previous_labels = torch.tensor(previous_labels, dtype=torch.long).to(device)
    new_labels = torch.tensor(new_labels, dtype=torch.long).to(device)
    
    # Ensure label values are in the correct range
    assert previous_labels.min() >= 0 and previous_labels.max() < model.fc5.out_features, f"Previous labels are out of range: min={previous_labels.min()}, max={previous_labels.max()}"
    assert new_labels.min() >= 0 and new_labels.max() < model.fc5.out_features, f"New labels are out of range: min={new_labels.min()}, max={new_labels.max()}"
    
    # Check for NaN or infinite values in data
    assert torch.isfinite(previous_data).all(), "Previous data contains NaN or infinite values"
    assert torch.isfinite(new_data).all(), "New data contains NaN or infinite values"
    
    # Combine previous and new data
    combined_data = torch.cat((previous_data, new_data), dim=0)
    combined_labels = torch.cat((previous_labels, new_labels), dim=0)

    # Retrain the model with combined dataset
    model.incremental_train(combined_data, combined_labels, combined_data, combined_labels, num_epochs, lr)

In [15]:
# Function to generate poisoned data
def poison_data(data, labels, poisoning_rate=0.1, strategy='label_flip'):
    """
    Generate poisoned data.

    Parameters:
    - data: np.array or torch.Tensor, original input data
    - labels: np.array or torch.Tensor, original labels
    - poisoning_rate: float, proportion of data to be poisoned
    - strategy: str, the type of poisoning strategy ('label_flip' or 'feature_perturbation')

    Returns:
    - poisoned_data: torch.Tensor, data with poisoning applied
    - poisoned_labels: torch.Tensor, labels with poisoning applied
    """
    data = data.clone() if isinstance(data, torch.Tensor) else torch.tensor(data, dtype=torch.float32)
    labels = labels.clone() if isinstance(labels, torch.Tensor) else torch.tensor(labels, dtype=torch.long)

    num_samples = data.shape[0]
    num_poisoned = int(poisoning_rate * num_samples)
    poisoned_indices = random.sample(range(num_samples), num_poisoned)

    if strategy == 'label_flip':
        poisoned_labels = labels.clone()
        # Flip labels to the next class (cyclically)
        for idx in poisoned_indices:
            poisoned_labels[idx] = (labels[idx] + 1) % len(torch.unique(labels))
        return data, poisoned_labels

    elif strategy == 'feature_perturbation':
        poisoned_data = data.clone()
        # Add small perturbations to features of the poisoned data
        perturbation = torch.randn_like(poisoned_data[poisoned_indices]) * 0.1
        poisoned_data[poisoned_indices] += perturbation
        return poisoned_data, labels

    else:
        raise ValueError("Unsupported poisoning strategy. Use 'label_flip' or 'feature_perturbation'.")

In [16]:
# Experiment to evaluate model under different poisoning levels and use replay mechanism
def run_poisoning_experiment_with_replay(model, X_train, y_train, X_val, y_val, X_test, y_test, poisoning_rates, replay_data, replay_labels, num_epochs=50, lr=1e-4):
    results = []

    for poisoning_rate in poisoning_rates:
        print(f"\nRunning experiment with poisoning rate: {poisoning_rate:.2f}")

        # Initialize a new model for each experiment
        model_instance = IncrementalLearningModel(input_size=X_train.shape[1], num_classes=len(np.unique(y_train))).to(device)

        # Poison data
        poisoned_data, poisoned_labels = poison_data(X_train, y_train, poisoning_rate=poisoning_rate, strategy='label_flip')

        # Train model
        model_instance.train_model(poisoned_data, poisoned_labels, X_val, y_val, num_epochs=num_epochs, lr=lr)

        # Evaluate model before replay mechanism
        print("\nEvaluating model before replay...")
        accuracy_before, f1_before = model_instance.evaluate(X_test, y_test)

        # Apply replay mechanism
        print("\nApplying replay mechanism...")
        replay_mechanism(model_instance, replay_data, poisoned_data[:50], replay_labels, poisoned_labels[:50], num_epochs=5, lr=lr)

        # Evaluate model after replay mechanism
        print("\nEvaluating model after replay...")
        accuracy_after, f1_after = model_instance.evaluate(X_test, y_test)

        # Store results
        results.append({
            "poisoning_rate": poisoning_rate,
            "accuracy_before": accuracy_before,
            "f1_score_before": f1_before,
            "accuracy_after": accuracy_after,
            "f1_score_after": f1_after
        })

    return results

In [None]:
if __name__ == "__main__":
    
    input_size = X_train.shape[1]  # Number of features in the dataset
    num_classes = len(np.unique(y_train))  # Number of classes

    model = IncrementalLearningModel(input_size, num_classes).to(device)

    # Run poisoning experiments with replay mechanism
    poisoning_rates = [0.0, 0.1, 0.2, 0.3]  # Different levels of poisoning rates to evaluate
    replay_data = X_train[:50]
    replay_labels = y_train[:50]

    experiment_results = run_poisoning_experiment_with_replay(model, X_train, y_train, X_val, y_val, X_test, y_test, poisoning_rates, replay_data, replay_labels)

    # Print results
    for result in experiment_results:
        print(f"Poisoning Rate: {result['poisoning_rate']:.2f}, "
              f"Accuracy Before Replay: {result['accuracy_before']:.4f}, F1 Score Before Replay: {result['f1_score_before']:.4f}, "
              f"Accuracy After Replay: {result['accuracy_after']:.4f}, F1 Score After Replay: {result['f1_score_after']:.4f}")


Running experiment with poisoning rate: 0.00


  train_data = torch.tensor(X_train, dtype=torch.float32).to(device)
  train_labels = torch.tensor(y_train, dtype=torch.long).to(device)


Epoch [1/50], Train Loss: 0.0737, Val Loss: 0.0573, Learning Rate: 0.000100
Epoch [2/50], Train Loss: 0.0504, Val Loss: 0.0507, Learning Rate: 0.000100


In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Function to visualize the impact of poisoned data on model performance
def visualize_poisoning_impact(results):
    poisoning_rates = [result["poisoning_rate"] for result in results]
    accuracies = [result["accuracy"] for result in results]
    f1_scores = [result["f1_score"] for result in results]

    # Plot Accuracy vs Poisoning Rate
    plt.figure(figsize=(12, 5))
    
    # Accuracy plot
    plt.subplot(1, 2, 1)
    plt.plot(poisoning_rates, accuracies, marker='o', linestyle='-', color='b', label='Accuracy')
    plt.xlabel('Poisoning Rate')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs Poisoning Rate')
    plt.grid(True)
    plt.ylim(0, 1)
    plt.xticks(poisoning_rates)
    plt.legend()

    # F1 Score plot
    plt.subplot(1, 2, 2)
    plt.plot(poisoning_rates, f1_scores, marker='o', linestyle='-', color='r', label='F1 Score')
    plt.xlabel('Poisoning Rate')
    plt.ylabel('F1 Score')
    plt.title('F1 Score vs Poisoning Rate')
    plt.grid(True)
    plt.ylim(0, 1)
    plt.xticks(poisoning_rates)
    plt.legend()

    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    experiment_results = [
        {"poisoning_rate": 0.0, "accuracy": 0.85, "f1_score": 0.83},
        {"poisoning_rate": 0.1, "accuracy": 0.75, "f1_score": 0.73},
        {"poisoning_rate": 0.2, "accuracy": 0.65, "f1_score": 0.62},
        {"poisoning_rate": 0.3, "accuracy": 0.55, "f1_score": 0.52},
    ]

    # Visualize the impact of poisoning
    visualize_poisoning_impact(experiment_results)

In [None]:
def visualize_replay_impact(results):
    poisoning_rates = [result["poisoning_rate"] for result in results]
    accuracy_before = [result["accuracy_before"] for result in results]
    accuracy_after = [result["accuracy_after"] for result in results]
    f1_before = [result["f1_score_before"] for result in results]
    f1_after = [result["f1_score_after"] for result in results]

    # Plot Accuracy before and after Replay
    plt.figure(figsize=(12, 5))
    
    # Accuracy bar plot
    plt.subplot(1, 2, 1)
    width = 0.3
    x = np.arange(len(poisoning_rates))
    plt.bar(x - width/2, accuracy_before, width, label='Before Replay', color='b')
    plt.bar(x + width/2, accuracy_after, width, label='After Replay', color='g')
    plt.xlabel('Poisoning Rate')
    plt.ylabel('Accuracy')
    plt.title('Accuracy Before and After Replay')
    plt.xticks(x, [f"{rate:.2f}" for rate in poisoning_rates])
    plt.ylim(0, 1)
    plt.legend()

    # F1 Score bar plot
    plt.subplot(1, 2, 2)
    plt.bar(x - width/2, f1_before, width, label='Before Replay', color='r')
    plt.bar(x + width/2, f1_after, width, label='After Replay', color='g')
    plt.xlabel('Poisoning Rate')
    plt.ylabel('F1 Score')
    plt.title('F1 Score Before and After Replay')
    plt.xticks(x, [f"{rate:.2f}" for rate in poisoning_rates])
    plt.ylim(0, 1)
    plt.legend()

    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    experiment_results_with_replay = [
        {"poisoning_rate": 0.0, "accuracy_before": 0.85, "f1_score_before": 0.83, "accuracy_after": 0.85, "f1_score_after": 0.83},
        {"poisoning_rate": 0.1, "accuracy_before": 0.75, "f1_score_before": 0.73, "accuracy_after": 0.78, "f1_score_after": 0.76},
        {"poisoning_rate": 0.2, "accuracy_before": 0.65, "f1_score_before": 0.62, "accuracy_after": 0.70, "f1_score_after": 0.68},
        {"poisoning_rate": 0.3, "accuracy_before": 0.55, "f1_score_before": 0.52, "accuracy_after": 0.60, "f1_score_after": 0.58},
    ]

    # Visualize the impact of replay mechanism
    visualize_replay_impact(experiment_results_with_replay)