In [1]:
import pandas as pd

In [2]:
# Location of the preprocessed dataset CSV, relative to the notebook's working directory.
preprocessed_dataset = 'edge_iiot_preprocessed.csv'  # Replace with your actual path
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns.
df = pd.read_csv(preprocessed_dataset, low_memory=False)

In [4]:
# Preview the first rows to sanity-check the loaded columns and value ranges.
df.head()

Unnamed: 0,time_difference,arp.opcode,arp.hw.size,icmp.checksum,icmp.seq_le,http.content_length,http.request.method,http.referer,http.request.version,http.response,...,mqtt.hdrflags,mqtt.len,mqtt.msgtype,mqtt.proto_len,mqtt.protoname,mqtt.topic,mqtt.topic_len,mqtt.ver,Attack_label,Attack_type
0,-0.000309,-0.047835,0.0,-0.219227,-0.220062,-0.040216,0.131863,0.235181,-0.005951,0,...,-0.300891,-0.281368,-0.300891,0.0,-0.460693,-0.46068,0.0,0.0,0,Normal
1,0.000818,-0.047835,0.0,-0.219227,-0.220062,-0.040216,0.131863,0.235181,-0.005951,0,...,0.044291,1.180232,0.044291,4.0,3.395124,-0.46068,0.0,4.0,0,Normal
2,-0.000308,-0.047835,0.0,-0.219227,-0.220062,-0.040216,0.131863,0.235181,-0.005951,0,...,-0.300891,-0.281368,-0.300891,0.0,-0.460693,-0.46068,0.0,0.0,0,Normal
3,-0.000304,-0.047835,0.0,-0.219227,-0.220062,-0.040216,0.131863,0.235181,-0.005951,0,...,0.389473,-0.037768,0.389473,0.0,-0.460693,-0.46068,0.0,0.0,0,Normal
4,-0.00026,-0.047835,0.0,-0.219227,-0.220062,-0.040216,0.131863,0.235181,-0.005951,0,...,0.734654,4.468833,0.734654,0.0,-0.460693,3.395354,24.0,0.0,0,Normal


In [3]:
# Features: every column except the two label columns.
X = df.drop(columns=['Attack_label', 'Attack_type'])
# Target: the binary attack indicator (0 or 1).
y = df.loc[:, 'Attack_label']

In [None]:
import numpy as np

def packets_last_10s(time_diff_series, threshold=1.0):
    """Count, for every packet, how many packets lie in its trailing time window.

    Each entry of ``time_diff_series`` is treated as the gap that precedes the
    packet at that position. For packet ``i`` the function walks backwards,
    accumulating gaps, and counts packets for as long as the time accumulated
    *before* adding the next gap is still ``<= threshold``. Packet ``i`` itself
    is therefore always counted.

    Despite the name, the window length is whatever ``threshold`` is (default
    1.0); the name is kept for backward compatibility.

    Args:
        time_diff_series: 1-D sequence of inter-packet gaps (pandas Series,
            NumPy array, list, ...). Values are read positionally, so a
            non-default Series index is fine.
        threshold: Window length, in the same units as the gaps.

    Returns:
        list[int]: one count per input element, in input order.

    Note:
        Worst-case cost is O(n * window_size); a window that spans most of the
        data is quadratic.
    """
    # Convert once to plain Python floats: positional access without the
    # per-element ``.iloc`` overhead, and any 1-D sequence type is accepted.
    gaps = np.asarray(time_diff_series, dtype=float).tolist()

    result = []
    for i in range(len(gaps)):
        total_time = 0.0
        count = 0
        j = i
        # The check happens before the gap is added, so the packet whose
        # distance from packet i is still within the window gets counted.
        while j >= 0 and total_time <= threshold:
            total_time += gaps[j]
            count += 1
            j -= 1
        result.append(count)
    return result

# Add the trailing-window packet count as a new feature column on X, using a
# window of 10.0 in the units of `time_difference`.
# NOTE(review): the df.head() preview shows `time_difference` values that look
# standardized (some are negative) — confirm a threshold of 10.0 is meaningful
# on that scale. This call can be slow on a large frame (nested Python loop).
X['packet_count_10s'] = packets_last_10s(X['time_difference'], threshold=10.0)


In [7]:
X.columns

Index(['time_difference', 'arp.opcode', 'arp.hw.size', 'icmp.checksum',
       'icmp.seq_le', 'http.content_length', 'http.request.method',
       'http.referer', 'http.request.version', 'http.response', 'tcp.ack',
       'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin',
       'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack',
       'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.seq', 'udp.stream',
       'udp.time_delta', 'dns.qry.name', 'dns.qry.qu', 'dns.retransmission',
       'dns.retransmit_request', 'dns.retransmit_request_in',
       'mqtt.conack.flags', 'mqtt.conflag.cleansess', 'mqtt.conflags',
       'mqtt.hdrflags', 'mqtt.len', 'mqtt.msgtype', 'mqtt.proto_len',
       'mqtt.protoname', 'mqtt.topic', 'mqtt.topic_len', 'mqtt.ver'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader, TensorDataset

# Step 1: Drop non-numeric or high-cardinality categorical columns.
# The list has to be defined in this cell: with it commented out, the drop
# below raised NameError on a fresh kernel. Adjust the list as needed;
# errors='ignore' makes names that are absent from X a no-op.
columns_to_drop = ['mqtt.topic', 'dns.qry.name']
X_clean = X.drop(columns=columns_to_drop, errors='ignore')

# Step 2: Scale features (disabled).
# NOTE(review): the df.head() preview suggests the CSV is already standardized
# — confirm. If scaling is re-enabled, fit the scaler on the training split
# only (after Step 3) so test statistics do not leak into training.
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X_clean)

# Step 3: Train/test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.2, random_state=42)

# Step 4: Convert to PyTorch tensors.
# torch.tensor cannot consume a pandas DataFrame directly, so go through a
# float32 NumPy array first. Labels are int64 (long), as classification
# losses that take class indices expect.
X_train_tensor = torch.tensor(X_train.to_numpy(dtype='float32'), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)

X_test_tensor = torch.tensor(X_test.to_numpy(dtype='float32'), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Step 5: Create DataLoaders (only the training data is shuffled)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    """Small 1-D CNN that classifies a flat feature vector into two classes.

    Pipeline: add a channel axis -> Conv1d(1 -> 8, kernel 3, padding 1) -> ReLU
    -> MaxPool1d(2) -> flatten -> Linear(-> 32) -> ReLU -> Linear(-> 2).

    Args:
        num_features: Length of each input feature vector.
    """

    def __init__(self, num_features):
        super().__init__()
        conv_channels = 8
        # padding=1 with kernel 3 keeps the length; MaxPool1d(2) then halves it
        # (flooring for odd lengths).
        pooled_length = num_features // 2
        self.conv1 = nn.Conv1d(1, conv_channels, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(2)
        self.fc1 = nn.Linear(pooled_length * conv_channels, 32)
        self.fc2 = nn.Linear(32, 2)  # one logit per Attack_label class

    def forward(self, x):
        """Return raw class logits of shape (batch_size, 2) for input (batch_size, num_features)."""
        with_channel = x.unsqueeze(1)                      # (batch, 1, num_features)
        activated = F.relu(self.conv1(with_channel))       # (batch, 8, num_features)
        pooled = self.pool(activated)                      # (batch, 8, num_features // 2)
        flat = pooled.view(pooled.size(0), -1)             # (batch, 8 * (num_features // 2))
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# X_np and y_np are the input features and target labels as NumPy arrays.
# They were not defined anywhere in the notebook, so this cell raised NameError
# on a fresh kernel; build them from the feature frame X and label series y.
# NOTE(review): assumes every column of X is numeric — to_numpy raises
# ValueError otherwise. Confirm X / y are the intended source.
X_np = X.to_numpy(dtype=np.float64)
y_np = y.to_numpy()

# Step 1: Split data into training and validation sets.
# Note: this rebinds X_train / y_train, replacing any earlier objects with
# those names by NumPy arrays.
X_train, X_val, y_train, y_val = train_test_split(X_np, y_np, test_size=0.2, random_state=42)

# Step 2: Define the sigmoid function
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Only ``exp`` of non-positive values is ever evaluated, so large-magnitude
    inputs cannot overflow (the naive form overflows in ``exp(-x)`` for very
    negative ``x`` and emits a RuntimeWarning).

    Args:
        x: Scalar or array-like of real values.

    Returns:
        Array of the same shape as ``x`` with values in [0, 1]; a NumPy scalar
        when ``x`` is a scalar.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) — the same function.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return result[()]  # unwrap 0-d arrays to a scalar, leave n-d arrays as-is

# Step 3: Modified fitness function with L2 regularization and validation loss
def fitness(position):
    """PSO objective: regularised logistic-regression loss for one particle.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Args:
        position: 1-D array of length ``n_features + 1``; the first
            ``X_train.shape[1]`` entries are the weights and the last entry is
            the bias.

    Returns:
        float: training binary cross-entropy + 0.01 * sum(weights**2)
        + validation binary cross-entropy. Lower is better.

    NOTE(review): the validation loss is part of the optimised objective, so
    the validation split is not an unbiased hold-out for the resulting model.
    """
    # Keep weights 1-D so the logits are shape (n,), matching 1-D labels.
    # Reshaping weights to (n_features, 1) produced (n, 1) probabilities that
    # broadcast against (n,) labels into an (n, n) matrix — a wrong loss and
    # quadratic memory use.
    weights = position[:X_train.shape[1]]
    bias = position[-1]  # Last value is bias

    lambda_reg = 0.01  # L2 (ridge) penalty strength
    eps = 1e-8         # keeps log() away from log(0)

    def _binary_cross_entropy(features, labels):
        # Mean binary cross-entropy of the current weights/bias on one split.
        probs = sigmoid(features @ weights + bias)
        labels = np.ravel(labels)  # also accepts (n, 1) column labels
        return -np.mean(labels * np.log(probs + eps) + (1 - labels) * np.log(1 - probs + eps))

    train_loss = _binary_cross_entropy(X_train, y_train) + lambda_reg * np.sum(weights ** 2)
    val_loss = _binary_cross_entropy(X_val, y_val)

    # Combine training and validation loss
    return train_loss + val_loss

# Step 4: Initialize PSO parameters
np.random.seed(42)  # make the swarm reproducible (same seed as the data split)
n_particles = 30
n_iterations = 10
n_features = X_train.shape[1]

# Each particle is a candidate [weights..., bias] vector.
particles_positions = np.random.randn(n_particles, n_features + 1)
particles_velocities = np.random.randn(n_particles, n_features + 1)

# Personal bests start at the initial positions.
particles_best_positions = particles_positions.copy()
particles_best_losses = np.array([fitness(p) for p in particles_positions])

# Start the global best at the best initial particle instead of an unrelated
# random vector with infinite loss, so it is always an evaluated solution.
_initial_best = int(np.argmin(particles_best_losses))
global_best_position = particles_best_positions[_initial_best].copy()  # Weights + bias
global_best_loss = float(particles_best_losses[_initial_best])

# Step 5: Run the PSO algorithm
# Swarm hyper-parameters are loop-invariant, so they are set once up front.
inertia = 0.5        # fraction of the previous velocity that is kept
c1, c2 = 2.0, 2.0    # pull towards the personal best / the global best

for iteration in range(n_iterations):
    for i, position in enumerate(particles_positions):
        # Evaluate fitness (loss) for each particle
        current_loss = fitness(position)

        # Update personal best if current loss is better
        # (assigning into a row copies the values).
        if current_loss < particles_best_losses[i]:
            particles_best_losses[i] = current_loss
            particles_best_positions[i] = position

        # Update global best if the current loss is better.
        # `position` is a view into particles_positions; without .copy() the
        # in-place position update below would silently move the stored best.
        if current_loss < global_best_loss:
            global_best_loss = current_loss
            global_best_position = position.copy()

    # Update the velocity and position for each particle
    r1 = np.random.rand(n_particles, n_features + 1)
    r2 = np.random.rand(n_particles, n_features + 1)

    particles_velocities = (inertia * particles_velocities +
                            c1 * r1 * (particles_best_positions - particles_positions) +
                            c2 * r2 * (global_best_position - particles_positions))

    particles_positions += particles_velocities

    # Print progress for each iteration
    print(f"Iteration {iteration + 1}/{n_iterations} - Best Loss: {global_best_loss:.4f}")

# Step 6: After optimization, evaluate the best model
weights = global_best_position[:n_features].reshape(-1, 1)
bias = global_best_position[-1]

# Predict on validation set (probability >= 0.5 -> class 1)
logits_val = X_val @ weights + bias
probs_val = sigmoid(logits_val)
y_pred = (probs_val >= 0.5).astype(int)

# Calculate accuracy on the validation set.
# This is reported as validation accuracy, not test accuracy: the data is the
# validation split, and the fitness function also uses X_val / y_val, so the
# figure is not an unbiased hold-out estimate.
accuracy = accuracy_score(y_val, y_pred)
print(f"Final Validation Accuracy: {accuracy:.4f}")


In [15]:
import pickle

# Save the best PSO solution (weights and bias) to a file using pickle.
# The dict is named `pso_model` rather than `model` so it does not rebind the
# name the training loop uses for the PyTorch network (`model.train()`,
# `model(batch_x)`), which would break a top-to-bottom run.
pso_model = {
    'weights': global_best_position[:n_features].reshape(-1, 1),  # column vector (n_features, 1)
    'bias': global_best_position[-1]
}

# Save the model to a file.
# Reminder: only unpickle files from a trusted source — pickle.load can
# execute arbitrary code.
with open('pso_optimized_logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(pso_model, f)

print("Model saved successfully!")


Model saved successfully!


In [12]:
import sys

# Train for a fixed number of epochs, reporting train accuracy and test
# accuracy after each one.
# NOTE(review): `model`, `optimizer` and `criterion` are not defined in any
# visible cell; they must already exist in the kernel. `model` has to be a
# torch nn.Module (it is called with .train() / .eval()) — confirm where these
# are created before relying on Restart & Run All.
epochs = 10
for epoch in range(epochs):
    model.train()  # training mode for this epoch
    correct = 0
    total = 0

    for batch_idx, (batch_x, batch_y) in enumerate(train_loader):
        # Standard optimisation step: clear grads, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # torch.max over dim 1 returns (values, indices); the indices are the
        # predicted class ids.
        _, predicted = torch.max(outputs.data, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()

        # Refresh the progress line every 100 batches and on the last batch;
        # the leading "\r" rewrites the same console line instead of adding new ones.
        if (batch_idx + 1) % 100 == 0 or (batch_idx + 1) == len(train_loader):
            print(
                f"\rEpoch {epoch+1}/{epochs} - Batch {batch_idx+1}/{len(train_loader)}", 
                end=""
            )
            sys.stdout.flush()

    # Running accuracy over the epoch (measured while the weights are still changing).
    train_acc = 100 * correct / total
    print(f"  Train Accuracy: {train_acc:.2f}%", end="")

    # Evaluate on test set
    model.eval()  # evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # no gradient tracking needed for evaluation
        for batch_x, batch_y in test_loader:
            outputs = model(batch_x)
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    test_acc = 100 * correct / total
    # NOTE(review): the recorded output shows 100.00% train and test accuracy
    # from the first epoch — worth checking the features for label leakage.
    print(f"  Test Accuracy: {test_acc:.2f}%")


Epoch 1/10 - Batch 23810/23810  Train Accuracy: 100.00%  Test Accuracy: 100.00%
Epoch 2/10 - Batch 23810/23810  Train Accuracy: 100.00%  Test Accuracy: 100.00%
Epoch 3/10 - Batch 23810/23810  Train Accuracy: 100.00%  Test Accuracy: 100.00%
Epoch 4/10 - Batch 21400/23810

KeyboardInterrupt: 