In [4]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import sys

In [25]:
data_path = 'F:/Documents/CRCE/Project/NIDS/dataset/Edge-IIoT/Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.csv'  # Replace with your actual path
df = pd.read_csv(data_path, low_memory=False)
df.head()

Unnamed: 0,frame.time,ip.src_host,ip.dst_host,arp.dst.proto_ipv4,arp.opcode,arp.hw.size,arp.src.proto_ipv4,icmp.checksum,icmp.seq_le,icmp.transmit_timestamp,...,mqtt.proto_len,mqtt.protoname,mqtt.topic,mqtt.topic_len,mqtt.ver,mbtcp.len,mbtcp.trans_id,mbtcp.unit_id,Attack_label,Attack_type
0,2021 11:44:10.081753000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
1,2021 11:44:10.162218000,192.168.0.101,192.168.0.128,0,0.0,0.0,0,0.0,0.0,0.0,...,4.0,MQTT,0,0.0,4.0,0.0,0.0,0.0,0,Normal
2,2021 11:44:10.162271000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
3,2021 11:44:10.162641000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
4,2021 11:44:10.166132000,192.168.0.101,192.168.0.128,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,Temperature_and_Humidity,24.0,0.0,0.0,0.0,0.0,0,Normal


In [26]:
# -------------------- 1. Preprocessing -------------------- #
# Drop unnecessary columns
drop_columns = [
    "frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4", "arp.dst.proto_ipv4",
    "http.file_data", "http.request.full_uri", "icmp.transmit_timestamp", "http.request.uri.query",
    "tcp.options", "tcp.payload", "tcp.srcport", "tcp.dstport", "udp.port", "mqtt.msg"
]
df.drop(drop_columns, axis=1, inplace=True, errors='ignore')

# Remove rows with NaN or duplicates
df.dropna(axis=0, how='any', inplace=True)
df.drop_duplicates(subset=None, keep="first", inplace=True)

# Remove leading/trailing spaces from strings
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

# Shuffle data
df = shuffle(df)

# Helper function to one-hot encode categorical columns
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name], prefix=name)
    df = pd.concat([df, dummies], axis=1)
    return df.drop(name, axis=1)

# Encode categorical features
for col in ['http.request.method', 'http.referer', 'http.request.version',
            'dns.qry.name.len', 'mqtt.conack.flags', 'mqtt.protoname', 'mqtt.topic']:
    if col in df.columns:
        df = encode_text_dummy(df, col)

# Drop any remaining non-numeric columns
# df = df.select_dtypes(include=[np.number])

# Save 'Attack_type' before dropping non-numeric columns
labels = df['Attack_type']

# Drop any remaining non-numeric columns (preserve only numeric features)
df = df.select_dtypes(include=[np.number]).copy()

# Add 'Attack_type' column back
df['Attack_type'] = labels

In [27]:
df.columns

Index(['arp.opcode', 'arp.hw.size', 'icmp.checksum', 'icmp.seq_le',
       'icmp.unused', 'http.content_length', 'http.response', 'http.tls_port',
       'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin',
       'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack',
       'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.seq', 'udp.stream',
       'udp.time_delta', 'dns.qry.name', 'dns.qry.qu', 'dns.qry.type',
       'dns.retransmission', 'dns.retransmit_request',
       'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.conflags',
       'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mqtt.msgtype',
       'mqtt.proto_len', 'mqtt.topic_len', 'mqtt.ver', 'mbtcp.len',
       'mbtcp.trans_id', 'mbtcp.unit_id', 'Attack_label', 'Attack_type'],
      dtype='object')

In [28]:
# -------------------- 2. Feature Prep -------------------- #
# Separate labels and features
if 'Attack_type' in df.columns:
    labels = df['Attack_type']
    df.drop(['Attack_type'], axis=1, inplace=True)
else:
    raise ValueError("Attack_type column is missing.")

# Optional: also drop 'Attack_label' if still present
df.drop(['Attack_label'], axis=1, inplace=True, errors='ignore')

# Encode target labels
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

# Scale features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(df)

# Convert to tensors
X = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(1)  # Add channel dim
y = torch.tensor(labels_encoded, dtype=torch.long)


In [29]:
# -------------------- 3. Data Split -------------------- #
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, stratify=y_trainval, random_state=42)

batch_size = 64
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size)


In [30]:
# -------------------- 4. CNN Model -------------------- #
num_class = len(le.classes_)

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(64, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(32, 16, kernel_size=3, padding=1)        
        self.fc1 = nn.Linear(X.shape[2] * 16, 30)
        self.out_layer = nn.Linear(30, num_class)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.max_pool1d(x, kernel_size=1)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return F.log_softmax(self.out_layer(x), dim=1)

In [31]:
# -------------------- 5. Training -------------------- #
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss()
epochs = 10

for epoch in range(epochs):
    model.train()
    total = correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        pred = output.argmax(dim=1)
        correct += pred.eq(target).sum().item()
        total += target.size(0)

        print(f"\rEpoch {epoch+1}/{epochs} - Batch {batch_idx+1}/{len(train_loader)} - Loss: {loss.item():.4f}", end='')

    train_acc = correct / total
    print(f"\nEpoch {epoch+1} Training Accuracy: {train_acc:.4f}")

    # Validation
    model.eval()
    val_preds, val_true = [], []
    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1)
            val_preds.extend(pred.cpu().numpy())
            val_true.extend(target.cpu().numpy())

    val_acc = accuracy_score(val_true, val_preds)
    print(f"Validation Accuracy: {val_acc:.4f}")
    print("Classification Report:\n", classification_report(val_true, val_preds, target_names=le.classes_))

Epoch 1/10 - Batch 17904/17904 - Loss: 0.2000
Epoch 1 Training Accuracy: 0.8852
Validation Accuracy: 0.8924
Classification Report:
                        precision    recall  f1-score   support

             Backdoor       0.90      0.41      0.57      4805
            DDoS_HTTP       0.84      0.35      0.49      9709
            DDoS_ICMP       1.00      0.99      0.99     13588
             DDoS_TCP       0.79      0.99      0.88     10012
             DDoS_UDP       1.00      1.00      1.00     24313
       Fingerprinting       0.39      0.69      0.50       171
                 MITM       1.00      0.92      0.96        72
               Normal       0.89      1.00      0.94    272799
             Password       0.80      0.11      0.19      9987
        Port_Scanning       1.00      0.51      0.68      3995
           Ransomware       0.90      0.40      0.55      1938
        SQL_injection       0.86      0.15      0.25     10165
            Uploading       0.63      0.21      

In [32]:
# -------------------- 6. Final Evaluation -------------------- #
model.eval()
test_preds, test_true = [], []
with torch.no_grad():
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1)
        test_preds.extend(pred.cpu().numpy())
        test_true.extend(target.cpu().numpy())

test_acc = accuracy_score(test_true, test_preds)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")
print("Test Classification Report:\n", classification_report(test_true, test_preds, target_names=le.classes_))


Final Test Accuracy: 0.9032
Test Classification Report:
                        precision    recall  f1-score   support

             Backdoor       0.99      0.89      0.94      4805
            DDoS_HTTP       0.97      0.34      0.50      9709
            DDoS_ICMP       1.00      1.00      1.00     13588
             DDoS_TCP       0.78      1.00      0.88     10012
             DDoS_UDP       1.00      1.00      1.00     24314
       Fingerprinting       0.75      0.33      0.46       171
                 MITM       1.00      0.99      0.99        72
               Normal       0.89      1.00      0.94    272800
             Password       0.88      0.16      0.27      9987
        Port_Scanning       1.00      0.49      0.66      3995
           Ransomware       0.84      0.77      0.80      1938
        SQL_injection       0.94      0.17      0.28     10165
            Uploading       0.81      0.20      0.32      7361
Vulnerability_scanner       0.98      0.83      0.90     10