In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import sys
import networkx as nx

In [2]:
# Load data training dan testing
df_train = pd.read_csv('UNSW_NB15_training-set.csv', sep=',', engine='python')
df_test = pd.read_csv('UNSW_NB15_testing-set.csv', sep=',', engine='python')

In [3]:
df_train

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.000011,udp,-,INT,2,0,496,0,90909.090200,...,1,2,0,0,0,1,2,0,Normal,0
1,2,0.000008,udp,-,INT,2,0,1762,0,125000.000300,...,1,2,0,0,0,1,2,0,Normal,0
2,3,0.000005,udp,-,INT,2,0,1068,0,200000.005100,...,1,3,0,0,0,1,3,0,Normal,0
3,4,0.000006,udp,-,INT,2,0,900,0,166666.660800,...,1,3,0,0,0,2,3,0,Normal,0
4,5,0.000010,udp,-,INT,2,0,2126,0,100000.002500,...,1,3,0,0,0,2,3,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,82328,0.000005,udp,-,INT,2,0,104,0,200000.005100,...,1,2,0,0,0,2,1,0,Normal,0
82328,82329,1.106101,tcp,-,FIN,20,8,18062,354,24.410067,...,1,1,0,0,0,3,2,0,Normal,0
82329,82330,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0
82330,82331,0.000000,arp,-,INT,1,0,46,0,0.000000,...,1,1,0,0,0,1,1,1,Normal,0


### Drop Kolom Tak Terpakai & Pisahkan Label

- id : Ini hanya nomor urut atau penanda unik, tidak ada informasi jaringan di dalamnya. Jadi tidak berguna untuk prediksi.
- attack_cat : Ini adalah kategori serangan (misalnya: Fuzzers, DoS, Reconnaissance). Biasanya ini bisa dipakai untuk multi-class classification. Tapi kalau kita hanya ingin mendeteksi serangan vs tidak (binary classification), maka kita cukup pakai label saja.
- label : Ini adalah target/label yang ingin kita prediksi, jadi harus dipisahkan ke y_train, bukan jadi bagian dari input.

In [4]:
drop_cols = ['id', 'attack_cat', 'label']
X_train = df_train.drop(columns=drop_cols)
y_train = df_train['label']

### Encoding Kolom Kategorikal

In [5]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['proto', 'service', 'state']
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le

### Normalisasi Fitur Numerik

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Membangun GNN

Node = Record, Edge = Koneksi berdasarkan kemiripan fitur (cosine similarity). Ini cocok untuk mendeteksi serangan berdasarkan pola data yang mirip satu sama lain.

Membangun struktur graph dari data tabular menggunakan pendekatan K-Nearest Neighbors berdasarkan kemiripan kosinus antar sampel (fitur) dari data tabular agar bisa digunakan di Graph Neural Network (GNN) untuk feature learning dan anomaly detection.

### K-Nearest Neighbors (KNN) Graph Construction

In [7]:
from sklearn.neighbors import NearestNeighbors
import torch

# Set jumlah tetangga (misal: 10 terdekat)
k_neighbors = 10
nn = NearestNeighbors(n_neighbors=k_neighbors + 1, metric='cosine')  # +1 karena termasuk dirinya sendiri
nn.fit(X_scaled)

# Cari tetangga terdekat
distances, indices = nn.kneighbors(X_scaled)

# Buat edge list dari tetangga
edges = []
for i, neighbors in enumerate(indices):
    for j in neighbors[1:]:  # skip dirinya sendiri
        edges.append([i, j])
        edges.append([j, i])  # tambahkan arah sebaliknya

# Konversi ke tensor edge_index
edge_index = torch.tensor(edges, dtype=torch.long).T
print(f"Total edges: {edge_index.shape[1]}")

Total edges: 1646640


Representasi edge dalam bentuk tensor (dua baris: source dan target), yang digunakan dalam PyTorch Geometric (torch_geometric.data.Data).

Graph dibangun secara unsupervised berdasarkan kemiripan semantik dari fitur, yang dipercaya dapat memperkuat performa GNN dalam memahami struktur data.

### Buat Graph Data Object

In [8]:
from torch_geometric.data import Data
import torch

# Konversi fitur dan label ke tensor
x = torch.tensor(X_scaled, dtype=torch.float32)  # node features
y = torch.tensor(y_train.values, dtype=torch.long)  # node labels

# Buat graph data object
graph_data = Data(x=x, edge_index=edge_index, y=y)

print(graph_data)

Data(x=[82332, 42], edge_index=[2, 1646640], y=[82332])


Kode ini digunakan untuk membentuk representasi graf dari data jaringan yang telah diproses sebelumnya, sehingga bisa digunakan sebagai input untuk Graph Neural Network (GNN).

Ini adalah titik awal penting untuk menggunakan GNN karena mengemas semua struktur dan informasi fitur ke dalam satu objek graf yang siap untuk dimodelkan.

## Membuat model GCN (Graph Convolutional Network)

### Membagi Node untuk Pelatihan dan Pengujian

In [9]:
import torch
import numpy as np

# Tentukan indeks untuk training dan testing
num_nodes = graph_data.num_nodes
perm = torch.randperm(num_nodes)

train_ratio = 0.8
train_size = int(train_ratio * num_nodes)

train_idx = perm[:train_size]
test_idx = perm[train_size:]

# Buat mask untuk train/test
graph_data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
graph_data.train_mask[train_idx] = True

graph_data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)
graph_data.test_mask[test_idx] = True

### Definisi Model GCN

Membangun model neural network berbasis struktur graf yang bisa mengekstrak informasi dari node dan hubungannya.

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

In [11]:
class GCNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCNModel, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.embedding_output = None  # simpan output dari layer ke-1 (hidden)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        self.embedding_output = x  # simpan hidden layer sebagai embedding
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

### Melatih Model

Melatih model GCN agar bisa memprediksi label dari node berdasarkan struktur dan fitur.

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Panggil model GCNModel
model = GCNModel(
    input_dim=graph_data.num_node_features,
    hidden_dim=64,
    output_dim=2
).to(device)

data = graph_data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(1, 201):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')


Epoch 10, Loss: 0.3529
Epoch 20, Loss: 0.2725
Epoch 30, Loss: 0.2428
Epoch 40, Loss: 0.2247
Epoch 50, Loss: 0.2129
Epoch 60, Loss: 0.2036
Epoch 70, Loss: 0.1958
Epoch 80, Loss: 0.1883
Epoch 90, Loss: 0.1814
Epoch 100, Loss: 0.1755
Epoch 110, Loss: 0.1706
Epoch 120, Loss: 0.1663
Epoch 130, Loss: 0.1628
Epoch 140, Loss: 0.1617
Epoch 150, Loss: 0.1581
Epoch 160, Loss: 0.1556
Epoch 170, Loss: 0.1534
Epoch 180, Loss: 0.1518
Epoch 190, Loss: 0.1544
Epoch 200, Loss: 0.1504


### Evaluasi

In [14]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluasi di test set
model.eval()
with torch.no_grad():
    out = model(data)
    pred = out.argmax(dim=1)

# Ambil label prediksi dan ground truth dari test_mask
y_true = data.y[data.test_mask].cpu().numpy()
y_pred = pred[data.test_mask].cpu().numpy()

# Evaluasi
acc = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred, target_names=["Normal", "Attack"])

print(f"\nAccuracy: {acc:.4f}")
print(report)


Accuracy: 0.9327
              precision    recall  f1-score   support

      Normal       0.91      0.95      0.93      7391
      Attack       0.96      0.92      0.94      9076

    accuracy                           0.93     16467
   macro avg       0.93      0.93      0.93     16467
weighted avg       0.93      0.93      0.93     16467



- Precision tinggi untuk “Attack” (0.96) artinya model jarang salah mendeteksi serangan.
- Recall tinggi untuk “Normal” (0.95) artinya model juga bagus mengenali aktivitas yang tidak berbahaya.
- F1-score seimbang artinya tidak ada bias besar ke salah satu kelas.

Model GNN kamu berhasil mempelajari representasi dari graph dan fitur node untuk membedakan lalu lintas normal dan serangan.

### Simpan Embedding

In [15]:
model.eval()
with torch.no_grad():
    out = model(graph_data)  # hasil klasifikasi
    gnn_embeddings = model.embedding_output.cpu().numpy()  # hasil feature extraction
    labels_np = graph_data.y.cpu().numpy()

# Integrasi ke Reinforcement Learning

Penggunakan hasil feature extraction dari GCN (yaitu gnn_embeddings) sebagai state untuk agen RL, agar ia bisa belajar mengambil keputusan: apakah suatu node (lalu lintas jaringan) merupakan serangan atau bukan.

In [25]:
from collections import Counter

# Label target dari dataset
Counter(labels_np)

Counter({np.int64(1): 45332, np.int64(0): 37000})

### Buat Environment RL 

In [26]:
import gym
from gym import spaces
import numpy as np

class NetworkEnv(gym.Env):
    def __init__(self, embeddings, labels):
        super(NetworkEnv, self).__init__()
        self.embeddings = embeddings
        self.labels = labels
        self.num_nodes = len(labels)
        self.current_node = 0

        # Observation = embedding (hasil GCN)
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(embeddings.shape[1],),
            dtype=np.float32
        )

        # Actions: 0 = Normal, 1 = Attack
        self.action_space = spaces.Discrete(2)

    def reset(self):
        self.current_node = 0
        return self.embeddings[self.current_node]

    def step(self, action):
        true_label = self.labels[self.current_node]

        # Imbalanced-sensitive reward function
        if action == true_label:
            reward = 2 if true_label == 1 else 1
        else:
            reward = -2 if true_label == 1 else -1

        self.current_node += 1
        done = self.current_node >= self.num_nodes

        if not done:
            next_state = self.embeddings[self.current_node]
        else:
            next_state = np.zeros_like(self.embeddings[0])

        return next_state, reward, done, {'label': true_label}

### Buat DQN Agent

In [27]:
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

### Latih Agen RL (Deep Q-Learning)

In [28]:
import torch.optim as optim
import random
import torch

# Hyperparameter
num_episodes = 10
gamma = 0.99
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.1
lr = 0.001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
env = NetworkEnv(gnn_embeddings, labels_np)
agent = DQN(input_dim=gnn_embeddings.shape[1], output_dim=2).to(device)
optimizer = optim.Adam(agent.parameters(), lr=lr)
loss_fn = nn.MSELoss()

reward_history = []

for episode in range(num_episodes):
    state = env.reset()
    state = torch.tensor(state, dtype=torch.float32).to(device)
    done = False
    total_reward = 0

    while not done:
        # Epsilon-greedy action
        if random.random() < epsilon:
            action = random.randint(0, 1)
        else:
            with torch.no_grad():
                q_values = agent(state)
                action = torch.argmax(q_values).item()

        next_state, reward, done, _ = env.step(action)
        total_reward += reward

        next_state_tensor = torch.tensor(next_state, dtype=torch.float32).to(device)
        reward_tensor = torch.tensor([reward], dtype=torch.float32).to(device)
        action_tensor = torch.tensor([action], dtype=torch.long).to(device)

        # Target Q-value
        with torch.no_grad():
            max_next_q = torch.max(agent(next_state_tensor))
            target_q = reward_tensor + (gamma * max_next_q)

        current_q = agent(state)[action_tensor]

        loss = loss_fn(current_q, target_q)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        state = next_state_tensor

    reward_history.append(total_reward)
    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")


Episode 0, Total Reward: -368, Epsilon: 0.995
Episode 1, Total Reward: 186, Epsilon: 0.990
Episode 2, Total Reward: 998, Epsilon: 0.985
Episode 3, Total Reward: 2450, Epsilon: 0.980
Episode 4, Total Reward: 2262, Epsilon: 0.975
Episode 5, Total Reward: 2276, Epsilon: 0.970
Episode 6, Total Reward: 3236, Epsilon: 0.966
Episode 7, Total Reward: 4438, Epsilon: 0.961
Episode 8, Total Reward: 5170, Epsilon: 0.956
Episode 9, Total Reward: 4824, Epsilon: 0.951


In [22]:
from sklearn.metrics import classification_report

agent.eval()
state = env.reset()
state = torch.tensor(state, dtype=torch.float32).to(device)
done = False
total_reward = 0

true_labels = []
predictions = []

step_index = 0  # Indeks node yang sedang dievaluasi

while not done:
    with torch.no_grad():
        q_values = agent(state)
        action = torch.argmax(q_values).item()

    next_state, reward, done, _ = env.step(action)
    total_reward += reward

    predictions.append(action)
    true_labels.append(labels_np[step_index])  # Ambil label dari array label asli
    step_index += 1

    state = torch.tensor(next_state, dtype=torch.float32).to(device)

print(f"Evaluation total reward: {total_reward}")

# Evaluasi klasifikasi
print(classification_report(true_labels, predictions, target_names=["Normal", "Attack"]))

Evaluation total reward: -8340
              precision    recall  f1-score   support

      Normal       0.45      1.00      0.62     37000
      Attack       0.42      0.00      0.00     45332

    accuracy                           0.45     82332
   macro avg       0.44      0.50      0.31     82332
weighted avg       0.43      0.45      0.28     82332



In [29]:
# Simpan model GCN ke file
torch.save(model.state_dict(), 'model_gcn.pt')

In [30]:
# Simpan model DQN
torch.save(agent.state_dict(), 'model_dqn.pt')