<a href="https://colab.research.google.com/github/shiva-prasad-maroju/hids-gru/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -n: never overwrite files that already exist, so re-running this cell does
# not stop at unzip's interactive "replace?" prompt.
!unzip -q -n ADFA-LD.zip
!ls ADFA-LD



Attack_Data_Master  Training_Data_Master  Validation_Data_Master


In [None]:
import os
from collections import Counter

DATASET_ROOT = "/content/ADFA-LD"

X_train = []
X_test = []
y_test = []

# os.listdir returns names in arbitrary order; every listing below is sorted
# so the order of the loaded traces is the same on every run.

# Training (benign)
train_dir = f"{DATASET_ROOT}/Training_Data_Master"
for fname in sorted(os.listdir(train_dir)):
    if fname.endswith(".txt"):
        with open(f"{train_dir}/{fname}") as f:
            # Each file holds whitespace-separated integer syscall ids.
            X_train.append(list(map(int, f.read().split())))

# Validation benign (label 0)
val_dir = f"{DATASET_ROOT}/Validation_Data_Master"
for fname in sorted(os.listdir(val_dir)):
    if fname.endswith(".txt"):
        with open(f"{val_dir}/{fname}") as f:
            X_test.append(list(map(int, f.read().split())))
            y_test.append(0)

# Attacks (nested, label 1)
attack_root = f"{DATASET_ROOT}/Attack_Data_Master"
for atk in sorted(os.listdir(attack_root)):
    atk_dir = f"{attack_root}/{atk}"
    if not os.path.isdir(atk_dir):
        continue
    for fname in sorted(os.listdir(atk_dir)):
        if fname.endswith(".txt"):
            with open(f"{atk_dir}/{fname}") as f:
                X_test.append(list(map(int, f.read().split())))
                y_test.append(1)

print("Train:", len(X_train))
print("Test:", Counter(y_test))


Train: 833
Test: Counter({0: 4372, 1: 746})


In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime.
print(torch.cuda.is_available())


True


In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter


In [None]:
DATASET_ROOT = "/content/ADFA-LD"


def load_traces(directory):
    """Read every ``.txt`` file directly inside ``directory``.

    Each file is parsed as whitespace-separated integers. Files are visited in
    sorted name order because ``os.listdir`` makes no ordering promise, which
    would otherwise make the sample order differ between runs.

    Args:
        directory: path of the folder to scan (not recursive).

    Returns:
        A list containing one list of ints per ``.txt`` file.
    """
    traces = []
    for name in sorted(os.listdir(directory)):
        if not name.endswith(".txt"):
            continue
        with open(f"{directory}/{name}") as fh:
            traces.append([int(tok) for tok in fh.read().split()])
    return traces


# Training (benign)
X_train = load_traces(f"{DATASET_ROOT}/Training_Data_Master")

# Validation benign (label 0)
X_test = load_traces(f"{DATASET_ROOT}/Validation_Data_Master")
y_test = [0] * len(X_test)

# Attacks (nested, label 1)
attack_root = f"{DATASET_ROOT}/Attack_Data_Master"
for atk in sorted(os.listdir(attack_root)):
    atk_dir = f"{attack_root}/{atk}"
    if not os.path.isdir(atk_dir):
        continue
    attack_traces = load_traces(atk_dir)
    X_test.extend(attack_traces)
    y_test.extend([1] * len(attack_traces))

print("Train:", len(X_train))
print("Test:", Counter(y_test))

# Keep references to the unpadded Python lists; the padding cell rebinds
# X_train / X_test to tensors.
X_train_raw = X_train
X_test_raw = X_test


Train: 833
Test: Counter({0: 4372, 1: 746})


In [None]:
# Fixed length that every trace is cut or zero-padded to.
MAX_LEN = 1300


def pad_np(seqs, maxlen):
    """Pack variable-length sequences into a dense int32 matrix.

    Row ``i`` holds the first ``maxlen`` items of ``seqs[i]``; any remaining
    columns stay 0.

    Returns:
        ``np.ndarray`` of shape ``(len(seqs), maxlen)`` and dtype ``int32``.
    """
    padded = np.zeros((len(seqs), maxlen), dtype=np.int32)
    # Iterating a 2-D array yields row views, so writes land in ``padded``.
    for row, tokens in zip(padded, seqs):
        clipped = tokens[:maxlen]
        row[:len(clipped)] = clipped
    return padded

# Pad from the untouched raw lists (X_train_raw / X_test_raw) rather than from
# X_train / X_test themselves: those two names are rebound to tensors right
# here, so reading from them would make the cell depend on how many times it
# has already been run.
X_train = torch.tensor(pad_np(X_train_raw, MAX_LEN), dtype=torch.long)
X_test  = torch.tensor(pad_np(X_test_raw, MAX_LEN), dtype=torch.long)
y_test  = torch.tensor(y_test, dtype=torch.long)

print(X_train.shape, X_test.shape)


torch.Size([833, 1300]) torch.Size([5118, 1300])


In [None]:
class GRULanguageModel(nn.Module):
    """Next-token model: embedding -> single GRU (batch_first) -> linear head.

    Token id 0 is the embedding's padding index.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super().__init__()
        # Attribute names are kept as-is: they are the state_dict key prefixes.
        self.embed = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=0,
        )
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.out = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map token ids ``x`` to one ``vocab_size``-wide logit vector per position."""
        # nn.GRU returns (per-step outputs, final hidden state); only the
        # per-step outputs feed the projection.
        hidden_states = self.gru(self.embed(x))[0]
        return self.out(hidden_states)


In [None]:
# Token id 0 is the padding value (pad_np fills with zeros and the embedding
# uses padding_idx=0), so targets equal to 0 are excluded from the loss.
# NOTE(review): the training cell assigns the same criterion again.
criterion = nn.CrossEntropyLoss(ignore_index=0)


In [None]:
# Seed the global torch RNG so weight initialisation and DataLoader shuffling
# repeat from run to run (GPU kernels may still add small nondeterminism).
SEED = 0
torch.manual_seed(SEED)

# Training hyperparameters.
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 1e-3

# The embedding / output size has to cover every id present in either split.
VOCAB_SIZE = int(max(X_train.max(), X_test.max())) + 1
device = "cuda" if torch.cuda.is_available() else "cpu"

model = GRULanguageModel(VOCAB_SIZE).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

loader = DataLoader(TensorDataset(X_train), batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(EPOCHS):
    model.train()
    total = 0.0
    for (x,) in loader:
        x = x.to(device)
        # Inputs are positions 0..T-2, targets the same sequence shifted by one.
        inputs, targets = x[:, :-1], x[:, 1:]

        optimizer.zero_grad()
        logits = model(inputs)       # predict next token
        loss = criterion(
            logits.reshape(-1, VOCAB_SIZE),
            targets.reshape(-1)
        )
        loss.backward()
        optimizer.step()
        total += loss.item()
    # Average of the per-batch mean losses for this epoch.
    print(f"Epoch {epoch+1} | Loss {total/len(loader):.4f}")

Epoch 1 | Loss 4.5448
Epoch 2 | Loss 2.5587
Epoch 3 | Loss 2.1149
Epoch 4 | Loss 1.9077
Epoch 5 | Loss 1.7583
Epoch 6 | Loss 1.6629
Epoch 7 | Loss 1.6058
Epoch 8 | Loss 1.5255
Epoch 9 | Loss 1.4631
Epoch 10 | Loss 1.4412


In [None]:
def sequence_nll(model, X, batch_size=32):
    """Score each padded sequence by its mean next-token negative log-likelihood.

    Args:
        model: module mapping a ``(batch, T)`` tensor of token ids to
            ``(batch, T, vocab)`` logits.
        X: 2-D integer tensor of token ids in which id 0 marks padding.
        batch_size: number of sequences per forward pass.

    Returns:
        1-D CPU float tensor with one score per row of ``X``. The score is the
        loss averaged over the non-padding targets of that row (0.0 for a row
        without any).
    """
    model.eval()
    # Run on the device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    dev = first_param.device if first_param is not None else torch.device("cpu")

    scores = []
    loader = DataLoader(TensorDataset(X), batch_size=batch_size)

    with torch.no_grad():
        for (x,) in loader:
            x = x.to(dev)
            targets = x[:, 1:]
            logits = model(x[:, :-1])

            loss = nn.functional.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1),
                ignore_index=0,
                reduction="none"
            ).view(x.size(0), -1)

            # With reduction="none" an ignored (padding) target contributes a
            # loss of 0. Dividing by the padded length would therefore shrink
            # the score of short sequences, so average over real tokens only.
            n_valid = (targets != 0).sum(dim=1).clamp(min=1)
            seq_loss = loss.sum(dim=1) / n_valid
            scores.append(seq_loss.cpu())

    if not scores:
        # Empty input: torch.cat([]) would raise.
        return torch.empty(0)
    return torch.cat(scores)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Whole-sequence NLL for the benign training traces and for the test traces.
train_scores = sequence_nll(model, X_train)
test_scores = sequence_nll(model, X_test)

# Flag a test trace when its score exceeds mean + k * std of the training
# scores; this cell reports a single setting, k = 2.0.
k = 2.0
score_mean, score_std = train_scores.mean(), train_scores.std()
thr = score_mean + k * score_std
preds = test_scores.gt(thr).long()

y_cpu, preds_cpu = y_test.cpu(), preds.cpu()
print(f"Metrics for k={k}:")
print(confusion_matrix(y_cpu, preds_cpu))
print(classification_report(y_cpu, preds_cpu))

Metrics for k=2.0:
[[3934  438]
 [ 591  155]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      4372
           1       0.26      0.21      0.23       746

    accuracy                           0.80      5118
   macro avg       0.57      0.55      0.56      5118
weighted avg       0.78      0.80      0.79      5118



In [None]:
# Imported once, ahead of the sweep, rather than on every loop iteration.
from sklearn.metrics import recall_score, precision_score

train_scores = sequence_nll(model, X_train)
test_scores  = sequence_nll(model, X_test)

# The training-score statistics do not depend on k, so compute them once.
score_mean = train_scores.mean()
score_std = train_scores.std()

# Sweep the threshold multiplier: threshold = mean + k * std.
for k in [0.5, 1.0, 1.5, 2.0]:
    thr = score_mean + k * score_std
    preds = (test_scores > thr).long()

    print(
        f"k={k} | recall={recall_score(y_test, preds):.3f} | "
        f"precision={precision_score(y_test, preds):.3f}"
    )


k=0.5 | recall=0.410 | precision=0.223
k=1.0 | recall=0.328 | precision=0.240
k=1.5 | recall=0.263 | precision=0.245
k=2.0 | recall=0.208 | precision=0.261


In [None]:
import numpy as np

def pad_np(seqs, maxlen):
    """Return an int32 array of shape ``(len(seqs), maxlen)``.

    Each row is the matching sequence cut to ``maxlen`` items and right-filled
    with zeros.

    NOTE(review): this re-defines ``pad_np`` with the same behaviour as the
    definition in the earlier padding cell.
    """
    n_rows = len(seqs)
    out = np.zeros((n_rows, maxlen), dtype=np.int32)
    for idx in range(n_rows):
        head = seqs[idx][:maxlen]
        out[idx, :len(head)] = head
    return out


In [None]:
import torch
import torch.nn.functional as F

def create_sliding_windows(seq, window, stride):
    """Return every full-length window of ``seq``.

    Windows start at 0, ``stride``, 2 * ``stride``, ...; only starts that leave
    room for ``window`` items are used, so a sequence shorter than ``window``
    yields an empty list and a trailing partial window is not produced.
    """
    return [seq[i:i + window] for i in range(0, len(seq) - window + 1, stride)]


def sliding_window_nll(
    model,
    sequences,          # list of raw syscall sequences
    window=80,
    stride=40,
    batch_size=64
):
    """Anomaly score per sequence: the highest mean next-token NLL of any window.

    Args:
        model: module mapping a ``(batch, T)`` tensor of token ids to
            ``(batch, T, vocab)`` logits.
        sequences: iterable of raw (unpadded) integer sequences.
        window: window length in tokens.
        stride: step between consecutive window starts.
        batch_size: maximum number of windows per forward pass.

    Returns:
        ``np.ndarray`` with one float score per input sequence. A sequence
        shorter than ``window`` is scored as one shorter window instead of
        being given a fixed 0.0; only sequences with fewer than two tokens
        (nothing to predict) score 0.0.
    """
    model.eval()
    # Run on the device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    dev = first_param.device if first_param is not None else torch.device("cpu")
    step = max(1, int(batch_size))
    scores = []

    with torch.no_grad():
        for seq in sequences:
            windows = create_sliding_windows(seq, window, stride)

            if len(windows) == 0:
                if len(seq) < 2:
                    # No (input, target) pair exists.
                    scores.append(0.0)
                    continue
                # Too short for a full window: treat the whole trace as one
                # window so short traces can still be flagged.
                windows = [seq]

            # All windows of one sequence have the same length, so they stack
            # into a dense matrix without padding.
            tokens = torch.tensor(np.asarray(windows), dtype=torch.long)

            worst = None
            for start in range(0, tokens.size(0), step):
                w = tokens[start:start + step].to(dev)
                targets = w[:, 1:]

                # language model: predict next syscall
                logits = model(w[:, :-1])

                loss = F.cross_entropy(
                    logits.reshape(-1, logits.size(-1)),
                    targets.reshape(-1),
                    ignore_index=0,
                    reduction="none"
                ).view(w.size(0), -1)

                # mean NLL per window, averaged over non-ignored targets only
                n_valid = (targets != 0).sum(dim=1).clamp(min=1)
                window_nll = loss.sum(dim=1) / n_valid

                chunk_max = window_nll.max().item()
                worst = chunk_max if worst is None else max(worst, chunk_max)

            # IMPORTANT: max window = anomaly score
            scores.append(worst)

    return np.array(scores)

In [None]:
# Score the raw, unpadded Python-list traces held in X_train_raw / X_test_raw
# with the same window settings for both splits.
window_cfg = {"window": 80, "stride": 40}

train_scores = sliding_window_nll(model, X_train_raw, **window_cfg)
test_scores = sliding_window_nll(model, X_test_raw, **window_cfg)


In [None]:
from sklearn.metrics import recall_score, precision_score

y_true = y_test.cpu().numpy()

# Threshold = mean + k * std of the training scores, for several k.
score_mean, score_std = train_scores.mean(), train_scores.std()

for k in (0.5, 1.0, 1.5):
    thr = score_mean + k * score_std
    preds = (test_scores > thr).astype(int)

    rec = recall_score(y_true, preds)
    prec = precision_score(y_true, preds)
    print(f"k={k} | " f"recall={rec:.3f} | " f"precision={prec:.3f}")


k=0.5 | recall=0.562 | precision=0.243
k=1.0 | recall=0.466 | precision=0.269
k=1.5 | recall=0.340 | precision=0.299
