## Імпорт необхідних бібліотек

In [None]:

import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

# Show which PyTorch build is installed for this run.
print("Torch:", torch.__version__)


Torch: 2.9.0+cu126


## Завантаження і merge всіх .csv файлів

In [None]:
import pandas as pd
import glob

# Collect every event export matching the pattern; sorting the paths keeps the
# concatenation order deterministic across runs.
files = sorted(glob.glob("/content/events_202512*.csv"))
if not files:
    # pd.concat([]) would otherwise fail with an unhelpful
    # "No objects to concatenate" message.
    raise FileNotFoundError(
        "No input files match /content/events_202512*.csv"
    )

dfs = [pd.read_csv(f) for f in files]

merged = pd.concat(dfs, ignore_index=True)

# format="mixed" lets pandas infer the format of each value individually,
# so rows with differently formatted timestamps still parse.
merged["timestamp"] = pd.to_datetime(
    merged["timestamp"],
    format="mixed"
)

# Put the events in chronological order and renumber the rows 0..n-1.
merged = merged.sort_values("timestamp").reset_index(drop=True)

print("Total rows:", len(merged))

OUT_PATH = "/content/SORTED_24hours.csv"

# index=False: the freshly reset index is not written as a column.
merged.to_csv(OUT_PATH, index=False)

print("Saved sorted dataset to:", OUT_PATH)


Total rows: 103977
Saved sorted dataset to: /content/SORTED_24hours.csv


In [None]:
CSV_PATH = "/content/SORTED_24hours.csv"

# Load the merged log and turn the textual timestamps back into datetimes
# (a CSV round-trip stores them as plain strings).
df = pd.read_csv(CSV_PATH).assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"], format="mixed")
)

# NOTE(review): src_ip is counted over all rows, so this figure presumably
# includes remote hosts that appear as the source of inbound events.
n_rows = len(df)
n_sources = df["src_ip"].nunique()
print("Rows:", n_rows)
print("Active users:", n_sources)

df.head()


Rows: 103977
Active users: 1175


Unnamed: 0,id,timestamp,direction,src_ip,dst_ip,src_port,dst_port,protocol,src_mac,dst_mac,...,dns_qname,dns_answer_ip,tls_sni,ndpi_master_proto,ndpi_app_proto,ndpi_category,dst_geo_country,dst_geo_city,decision_action,decision_rule_id
0,248798,2025-12-08 18:00:00.053988,outbound,10.20.0.110,10.20.0.1,36038,53,udp,bc:24:11:18:89:6b,bc:24:11:e4:6f:c6,...,,,,Unknown,DNS,Network,,,allow,
1,248799,2025-12-08 18:00:00.058020,outbound,10.20.0.1,10.20.0.110,53,36038,udp,bc:24:11:e4:6f:c6,bc:24:11:18:89:6b,...,,,,Unknown,DNS,Network,,,allow,
2,248800,2025-12-08 18:00:00.061712,outbound,10.20.0.110,34.117.59.81,43250,443,tcp,bc:24:11:18:89:6b,bc:24:11:e4:6f:c6,...,,,ipinfo.io,Unknown,TLS,Web,US,Kansas City,allow,
3,248801,2025-12-08 18:00:00.065000,inbound,34.117.59.81,10.20.0.110,443,43250,tcp,bc:24:11:e4:6f:c6,bc:24:11:18:89:6b,...,,,,Unknown,TLS,Web,,,allow,
4,248802,2025-12-08 18:00:00.069241,outbound,10.20.0.110,151.101.2.132,37600,443,tcp,bc:24:11:18:89:6b,bc:24:11:e4:6f:c6,...,,,kafka.apache.org,Unknown,TLS,Web,US,,allow,


## Фільтрація подій: лише вихідний трафік із TLS SNI

In [None]:
# Keep only outbound events that carry a TLS SNI (the visited host name).
# Rows with a missing SNI, or with the literal placeholder "NO_SNI", are dropped.
is_outbound = df["direction"] == "outbound"
has_sni = df["tls_sni"].notna() & (df["tls_sni"] != "NO_SNI")

# .copy() makes the result an independent frame, so columns can be assigned
# on it without pandas emitting SettingWithCopyWarning.
df = df[is_outbound & has_sni].copy()

print("Events after filter:", len(df))
print("Users:", df["src_ip"].nunique())


Events after filter: 30132
Users: 5


## LabelEncoder

In [None]:
# Map every distinct SNI host name to an integer id; `le` is kept so the ids
# can be translated back to host names later.
le = LabelEncoder()
le.fit(df["tls_sni"])
df["sni_id"] = le.transform(df["tls_sni"])

n_sites = len(le.classes_)
print("Unique websites:", n_sites)


Unique websites: 87


In [None]:
WINDOW = 8

X = []
Y = []

# Build the training pairs per source IP: each run of WINDOW consecutive site
# ids is an input, and the id that follows it is the label.
chronological = df.sort_values("timestamp")

for _, visits in chronological.groupby("src_ip"):
    site_ids = visits["sni_id"].to_numpy()

    # range() is empty when the history has WINDOW or fewer events,
    # so short histories contribute nothing.
    n_windows = len(site_ids) - WINDOW

    for start in range(n_windows):
        stop = start + WINDOW
        X.append(site_ids[start:stop])
        Y.append(site_ids[stop])

X = np.array(X)
Y = np.array(Y)

print("Sequences:", X.shape)
print("Labels:", Y.shape)


Sequences: (30096, 8)
Labels: (30096,)


In [None]:
from collections import Counter
min_samples = 2

# Count how often each site id occurs as a label and remember the ids that
# occur at least `min_samples` times; a stratified split needs at least two
# samples of every class.
counter = Counter(Y)
frequent_labels = {
    label for label, seen in counter.items() if seen >= min_samples
}

keep_mask = np.array([y in frequent_labels for y in Y])

X, Y = X[keep_mask], Y[keep_mask]

print("Sequences after filtering:", X.shape)

# NOTE(review): the windows are built with stride 1, so neighbouring windows
# overlap; a random split can therefore place near-duplicates of training
# windows in the test set. Consider a chronological split — TODO confirm.
split_options = dict(test_size=0.2, random_state=42, stratify=Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, **split_options)


Sequences after filtering: (30095, 8)


## Побудова класу датасету і LSTM моделі

In [None]:
class SeqDataset(Dataset):
    """Dataset that pairs input sequences with their labels as int64 tensors.

    Args:
        X: Array-like of input sequences, one sequence per row.
        y: Array-like of labels, one per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

        # Fail early: a mismatch would otherwise only surface as an IndexError
        # (or as silently ignored labels) while iterating.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of (sequence, label) pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]


In [None]:
class NextSiteLSTM(nn.Module):
    """Embedding -> LSTM -> linear head producing one score per vocabulary id.

    Args:
        vocab_size: Number of distinct ids; used for both the embedding table
            and the width of the output layer.
        embed_dim: Size of each id embedding.
        hidden: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_dim=32, hidden=64):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised scores for a batch of id sequences.

        ``x`` is laid out batch-first (the LSTM is built with
        ``batch_first=True``); the result has one row per sequence and
        ``vocab_size`` columns.
        """
        embedded = self.embedding(x)
        outputs, _state = self.lstm(embedded)
        # Only the LSTM output at the final time step feeds the classifier.
        last_step = outputs[:, -1]
        return self.fc(last_step)


In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output class per encoded site.
vocab_size = len(le.classes_)
model = NextSiteLSTM(vocab_size).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

train_dataset = SeqDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

EPOCHS = 20

for ep in range(EPOCHS):

    model.train()
    batch_losses = []

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        batch_loss = loss_fn(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    # Report the mean mini-batch loss of the epoch.
    print(f"Epoch {ep+1}/{EPOCHS} | loss={np.mean(batch_losses):.4f}")


Epoch 1/20 | loss=2.4791
Epoch 2/20 | loss=1.8522
Epoch 3/20 | loss=1.7307
Epoch 4/20 | loss=1.6878
Epoch 5/20 | loss=1.6691
Epoch 6/20 | loss=1.6564
Epoch 7/20 | loss=1.6475
Epoch 8/20 | loss=1.6393
Epoch 9/20 | loss=1.6341
Epoch 10/20 | loss=1.6278
Epoch 11/20 | loss=1.6225
Epoch 12/20 | loss=1.6159
Epoch 13/20 | loss=1.6133
Epoch 14/20 | loss=1.6045
Epoch 15/20 | loss=1.6004
Epoch 16/20 | loss=1.5933
Epoch 17/20 | loss=1.5897
Epoch 18/20 | loss=1.5821
Epoch 19/20 | loss=1.5763
Epoch 20/20 | loss=1.5697


## Метрики

In [None]:

# Switch to inference mode and score the whole test set in a single pass.
model.eval()

with torch.no_grad():
    X_test_t = torch.tensor(X_test, dtype=torch.long).to(device)
    logits = model(X_test_t)
    probs = logits.softmax(dim=1)

# Most likely class per row, and the three most likely classes per row,
# moved to NumPy for the metric code.
preds1 = torch.argmax(probs, dim=1).cpu().numpy()
preds3 = torch.topk(probs, k=3, dim=1).indices.cpu().numpy()


In [None]:
# Fraction of test windows whose single best prediction is the true next site.
acc = accuracy_score(y_true=y_test, y_pred=preds1)
print("Top-1 accuracy:", round(acc, 4))


Top-1 accuracy: 0.489


In [None]:
# A row is a hit when its true label appears among its three predicted ids.
# Comparing against a column vector checks all rows at once.
true_column = np.asarray(y_test)[:, None]
row_hits = (preds3 == true_column).any(axis=1)
top3 = int(row_hits.sum())

top3_acc = top3 / len(y_test)
print("Top-3 accuracy:", round(top3_acc, 4))


Top-3 accuracy: 0.6847


In [None]:
print("\n=== REAL EXAMPLES ===")

# Show at most ten test windows. Slicing (instead of indexing 0..9) keeps this
# from raising IndexError when the test set holds fewer than ten rows.
N_EXAMPLES = 10

for hist, true_id, top3_ids in zip(
    X_test[:N_EXAMPLES], y_test[:N_EXAMPLES], preds3[:N_EXAMPLES]
):
    # Translate the encoded ids back to host names.
    true_site = le.inverse_transform([true_id])[0]
    pred_sites = le.inverse_transform(top3_ids)
    hist_sites = le.inverse_transform(hist)

    print("\nHistory:")
    # Only the four most recent sites of the window are printed.
    print(" → ".join(hist_sites[-4:]))

    print("TRUE next site:", true_site)
    print("Predicted:", list(pred_sites))



=== REAL EXAMPLES ===

History:
archive.org → httpbingo.org → picsum.photos → www.rust-lang.org
TRUE next site: ipinfo.io
Predicted: ['httpbin.org', 'httpbingo.org', 'www.wikipedia.org']

History:
www.mozilla.org → httpbin.org → www.rfc-editor.org → www.rfc-editor.org
TRUE next site: ipinfo.io
Predicted: ['www.mozilla.org', 'httpbin.org', 'www.debian.org']

History:
curl.se → www.openssl.org → www.mozilla.org → cdn.kernel.org
TRUE next site: httpd.apache.org
Predicted: ['www.rfc-editor.org', 'www.mozilla.org', 'developer.mozilla.org']

History:
registry-1.docker.io → dl-cdn.alpinelinux.org → www.hashicorp.com → quay.io
TRUE next site: rpmfind.net
Predicted: ['dl-cdn.alpinelinux.org', 'repos.fedorapeople.org', 'rpmfind.net']

History:
httpbin.org → dask.org → s3.amazonaws.com → pypi.org
TRUE next site: httpbingo.org
Predicted: ['httpbin.org', 'httpbingo.org', 'worldtimeapi.org']

History:
picsum.photos → picsum.photos → picsum.photos → speed.hetzner.de
TRUE next site: picsum.photos
Pre

In [None]:
def top_k_accuracy(y_true, probs, k=10):
    """Return the fraction of rows whose true label is among the top-k scores.

    Args:
        y_true: 1-D array-like of integer class labels, one per row of ``probs``.
        probs: 2-D torch tensor of per-class scores (rows are samples).
        k: Number of highest-scoring classes to consider per row. Values larger
            than the number of classes are clamped, because ``Tensor.topk``
            raises when asked for more entries than the dimension holds.

    Returns:
        Hit rate as a float in ``[0, 1]``.
    """
    y_true = np.asarray(y_true)
    k = min(k, probs.shape[1])

    topk = probs.topk(k, dim=1).indices.cpu().numpy()

    # Compare every row's candidate ids with its label in one vectorized step.
    hits = int((topk == y_true[:, None]).any(axis=1).sum())

    return hits / len(y_true)
from sklearn.metrics import accuracy_score

# Top-1 comes from the arg-max prediction; the wider cut-offs reuse the helper.
best_guess = probs.argmax(1).cpu().numpy()
top1 = accuracy_score(y_test, best_guess)
top3, top5, top10 = (
    top_k_accuracy(y_test, probs, k=cutoff) for cutoff in (3, 5, 10)
)

report = (
    ("Top-1 Accuracy :", top1),
    ("Top-3 Accuracy :", top3),
    ("Top-5 Accuracy :", top5),
    ("Top-10 Accuracy:", top10),
)
for caption, score in report:
    print(caption, round(score, 4))


Top-1 Accuracy : 0.489
Top-3 Accuracy : 0.6847
Top-5 Accuracy : 0.7714
Top-10 Accuracy: 0.8261
