In [1]:
import numpy as np
import pandas as pd
import csv
from torch.utils.data import DataLoader
import polars as pl
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm
from pathlib import Path

In [2]:
from skripsi_code.utils.domain_dataset import MultiChunkParquet, MultiChunkDataset
from skripsi_code.utils.dataloader import random_split_dataloader

In [3]:
# Names of the four datasets used throughout the notebook. They are passed to
# the dataset class both as the directory list and as the `domain` labels, and
# are later split by position into target (index 0) and source (the rest).
DATA_LIST = [
    "NF-BoT-IoT-v2",
    "NF-CSE-CIC-IDS2018-v2",
    "NF-ToN-IoT-v2",
    "NF-UNSW-NB15-v2",
]

In [4]:
# Build a single dataset over all four domains from the parquet export.
DATA_PATH = "../data/parquet/"
train_data = MultiChunkParquet(
    DATA_PATH,
    DATA_LIST,
    domain=DATA_LIST,  # the directory names double as the domain labels
    buffer_size=16,  # len(train_data.buffer) reports 16 right after construction
    chunk_mode=True,  # NOTE(review): presumably one item == one whole chunk; confirm in MultiChunkParquet
    get_domain=True,  # NOTE(review): presumably adds a domain-label array to each item; confirm
    get_cluster=True,  # NOTE(review): presumably adds a cluster-label array to each item; confirm
)

# Alternative data source kept for reference (interim data via MultiChunkDataset):
# DATA_PATH = "../data/interim/"
# train_data = MultiChunkDataset(DATA_PATH, DATA_LIST, domain=DATA_LIST, buffer_size=512, chunk_mode=True)

Data directories: ['NF-BoT-IoT-v2', 'NF-CSE-CIC-IDS2018-v2', 'NF-ToN-IoT-v2', 'NF-UNSW-NB15-v2']


Loading NF-BoT-IoT-v2: 100%|██████████| 378/378 [00:00<00:00, 2745.24it/s]
Loading NF-CSE-CIC-IDS2018-v2: 100%|██████████| 189/189 [00:00<00:00, 3201.81it/s]
Loading NF-ToN-IoT-v2: 100%|██████████| 170/170 [00:00<00:00, 3053.46it/s]
Loading NF-UNSW-NB15-v2: 100%|██████████| 24/24 [00:00<00:00, 3277.97it/s]


In [5]:
# Number of entries currently held in the dataset's buffer
# (the dataset was constructed with buffer_size=16).
len(train_data.buffer)

16

In [6]:
# Display the dataset object; only the default repr (type and address) is shown.
train_data

<skripsi_code.utils.domain_dataset.MultiChunkParquet at 0x760f9029fee0>

In [7]:
# Estimated size of the first buffered frame in GB, multiplied by 100.
# NOTE(review): the meaning of the factor 100 is not stated anywhere in this
# notebook (percent of 1 GB? size of 100 such frames?) — confirm and name it.
train_data.buffer[0].estimated_size("gb") * 100

1.0630395263433456

In [8]:
# Total number of items in the dataset. In the saved run this is 761, which
# equals the per-directory counts reported while loading (378 + 189 + 170 + 24).
len(train_data)

761

In [9]:
import torch

# Peek at the first buffered frame: pick the columns at positions 4..42
# (39 columns, PROTOCOL through FTP_COMMAND_RET_CODE in the saved output)
# and cast them to Float32.
train_data.buffer[0].select(pl.nth(range(4, 43))).cast(pl.Float32)

PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,DURATION_IN,DURATION_OUT,MIN_TTL,MAX_TTL,LONGEST_FLOW_PKT,SHORTEST_FLOW_PKT,MIN_IP_PKT_LEN,MAX_IP_PKT_LEN,SRC_TO_DST_SECOND_BYTES,DST_TO_SRC_SECOND_BYTES,RETRANSMITTED_IN_BYTES,RETRANSMITTED_IN_PKTS,RETRANSMITTED_OUT_BYTES,RETRANSMITTED_OUT_PKTS,SRC_TO_DST_AVG_THROUGHPUT,DST_TO_SRC_AVG_THROUGHPUT,NUM_PKTS_UP_TO_128_BYTES,NUM_PKTS_128_TO_256_BYTES,NUM_PKTS_256_TO_512_BYTES,NUM_PKTS_512_TO_1024_BYTES,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE
f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
6.0,7.0,140.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,140.0,0.0,140.0,140.0,0.0,0.0,0.0,0.0,0.0,1.12e6,0.0,0.0,1.0,0.0,0.0,0.0,512.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6.0,7.0,280.0,2.0,0.0,0.0,2.0,2.0,0.0,4.294467e6,500.0,0.0,64.0,64.0,140.0,140.0,0.0,140.0,280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,512.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6.0,7.0,280.0,2.0,0.0,0.0,2.0,2.0,0.0,4.29378e6,1187.0,0.0,64.0,64.0,140.0,140.0,0.0,140.0,140140.0,0.0,0.0,0.0,0.0,0.0,1.12e6,0.0,0.0,2.0,0.0,0.0,0.0,512.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17.0,188.0,56.0,2.0,0.0,0.0,0.0,0.0,0.0,4.294545e6,422.0,0.0,64.0,64.0,28.0,28.0,0.0,28.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17.0,188.0,56.0,2.0,0.0,0.0,0.0,0.0,0.0,4.293732e6,1235.0,0.0,64.0,64.0,28.0,28.0,0.0,28.0,2828.0,0.0,0.0,0.0,0.0,0.0,224000.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
17.0,188.0,84.0,3.0,0.0,0.0,0.0,0.0,0.0,4.29392e6,1047.0,0.0,64.0,64.0,28.0,28.0,0.0,28.0,5628.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17.0,188.0,56.0,2.0,0.0,0.0,0.0,0.0,0.0,4.29467e6,297.0,0.0,64.0,64.0,28.0,28.0,0.0,28.0,2828.0,0.0,0.0,0.0,0.0,0.0,224000.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6.0,7.0,140.0,1.0,40.0,1.0,22.0,2.0,20.0,4.294951e6,0.0,0.0,0.0,0.0,140.0,40.0,40.0,140.0,140.0,40.0,0.0,0.0,0.0,0.0,1.12e6,320000.0,1.0,1.0,0.0,0.0,0.0,512.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17.0,188.0,56.0,2.0,0.0,0.0,0.0,0.0,0.0,4.294529e6,438.0,0.0,64.0,64.0,28.0,28.0,0.0,28.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# NOTE(review): presumably resets/refills the buffer after the manual
# inspection above — confirm against MultiChunkParquet.reload_buffer.
train_data.reload_buffer()

In [11]:
# Fetch a single item by index. In the saved output it is a list of four
# elements: a float feature tensor, an int32 label tensor and two NumPy arrays
# (presumably the domain and cluster labels requested via get_domain /
# get_cluster — confirm).
train_data[1]

[tensor([[ 17., 188.,  84.,  ...,   0.,   0.,   0.],
         [  6.,   7., 280.,  ...,   0.,   0.,   0.],
         [  6.,   7., 280.,  ...,   0.,   0.,   0.],
         ...,
         [  6.,   7., 320.,  ...,   0.,   0.,   0.],
         [ 17., 188.,  56.,  ...,   0.,   0.,   0.],
         [ 17., 188.,  56.,  ...,   0.,   0.,   0.]]),
 tensor([1, 1, 1,  ..., 1, 1, 1], dtype=torch.int32),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0])]

In [12]:
# Wrap the dataset in a DataLoader. A batch size of 1 is used because a single
# dataset item already carries 100000 rows (see the batch shapes printed below).
train_dataloader = DataLoader(
    train_data,
    batch_size=1,
    shuffle=False,  # keep the dataset's own item order
    pin_memory=True,  # page-locked host memory for faster host-to-GPU copies
)

In [13]:
# Pull the first batch; anything after features and labels (the extra label
# arrays) is collected into `misc_labels`.
train_features, train_labels, *misc_labels = next(iter(train_dataloader))
print(
    f"Feature batch shape: {train_features.size()}",
    f"Labels batch shape: {train_labels.size()}",
    sep="\n",
)

Feature batch shape: torch.Size([1, 100000, 39])
Labels batch shape: torch.Size([1, 100000])


In [14]:
# Collapse the leading batch dimension so rows become the first axis;
# 39 is the number of feature columns per row.
train_features.reshape(-1, 39).size()

torch.Size([100000, 39])

In [15]:
# Same reshape as above, this time displaying the values instead of the shape.
train_features.reshape(-1, 39)

tensor([[  6.,   7., 140.,  ...,   0.,   0.,   0.],
        [  6.,   7., 280.,  ...,   0.,   0.,   0.],
        [  6.,   7., 280.,  ...,   0.,   0.,   0.],
        ...,
        [  6.,   7., 140.,  ...,   0.,   0.,   0.],
        [ 17., 188.,  56.,  ...,   0.,   0.,   0.],
        [ 17., 188.,  84.,  ...,   0.,   0.,   0.]])

In [16]:
# Number of batches; with batch_size=1 this matches len(train_data) (761).
len(train_dataloader)

761

In [17]:
# Disabled: one full pass over `train_dataloader` that only moves each feature
# batch to the GPU and does nothing else with it.
# if __name__ == "__main__":
#     for X, Y, *misc_labels in tqdm(train_dataloader, total=len(train_dataloader)):
#         X = X.to("cuda")
#         continue

In [18]:
# Show the raw first batch, still carrying its leading batch dimension of 1.
train_features

tensor([[[  6.,   7., 140.,  ...,   0.,   0.,   0.],
         [  6.,   7., 280.,  ...,   0.,   0.,   0.],
         [  6.,   7., 280.,  ...,   0.,   0.,   0.],
         ...,
         [  6.,   7., 140.,  ...,   0.,   0.,   0.],
         [ 17., 188.,  56.,  ...,   0.,   0.,   0.],
         [ 17., 188.,  84.,  ...,   0.,   0.,   0.]]])

In [19]:
# Same reload as earlier, reached through the loader's `.dataset` attribute.
# NOTE(review): presumably resets the buffer after the batch drawn above — confirm.
train_dataloader.dataset.reload_buffer()

In [20]:
# Leave-one-domain-out setup: the first dataset is held out as the target
# domain and the remaining three act as source domains.
source_domain = DATA_LIST[1:]
target_domain = DATA_LIST[0]
# Parquet export is the data source for the split.
DATA_PATH = "../data/parquet/"

# Build the three loaders. In the saved run the 383 source chunks are split
# 364/19 into train/val and all 378 target chunks form the test set.
train, val, test = random_split_dataloader(
    dir_path=DATA_PATH,
    source_dir=source_domain,
    target_dir=target_domain,
    source_domain=source_domain,
    target_domain=target_domain,
    batch_size=1,  # one dataset item already holds 100000 rows
    n_workers=0,  # NOTE(review): presumably forwarded as DataLoader num_workers — confirm
)

Data directories: ['NF-CSE-CIC-IDS2018-v2', 'NF-ToN-IoT-v2', 'NF-UNSW-NB15-v2']


Loading NF-CSE-CIC-IDS2018-v2: 100%|██████████| 189/189 [00:00<00:00, 3012.63it/s]
Loading NF-ToN-IoT-v2: 100%|██████████| 170/170 [00:00<00:00, 3452.95it/s]
Loading NF-UNSW-NB15-v2: 100%|██████████| 24/24 [00:00<00:00, 3074.63it/s]


Data directories: ['NF-BoT-IoT-v2']


Loading NF-BoT-IoT-v2: 100%|██████████| 378/378 [00:00<00:00, 3455.16it/s]


Train: 364, Val: 19, Test: 378


In [21]:
# First batch from the source-domain training loader; the trailing label
# arrays are gathered into `train_misc_labels`.
train_features, train_labels, *train_misc_labels = next(iter(train))
print(
    f"Feature batch shape: {train_features.size()}",
    f"Labels batch shape: {train_labels.size()}",
    sep="\n",
)

Feature batch shape: torch.Size([1, 100000, 39])
Labels batch shape: torch.Size([1, 100000])


In [22]:
# First batch from the validation loader; the trailing label arrays are
# gathered into `val_misc_labels`.
val_features, val_labels, *val_misc_labels = next(iter(val))
print(
    f"Feature batch shape: {val_features.size()}",
    f"Labels batch shape: {val_labels.size()}",
    sep="\n",
)

Feature batch shape: torch.Size([1, 100000, 39])
Labels batch shape: torch.Size([1, 100000])


In [23]:
# First batch from the target-domain test loader; the trailing label arrays
# are gathered into `test_misc_labels`.
test_features, test_labels, *test_misc_labels = next(iter(test))
print(
    f"Feature batch shape: {test_features.size()}",
    f"Labels batch shape: {test_labels.size()}",
    sep="\n",
)

Feature batch shape: torch.Size([1, 100000, 39])
Labels batch shape: torch.Size([1, 100000])


In [24]:
# Disabled: one full pass over each of the train / val / test loaders that
# only moves every feature batch to the GPU and does nothing else with it.
# if __name__ == "__main__":
#     for X, Y, *misc_labels in tqdm(train, total=len(train)):
#         X = X.to("cuda")
#         continue
#     for X, Y, *misc_labels in tqdm(val, total=len(val)):
#         X = X.to("cuda")
#         continue
#     for X, Y, *misc_labels in tqdm(test, total=len(test)):
#         X = X.to("cuda")
#         continue

In [25]:
# The train loader's `.dataset` is a wrapper (presumably the subset produced by
# the random split — confirm); its own `.dataset` is the underlying
# MultiChunkParquet, as the saved repr shows.
train.dataset.dataset

<skripsi_code.utils.domain_dataset.MultiChunkParquet at 0x760f5ff1bcd0>

In [26]:
from skripsi_code.model.MoMLNIDS import MoMLDNIDS
from torchsummary import summary

# Smoke-test the model on random input shaped like the real data: the dataset
# yields 39 feature columns per row, so the network must accept 39 inputs to be
# usable on those batches later in the notebook.
# NOTE(review): the saved output of this cell is an IndexError ("Dimension out
# of range ... got 1") whose origin is not visible from this notebook — confirm
# the cell runs cleanly and, if not, check MoMLDNIDS.forward / torchsummary.
x = torch.randn(1000, 39).to("cuda")  # 1000 random rows, 39 features each
network = MoMLDNIDS(
    input_nodes=x.size(dim=1),  # input width taken from the sample tensor
    hidden_nodes=[64, 32, 16],
    num_domains=3,  # three source domains
    num_class=2,
).to("cuda")
print(network)
print(network(x))
summary(network, (x.size(dim=1),))

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [27]:
# Current cluster labels stored on the underlying source dataset
# (all zeros in the visible part of the saved output).
train.dataset.dataset.cluster_label

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
import skripsi_code.clustering

In [29]:
from skripsi_code.clustering import cluster_utils

In [31]:
from skripsi_code.clustering.cluster_utils import pseudolabeling

# Run pseudo-domain labelling on the source-domain data with the current model.
# NOTE(review): the saved run prints (100000, 32) and then raises
# "TypeError: 'int' object is not callable"; the failing call is not visible in
# this notebook — check pseudolabeling and the arguments passed below.
source_data = train.dataset.dataset  # underlying MultiChunkParquet behind the train loader
pseudo_domain_label = pseudolabeling(
    dataset=source_data,
    model=network,
    device="cuda",
    previous_cluster=source_data.cluster_label,  # labels currently stored on the dataset
    log_file="clustering.txt",
    epoch=1,
    n_clusters=3,  # same count as the three source domains
    method="MiniK",  # NOTE(review): presumably selects MiniBatchKMeans — confirm
    data_reduction=False,
    reduced_dimentions=48,  # keyword spelling as used by this call; NOTE(review): presumably unused when data_reduction=False — confirm
    batch_size=1,
)

(100000, 32)


TypeError: 'int' object is not callable