In [36]:
import numpy as np
import pandas as pd
import csv
from torch.utils.data import DataLoader
import polars as pl
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm
from pathlib import Path
import torch

In [37]:
from skripsi_code.utils.domain_dataset import MultiChunkParquet, MultiChunkDataset
from skripsi_code.utils.dataloader import random_split_dataloader

In [38]:
# Names of the four NetFlow-v2 datasets used in this notebook. The order matters:
# a later cell takes element 0 as the target domain and the rest as source domains.
DATA_LIST = [
    "NF-BoT-IoT-v2",  # index 0
    "NF-CSE-CIC-IDS2018-v2",  # index 1
    "NF-ToN-IoT-v2",  # index 2
    "NF-UNSW-NB15-v2",  # index 3
]

In [39]:
# Hold out the first dataset as the target domain; every remaining dataset is a
# source domain (star-unpacking yields a list for the remainder).
target_domain, *source_domain = DATA_LIST
DATA_PATH = "../data/parquet/"

# The dataset names are passed both as directory names and as domain names.
# NOTE(review): each loader item appears to be a whole pre-chunked block of rows
# (the batches printed below are [1, 100000, 39]), which would explain
# batch_size=1 — confirm against random_split_dataloader.
train, val, test = random_split_dataloader(
    dir_path=DATA_PATH,
    # source side
    source_dir=source_domain,
    source_domain=source_domain,
    # target side
    target_dir=target_domain,
    target_domain=target_domain,
    # loader settings
    batch_size=1,
    n_workers=0,
)

Data directories: ['NF-CSE-CIC-IDS2018-v2', 'NF-ToN-IoT-v2', 'NF-UNSW-NB15-v2']


Loading NF-CSE-CIC-IDS2018-v2: 100%|██████████| 187/187 [00:00<00:00, 5022.09it/s]
Loading NF-ToN-IoT-v2: 100%|██████████| 169/169 [00:00<00:00, 5507.03it/s]
Loading NF-UNSW-NB15-v2: 100%|██████████| 23/23 [00:00<00:00, 4736.77it/s]


Data directories: ['NF-BoT-IoT-v2']


Loading NF-BoT-IoT-v2: 100%|██████████| 377/377 [00:00<00:00, 5252.18it/s]


Train: 361, Val: 18, Test: 377


In [40]:
# Draw a single batch from the training loader and report its tensor shapes.
# Anything the loader yields after (features, labels) lands in train_misc_labels.
train_batch = next(iter(train))
train_features, train_labels, *train_misc_labels = train_batch
print(f"Feature batch shape: {train_features.shape}")
print(f"Labels batch shape: {train_labels.shape}")

Feature batch shape: torch.Size([1, 100000, 39])
Labels batch shape: torch.Size([1, 100000])


In [6]:
# Draw a single batch from the validation loader and report its tensor shapes.
# Anything the loader yields after (features, labels) lands in val_misc_labels.
val_batch = next(iter(val))
val_features, val_labels, *val_misc_labels = val_batch
print(f"Feature batch shape: {val_features.shape}")
print(f"Labels batch shape: {val_labels.shape}")

Feature batch shape: torch.Size([1, 100000, 39])
Labels batch shape: torch.Size([1, 100000])


In [7]:
# Draw a single batch from the test (target-domain) loader and report its shapes.
# Anything the loader yields after (features, labels) lands in test_misc_labels.
test_batch = next(iter(test))
test_features, test_labels, *test_misc_labels = test_batch
print(f"Feature batch shape: {test_features.shape}")
print(f"Labels batch shape: {test_labels.shape}")

Feature batch shape: torch.Size([1, 100000, 39])
Labels batch shape: torch.Size([1, 100000])


In [8]:
# Number of rows in the sampled training batch.
# Only the leading DataLoader batch axis (size 1 here) is removed: a bare
# squeeze() would also drop any other size-1 axis (e.g. a one-row chunk) and
# size(0) would then silently report a different dimension.
train_features.squeeze(0).size(0)

100000

In [9]:
# Shape of the per-row variance taken across the feature axis (dim=1 after the
# batch axis is removed). squeeze(0) drops only the leading batch axis; a bare
# squeeze() could also collapse another size-1 axis and make dim=1 refer to the
# wrong axis (or not exist at all).
train_features.squeeze(0).var(dim=1).shape

torch.Size([100000])

In [10]:
# if __name__ == "__main__":
#     for X, Y, *misc_labels in tqdm(train, total=len(train)):
#         X = X.to("cuda")
#         continue
#     for X, Y, *misc_labels in tqdm(val, total=len(val)):
#         X = X.to("cuda")
#         continue
#     for X, Y, *misc_labels in tqdm(test, total=len(test)):
#         X = X.to("cuda")
#         continue

In [11]:
# Unwrap the training loader down to the object behind it; the displayed repr
# is a MultiChunkParquet instance.
# NOTE(review): the intermediate `train.dataset` is presumably the wrapper created
# by the random split — confirm in random_split_dataloader.
train.dataset.dataset

<skripsi_code.utils.domain_dataset.MultiChunkParquet at 0x7effbd709f30>

In [12]:
from skripsi_code.model.MoMLNIDS import MoMLDNIDS
from torchsummary import summary
# Smoke-test the network on random input shaped like the real data:
# 1000 samples with 39 features each (39 is the feature width of the batches
# printed earlier in this notebook).
# Fall back to the CPU when no GPU is present so the cell does not crash there.
device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(1000, 39).to(device)  # batch of 1000 samples, 39 features each
network = MoMLDNIDS(
    input_nodes=x.size(dim=1), hidden_nodes=[64, 32, 16], num_domains=3, num_class=2
).to(device)
print(network)
print(network(x))
# torchsummary assumes "cuda" unless told otherwise, so pass the device explicitly.
summary(network, (x.size(dim=1),), device=device)


MoMLDNIDS(
  (FeatureExtractorLayer): DGFeatExt(
    (fc_modules): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=39, out_features=64, bias=True)
        (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (1): Sequential(
        (0): Linear(in_features=64, out_features=32, bias=True)
        (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (2): Sequential(
        (0): Linear(in_features=32, out_features=16, bias=True)
        (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
    )
  )
  (DomainClassifier): DomainDiscriminator(
    (fc_modules): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=16, out_features=64, bias=True)
        (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): D

In [13]:
# Peek at the cluster labels currently stored on the underlying dataset
# (the displayed preview is all zeros). A later cell passes this array to
# pseudolabeling as `previous_cluster`.
train.dataset.dataset.cluster_label

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
from skripsi_code.clustering.cluster_utils import pseudolabeling

# Run one round of pseudo-domain labelling on the dataset behind the train loader.
source_data = train.dataset.dataset

# Clustering settings, kept together so they are easy to tweak.
# NOTE(review): data_reduction is False, so reduced_dimentions is presumably
# ignored by pseudolabeling — confirm in cluster_utils.
clustering_options = dict(
    n_clusters=3,
    method="MiniK",
    data_reduction=False,
    reduced_dimentions=48,
    batch_size=1,
)

pseudo_domain_label = pseudolabeling(
    dataset=source_data,
    model=network,
    device="cuda",
    previous_cluster=source_data.cluster_label,
    log_file="clustering.log",
    epoch=1,
    **clustering_options,
)

In [15]:
# Inspect row 34244 of the first buffered chunk, keeping the columns at positions
# 4..42 (39 columns). The displayed values are still strings; note the
# SRC_TO_DST_SECOND_BYTES entry of 8.888888888888893e+203 in the output.
source_data.buffer[0][34244].select(pl.nth(range(4, 43)))


PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,DURATION_IN,DURATION_OUT,MIN_TTL,MAX_TTL,LONGEST_FLOW_PKT,SHORTEST_FLOW_PKT,MIN_IP_PKT_LEN,MAX_IP_PKT_LEN,SRC_TO_DST_SECOND_BYTES,DST_TO_SRC_SECOND_BYTES,RETRANSMITTED_IN_BYTES,RETRANSMITTED_IN_PKTS,RETRANSMITTED_OUT_BYTES,RETRANSMITTED_OUT_PKTS,SRC_TO_DST_AVG_THROUGHPUT,DST_TO_SRC_AVG_THROUGHPUT,NUM_PKTS_UP_TO_128_BYTES,NUM_PKTS_128_TO_256_BYTES,NUM_PKTS_256_TO_512_BYTES,NUM_PKTS_512_TO_1024_BYTES,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1""","""0.0""","""8976""","""102""","""0""","""0""","""0""","""0""","""0""","""4193971""","""100996""","""0""","""64""","""64""","""88""","""88""","""0""","""88""","""8.888888888888893e+203""","""0.0""","""0""","""0""","""0""","""0""","""704000""","""0""","""102""","""0""","""0""","""0""","""0""","""0""","""0""","""771""","""3""","""0""","""0""","""0""","""0"""


In [16]:
# Same row and column positions as the raw inspection above, but cast to float32
# and converted to a tensor. The ~8.9e+203 value seen in the raw row does not fit
# in float32, which is why the displayed tensor contains `inf`.
raw_row = source_data.buffer[0][34244].select(pl.nth(range(4, 43)))
raw_row_f32 = raw_row.with_columns(pl.col("*").cast(pl.Float32))
torch.tensor(raw_row_f32.to_numpy())

tensor([[1.0000e+00, 0.0000e+00, 8.9760e+03, 1.0200e+02, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 4.1940e+06, 1.0100e+05, 0.0000e+00,
         6.4000e+01, 6.4000e+01, 8.8000e+01, 8.8000e+01, 0.0000e+00, 8.8000e+01,
                inf, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         7.0400e+05, 0.0000e+00, 1.0200e+02, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 7.7100e+02, 3.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00]])

In [17]:
# The same row fetched through the dataset's own indexing: item 0, first element
# of that item, row 34244.
# NOTE(review): the position that is `inf` in the manual float32 cast shows
# 1.0000e+05 here, so the dataset presumably clips or replaces non-finite
# values — confirm in MultiChunkParquet.
source_data[0][0][34244]

tensor([1.0000e+00, 0.0000e+00, 8.9760e+03, 1.0200e+02, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 4.1940e+06, 1.0100e+05, 0.0000e+00,
        6.4000e+01, 6.4000e+01, 8.8000e+01, 8.8000e+01, 0.0000e+00, 8.8000e+01,
        1.0000e+05, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        7.0400e+05, 0.0000e+00, 1.0200e+02, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 7.7100e+02, 3.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00])

In [18]:
import numpy as np
# Load a previously saved array from the current working directory.
# NOTE(review): nothing in the visible part of this notebook writes
# statistics.npy — presumably produced by a separate run; confirm provenance.
stats = np.load("statistics.npy")

In [19]:
# First ten distinct first-axis indices of `stats` that contain at least one NaN.
nan_rows = np.where(np.isnan(stats))[0]
np.unique(nan_rows)[:10]

array([ 34244,  94479, 107979, 178470, 209632, 212530, 344103, 379134,
       473575, 579339])

In [20]:
# How many distinct first-axis indices of `stats` contain at least one NaN.
nan_row_ids = np.unique(np.where(np.isnan(stats))[0])
len(nan_row_ids)

383

In [21]:
# Total number of NaN entries in `stats`, counted over every element.
np.isnan(stats).sum()

np.int64(1532)

In [22]:
# Path of the parquet chunk stored at position 89 of the dataset's file list.
# NOTE(review): the displayed path contains "parquet//", presumably because
# DATA_PATH already ends with "/" and the loader adds another separator — confirm.
source_data.parquet_files[89]

'../data/parquet//NF-CSE-CIC-IDS2018-v2/NF-CSE-CIC-IDS2018-v2_chunk_180.parquet'

In [23]:
# Index every item of the validation dataset once to surface items that fail to
# load. A failure is reported together with the exception itself (previously the
# exception was caught but never shown), and the loop then moves on so that all
# failing indices are listed in a single pass.
for i in tqdm(range(len(val.dataset))):
    try:
        val.dataset[i]
    except Exception as e:
        print(f"Error on {i}: {e!r}")
        

100%|██████████| 19/19 [00:00<00:00, 55.43it/s]


In [25]:
# Value of the dataset's `length` attribute.
# NOTE(review): presumably the total row count across all source chunks
# (38,124,479 in the displayed output) — confirm in MultiChunkParquet.
source_data.length

38124479

# Test Training 

In [35]:
# NOTE(review): this import rebinds `train`, which earlier cells use for the
# training DataLoader returned by random_split_dataloader (e.g.
# `train.dataset.dataset`), and `eval` shadows the Python builtin. Re-running
# those earlier cells after this one will see the function instead of the loader.
# Consider importing under aliases and updating the call sites together.
from skripsi_code.TrainEval.TrainEval import train, eval

In [None]:
 # Run the imported `train` function and rebind `model` and `optimizers` to its
 # return values.
 # NOTE(review): `model`, `source_train`, `device`, `epoch`, `NUM_EPOCH`,
 # `SAVE_PATH`, `DISCRIMINATOR_LOSS_WEIGHT`, `ENTROPY_WEIGHT` and `GRL_WEIGHT`
 # are not defined in the visible part of this notebook, and the cell has no
 # execution count — it looks pasted from a training script; define these
 # before running.
 model, optimizers = train(
            model=model,
            train_data=source_train,
            optimizers=optimizers,
            device=device,
            epoch=epoch,
            num_epoch=NUM_EPOCH,
            filename=SAVE_PATH / "source_trained.log",
            disc_weight=DISCRIMINATOR_LOSS_WEIGHT,
            entropy_weight=ENTROPY_WEIGHT,
            grl_weight=GRL_WEIGHT,
        )

In [None]:
# NOTE(review): `tra` is not defined anywhere in the visible notebook; this looks
# like an unfinished expression and would raise NameError under Run All —
# confirm and remove.
tra

In [46]:
import torch
import torch.nn as nn

# Toy example: raw logits for 2 samples over 3 classes, plus their true classes.
preds = torch.tensor(
    [
        [2.0, 1.0, 0.1],  # sample 0
        [1.0, 3.0, 0.1],  # sample 1
    ]
)
labels = torch.tensor([0, 1])

# With reduction="mean" (the default) the per-sample losses are averaged,
# so the result is a single scalar tensor.
criterion = nn.CrossEntropyLoss(reduction="mean")
loss = criterion(preds, labels)
print(loss)


tensor(0.2956)


In [49]:
# Second toy example: 3 samples, 2 classes, all labelled class 0. The two logits
# of each sample are nearly equal, so the averaged loss sits close to ln(2).
preds = torch.tensor(
    [
        [0.0599, 0.0290],
        [0.0574, 0.0215],
        [0.0351, 0.0483],
    ]
)
labels = torch.tensor([0, 0, 0])

# Mean-reduced cross entropy: one scalar for the whole batch.
criterion = nn.CrossEntropyLoss(reduction="mean")
loss = criterion(preds, labels)
print(loss)


tensor(0.6843)


In [52]:
from skripsi_code.utils.utils import get_model_learning_rate, get_optimizer

from skripsi_code.model.MoMLNIDS import MoMLDNIDS


In [50]:
# Smoke-test the network on random input: 1000 samples with 43 features each.
# NOTE(review): the real batches loaded earlier have 39 features, so this
# instance is sized differently from the data — confirm 43 is intentional.
# Fall back to the CPU when no GPU is present so the cell does not crash there.
device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(1000, 43).to(device)  # batch of 1000 samples, 43 features each
network = MoMLDNIDS(
    input_nodes=x.size(dim=1), hidden_nodes=[64, 32, 16], num_domains=3, num_class=2
).to(device)
print(network)
print(network(x))
# torchsummary assumes "cuda" unless told otherwise, so pass the device explicitly.
summary(network, (x.size(dim=1),), device=device)


MoMLDNIDS(
  (FeatureExtractorLayer): DGFeatExt(
    (fc_modules): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=43, out_features=64, bias=True)
        (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (1): Sequential(
        (0): Linear(in_features=64, out_features=32, bias=True)
        (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (2): Sequential(
        (0): Linear(in_features=32, out_features=16, bias=True)
        (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
    )
  )
  (DomainClassifier): DomainDiscriminator(
    (fc_modules): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=16, out_features=64, bias=True)
        (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): D

In [55]:
# Ask the project helper for the per-module learning-rate configuration. The next
# cell iterates the result as (module, alpha) pairs.
# NOTE(review): the meaning of the two positional `1` arguments is not visible
# here — presumably learning-rate multipliers; confirm in skripsi_code.utils.utils.
mlr = get_model_learning_rate(network, 1, 1)

In [56]:
# Build one optimizer per (module, learning-rate scale) pair in `mlr`:
# base learning rate 0.001 multiplied by the pair's scale, weight decay 5e-4,
# AMSGrad enabled.
optimizers = [
    get_optimizer(module, 0.001 * lr_scale, weight_decay=5e-4, amsgrad=True)
    for module, lr_scale in mlr
]

In [60]:
# Display the optimizers built above; the shown output lists three AdamW
# instances, each with lr 0.001, weight_decay 0.0005 and amsgrad True.
optimizers

[AdamW (
 Parameter Group 0
     amsgrad: True
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.001
     maximize: False
     weight_decay: 0.0005
 ),
 AdamW (
 Parameter Group 0
     amsgrad: True
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.001
     maximize: False
     weight_decay: 0.0005
 ),
 AdamW (
 Parameter Group 0
     amsgrad: True
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.001
     maximize: False
     weight_decay: 0.0005
 )]