In [39]:
from preprocess_data import decode_N_WGAN_GP
from load_data import get_datasets_and_info
from hyperparams import DEFAULT_HYPERPARAMS_TO_TUNE
from gan_tuner_model import GANTunerModelCV
from gan import CIDDS_WCGAN_GP, StopTrainingOnNaNCallback
import keras_tuner as kt
import logging
import numpy as np
from pathlib import Path


In [25]:
# get train and test datasets
dataset_info = get_datasets_and_info()

# output dim is the number of features (WITHOUT the labels)
output_dim = dataset_info["folds"][0]["X_test"].shape[1]
X_encoders = dataset_info["X_encoders"]
y_encoder = dataset_info["y_encoder"]
X_colnames = dataset_info["X_colnames"]
y_encoder = dataset_info["y_encoder"]

In [32]:
# a = decode_N_WGAN_GP(dataset_info["folds"][0]["X_test"],dataset_info["folds"][0]["y_test"], y_encoder, X_colnames, X_encoders, True, True)
# a[a["class"] == "attacker"]["DstIP"].unique()

array(['104.55.241.127'], dtype=object)

In [36]:
# create GAN
model_gan = CIDDS_WCGAN_GP(
    output_dim=output_dim,
    num_classes=2,
    x_col_labels=X_colnames,
    x_encoders=X_encoders,
    decoder_func=decode_N_WGAN_GP,
    y_encoder=y_encoder,
    hyperparams_to_tune=DEFAULT_HYPERPARAMS_TO_TUNE,
)
# compile the gan model
model_gan.compile()


In [40]:
# dummy tuner just to run evaluate_TSTR
hypermodel = GANTunerModelCV(
    output_dim=output_dim,
    num_classes=2,
    X_encoders=X_encoders,
    y_encoder=y_encoder,
    X_colnames=X_colnames,
    decoder_func=decode_N_WGAN_GP,
)


In [52]:
# train the gan model
hp_num_epochs = model_gan.hyperparams_to_tune["num_epochs"]
hp_batch_size = model_gan.hyperparams_to_tune["batch_size"]

# for each fold in dataset_folds, obtain TSTR score and return the average
tstr_scores = []
dataset_folds = dataset_info["folds"]
for i, fold in enumerate(dataset_folds):
    # unpack the fold
    real_dataset = fold["train_dataset"]
    X_test = fold["X_test"]
    y_test = fold["y_test"]
    # train the model
    model_gan.fit(
        real_dataset.batch(hp_batch_size),
        epochs=hp_num_epochs,
        verbose=0,
        callbacks=[StopTrainingOnNaNCallback()],
    )
    # evaluate the gan model
    tstr_score = hypermodel.evaluate_TSTR(model_gan, X_test, y_test)
    tstr_scores.append(tstr_score)

# return the average TSTR score
avg_tstr_score = np.mean(tstr_scores)

2023-11-29 03:19:00.561905: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype float and shape [1,2]
	 [[{{node Placeholder/_6}}]]
2023-11-29 03:19:00.562218: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_8' with dtype float and shape [93968,31]
	 [[{{node Placeholder/_8}}]]
2023-11-29 03:19:11.141216: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype float and shape [1,2]

In [53]:
avg_tstr_score

0.7696638487879933

In [62]:
fake_X_y, retention_scores = model_gan.generate_n_plausible_samples(10000, 10000)
fake_X = fake_X_y[:, :-1]
fake_y = fake_X_y[:, -1].reshape(-1, 1)

In [63]:
decoded_fakes = decode_N_WGAN_GP(fake_X, fake_y, y_encoder, X_colnames, X_encoders, True, True)
display(decoded_fakes.head())
display(decoded_fakes.describe())

Unnamed: 0,Duration,Proto,SrcPt,DstPt,Packets,Bytes,Flags,Date_first_seen,SrcIP,DstIP,class
0,236550.4375,TCP,62761,0,1,40,....S.,Thursday 13:16:20,214.164.0.99,0.0.244.0,attacker
1,0.0,TCP,0,21852,1,40,.AP.SF,"Tuesday 2 days, 0:53:46",0.0.281.236,381.298.0.153,normal
2,0.0,TCP,0,41417,1,40,.A..S.,Thursday 20:14:07,0.81.65.70,245.214.60.111,attacker
3,0.0,TCP,0,9761,1,40,.AP.SF,"Friday 1 day, 1:06:00",0.0.179.132,181.241.111.205,normal
4,0.0,TCP,56547,0,1,40,....SF,Wednesday 2:35:52,371.99.0.0,0.0.253.0,attacker


Unnamed: 0,Duration,SrcPt,DstPt,Packets,Bytes
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,890.286194,13356.3788,16903.3022,346.3521,1444609.0
std,11570.135742,18552.453724,20028.073529,4350.387062,17767270.0
min,0.0,0.0,0.0,1.0,40.0
25%,0.0,0.0,0.0,1.0,40.0
50%,0.0,0.0,7326.0,1.0,40.0
75%,0.0,24501.0,31875.5,1.0,40.0
max,342866.53125,65490.0,65504.0,109454.0,559898900.0


In [72]:
# train a random forest on the fake data and evaluate on real data (e.g. first fold of test set)
from turtle import pos
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

X_test = dataset_info["folds"][0]["X_test"]
y_test = dataset_info["folds"][0]["y_test"]

rf = RandomForestClassifier()
rf.fit(fake_X, fake_y.ravel())
y_pred = rf.predict(X_test)

attacker_label = y_encoder.transform(["attacker"])[0][0]

print(f1_score(y_test, y_pred, pos_label=attacker_label))

0.9310045185957595


In [70]:
attacker_label = y_encoder.transform(["attacker"])[0]
attacker_label[0]

0