In [None]:
# imports
import numpy as np
import os
import json
import shutil
import subprocess

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

%load_ext autoreload
%autoreload 2

from .dataset_wrapper import find_and_load_datasets
from .classifier_wrapper import SKLearnClassifierWrapper
from .preprocessing_wrapper import PreprocessingWrapper
from .logger import Logger
from .features import FeatureExtraction


import warnings
warnings.filterwarnings("ignore", category=UserWarning,module="sklearn")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# paths, constants
root = "../../../dataset-private/" # path to the datasets 
validation_split=0.1
experiment_name = "test_plotting"
seed = 1111

## Loading, init

In [19]:
loaders = find_and_load_datasets(root) #helper function from dataset_loader.py

found_datasets=list(loaders.keys())
print("Found datasets:", found_datasets)

""" results = sample_n_from_each_dataset(loaders,n=3)
for ds_name, info in results.items():
    print(f"Dataset: {ds_name}  (file used: {info['file']})  samples: {len(info['samples'])}")
    display(info['df']) """

#print datasets
for name, loader in loaders.items():
    print(f"{name}: {len(loader)} samples")

Found datasets: ['001-zeek-scenario-malicious', '003-zeek-scenario-malicious', '008-zeek-mixed', '009-zeek-malicious', '010-zeek-mixed', '011-zeek-mixed', '012-zeek-mixed', '013-zeek-mixed', '014-zeek-malicious', '015-zeek-malicious', '016-zeek-malicious', '017-zeek-malicious', '018-zeek-malicious', '020-zeek-malicious', '021-zeek-malicious', '022-zeek-malicious', '023-zeek-malicious', '024-zeek-malicious', '025-zeek-malicious', '026-zeek-malicious', '027-zeek-malicious', '028-zeek-malicious', '029-zeek-malicious', '030-zeek-malicious', '031-zeek-malicious', '032-zeek-malicious', '033-zeek-malicious', '034-zeek-malicious', '035-zeek-malicious', '036-zeek-malicious', '037-zeek-mixed']
001-zeek-scenario-malicious: 253 samples
003-zeek-scenario-malicious: 361 samples
008-zeek-mixed: 5603 samples
009-zeek-malicious: 79401 samples
010-zeek-mixed: 5310 samples
011-zeek-mixed: 8722 samples
012-zeek-mixed: 3810 samples
013-zeek-mixed: 26241 samples
014-zeek-malicious: 26738 samples
015-zeek-ma

## Pipeline

In [20]:
#feature processing
feature_extraction = FeatureExtraction()

#preprocessing
scaler = StandardScaler() 
preprocessor = PreprocessingWrapper(experiment_name=experiment_name)
preprocessor.add_step("scaler", scaler)

""" pca = IncrementalPCA(n_components=7)
preprocessor.add_step("pca", pca) """

# other steps here? add your own!

#classifiers:
# sklearn SGD linear
model = SGDClassifier(loss='hinge', penalty='l2',random_state=seed) 
classifier = SKLearnClassifierWrapper(model,preprocessing_handler=preprocessor)


""" tree_Hoeff = tree.HoeffdingAdaptiveTreeClassifier(seed=seed,max_size=10,max_depth=15)
model = ensemble.ADWINBoostingClassifier(seed=seed,model = tree_Hoeff,n_models=5)


# model = forest.ARFClassifier(seed=seed,n_models=50, max_size=10, warning_detector=drift.ADWIN(delta=0.05))


# final wapper for pipeline
classifier = RiverClassifierWrapper(model,preprocessing_handler=preprocessor)
 """



In [21]:
# example of commands to run
# each command is either "train" or "test"
# dataset_prefix is 3 numbers always - which dataset to use (008, 009, 010, ...)
# validation = use validation portion when training

commands = [
    {"command": "train", "dataset_prefix": "009", "validation": True},
    {"command": "test", "dataset_prefix": "009"},

]    

In [22]:
# train loop
    # call batch from dataset
    # process features
    # preprocessing (scaling)
    # train on model with validation, logger for metrics!
    # save model after the whole dataset is done
    # reporting, metrics, plots, etc.

rng = np.random.default_rng(seed)

# Ensure log experiment folder exists
experiment_folder = os.path.join(".", "logs", experiment_name)
if os.path.exists(experiment_folder):
    if os.path.isdir(experiment_folder):
        shutil.rmtree(experiment_folder)
    else:
        os.remove(experiment_folder)
os.makedirs(experiment_folder, exist_ok=False)

# Save config to configs.txt in the experiment folder (for reproducibility)
config_path = os.path.join(experiment_folder, "configs.txt")
with open(config_path, "w") as f:
    json.dump({
        "seed": seed,
        "validation_split": validation_split,
        "commands": commands,
        "experiment_name": experiment_name,
        "root": root
    }, f, indent=4)

# check if the model is fitted (now just to know if we can test from the start or need to train first)
try:
    dummy_input = np.zeros((1, model.n_features_in_))
    classifier.predict(dummy_input)
    is_fitted = True
except Exception:
    print("Model is not fitted.")
    is_fitted = False


# main loop doing commands one by one, and storing logs
for command_idx,command_dict in enumerate(commands):

    #find the dataset we wanted to use
    ds = command_dict["dataset_prefix"]
    try:    
        selected_dataset = next(name for name in found_datasets if ds in name)
        loader = loaders[selected_dataset]
    except StopIteration:
        print(f"No dataset found for {ds}, skipping")
        if command_idx == 0 and not is_fitted:
            print("No dataset for the first training command, exiting")
            exit(1)
        continue

    # based on the command specified, do the action train/test
    command = command_dict["command"]
    num_batches = command_dict.get("batches", None)
    training_batches = loader.batches()
    if num_batches is not None:
        training_batches = min(num_batches, loader.batches())

    if command == "train":
        loader.reset_epoch(batch_size=500)
        path_to_logfile = f"{command_idx}_train_{ds}"
        logger = Logger(experiment_name = f"{experiment_name}", path_to_logfile = path_to_logfile, overwrite=True)
        print(f"Training on dataset {selected_dataset}")
        do_validation = command_dict.get("validation", False)

        for i in range(training_batches):
            if i %5 == 0:
                print(f"Processing batch {i}")


            batch = loader.next_batch()
            X, y = feature_extraction.process_batch(batch)

            sum_labeled_flows = len(y)
            if do_validation:
                try:

                    validation_indices = rng.choice(
                        X.shape[0],
                        size=int(validation_split * X.shape[0]),
                        replace=False,
                    )
                    train_indices = np.array(  # the rest is validation
                        sorted(
                            set(range(X.shape[0])) - set(validation_indices)
                        )
                    )
                    X_train, X_val, y_gt_train, y_gt_val = X.iloc[train_indices], X.iloc[validation_indices], y.iloc[train_indices], y.iloc[validation_indices]
                except Exception as e:
                    print(f"Error during train_test_split: {e}")
                    continue


                #preprocessor
                preprocessor.partial_fit(X_train)
                X_train_processed = preprocessor.transform(X_train)

                #classif
                classifier.partial_fit(X_train_processed, y_gt_train)
                y_pred_train = classifier.predict(X_train_processed)

                # predict on validation set
                if X_val.shape[0] == 0:
                    y_pred_val = np.array([])
                else:
                    X_val_processed = preprocessor.transform(X_val)
                    y_pred_val = classifier.predict(X_val_processed)

                logger.save_training_results(
                    y_pred_train, y_gt_train, y_pred_val, y_gt_val, sum_labeled_flows
                )
            else:
                preprocessor.partial_fit(X)
                X_processed = preprocessor.transform(X)
                classifier.partial_fit(X_processed, y)
                y_pred_train = classifier.predict(X_processed)
                logger.save_training_results(
                    y_pred_train, y, None, None, sum_labeled_flows # None is for validation
                )

        # After training, plot the training performance using the external script, not here!


    elif command == "test":
        path_to_logfile = f"{command_idx}_test_{ds}"
        logger = Logger(experiment_name = f"{experiment_name}",path_to_logfile = path_to_logfile, overwrite=True)
        loader.reset_epoch(batch_size=1_000)
        print(f"Testing on dataset {selected_dataset}")
        for i in range(loader.batches()):
            batch = loader.next_batch()
            if i %25 == 0:
                print(f"Processing batch {i}")
            X, y = feature_extraction.process_batch(batch)
            if X.shape[0] == 0:
                continue
            X_processed = preprocessor.transform(X)
            y_pred = classifier.predict(X_processed)
            logger.save_test_results(y, y_pred)
    else:
        print(f"Unknown command {command}, skipping")
        continue

#save model, preprocessor steps and add config to the configs.txt
models_path= f"./models/{experiment_name}"
# ensure models directory is fresh (delete if already created)
models_path = f"./models/{experiment_name}"
if os.path.exists(models_path):
    if os.path.isdir(models_path):
        shutil.rmtree(models_path)
    else:
        os.remove(models_path)
os.makedirs(models_path, exist_ok=False)
classifier.save_classifier(path = models_path ,name = "model_lin_SGD.bin")
preprocessor.save() #saves in models/<experiment_name>/preprocessing/<step_name>

#  append feature names from the first preprocessing step and model parameters to configs.txt
# get feature names from the first step in the preprocessor
first_preprocessor_step = preprocessor.steps[0][1]
if hasattr(first_preprocessor_step, 'get_feature_names_out'):
    feature_names = first_preprocessor_step.get_feature_names_out()
elif hasattr(first_preprocessor_step, 'feature_names_in_'):
    feature_names = first_preprocessor_step.feature_names_in_
else:
    feature_names = None

# Get model parameters
if hasattr(model, "get_params"):
    model_params = model.get_params()
else:
    model_params = None
model_info = {
    "class": type(model).__name__,
    "loss": getattr(model, "loss", None),
    "params": model_params
}

# Append to configs.txt
with open(config_path, "a") as f:
    f.write("\n\n# Feature names from first preprocessing step:\n")
    if feature_names is not None:
        f.write(json.dumps({"feature_names": list(feature_names)}, indent=4))
    else:
        f.write("# Feature names not available\n")
    f.write("\n\n# Model information:\n")
    f.write(json.dumps(model_info, indent=4))

Model is not fitted.
Training on dataset 009-zeek-malicious
Processing batch 0


Processing batch 5
Processing batch 10
Processing batch 15
Processing batch 20
Processing batch 25
Processing batch 30
Processing batch 35
Processing batch 40
Processing batch 45
Processing batch 50
Processing batch 55
Processing batch 60
Processing batch 65
Processing batch 70
Processing batch 75
Testing on dataset 009-zeek-malicious
Processing batch 0
Processing batch 25
Processing batch 50
Processing batch 75


In [23]:
if hasattr(classifier.classifier, 'n_warnings_detected'):
    print(classifier.classifier.n_warnings_detected())

if hasattr(classifier.classifier, 'n_drifts_detected'):
    print(classifier.classifier.n_drifts_detected())

In [24]:
# Go through experiment log folders and plot performance for each command

# saving place for plots!
save_folder = f"./results/{experiment_name}/"
if os.path.exists(save_folder):
    if os.path.isdir(save_folder):
        shutil.rmtree(save_folder)
    else:
        os.remove(save_folder)
os.makedirs(save_folder, exist_ok=False)

for idx, cmd in enumerate(commands):

    ds = command_dict["dataset_prefix"]
    try:    
        selected_dataset = next(name for name in found_datasets if ds in name)
        loader = loaders[selected_dataset]
    except StopIteration:
        print(f"No dataset found for {ds}, skipping")
        continue

    output_log = f"{idx}_{cmd['command']}_{cmd['dataset_prefix']}"
    log_dir = os.path.join("logs", experiment_name, output_log)
    output_stdout = []
    if cmd["command"] == "train":
        subprocess.run([
            "python", "../plot_train_performance.py",
            "-f", log_dir,
            "-e", output_log,
            "--save_folder", save_folder
        ])

    elif cmd["command"] == "test":
        subprocess.run([
            "python", "../plot_testing_performance.py",
            "-f", log_dir,
            "-e", output_log,
            "--save_folder", save_folder
        ])



[INFO] Reading logfile: logs/test_plotting/0_train_009
[INFO] Accumulating batch and cumulative metrics...
Validation plots will be saved to: /home/svobojan/StratosphereLinuxIPS/modules/flowmldetection/pipeline_ml_training/results/test_plotting/training/0_train_009/validation
Training plots will be saved to: /home/svobojan/StratosphereLinuxIPS/modules/flowmldetection/pipeline_ml_training/results/test_plotting/training/0_train_009/training
Comparison plots will be saved to: /home/svobojan/StratosphereLinuxIPS/modules/flowmldetection/pipeline_ml_training/results/test_plotting/training/0_train_009/comparison

=== VALIDATION Multi-class (Aggregated) ===
Accuracy:             1.0000
Malware F1:           1.0000
Malware FPR:          0.0000
Malware FNR:          0.0000
Macro F1:             1.0000
Precision:            1.0000
Recall:               1.0000
MCC:                  1.0000

=== TRAINING Multi-class (Aggregated) ===
Accuracy:             0.9999
Malware F1:           1.0000
Malware F