In [1]:
# imports
import numpy as np
import random
import os
import json


%load_ext autoreload
%autoreload 2
from dataset_wrapper import find_and_load_datasets
from classifier_wrapper import SKLearnClassifierWrapper
from preprocessing_wrapper import PreprocessingWrapper
from logger import Logger
from features import FeatureExtraction

import subprocess


from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import IncrementalPCA

# other models, preprocessing, scalers etc.
import warnings

# other models, preprocessing, scalers etc.
warnings.filterwarnings("ignore", category=UserWarning,module="sklearn")



In [None]:
# paths, constants
root = "../../../dataset-private/" # path to the datasets 
validation_split=0.1
experiment_name = "test_longer_commands"
seed = 1111

## Loading, init

In [3]:
loaders = find_and_load_datasets(root) #helper function from dataset_loader.py

found_datasets=list(loaders.keys())
print("Found datasets:", found_datasets)

""" results = sample_n_from_each_dataset(loaders,n=3)
for ds_name, info in results.items():
    print(f"Dataset: {ds_name}  (file used: {info['file']})  samples: {len(info['samples'])}")
    display(info['df']) """

#print datasets
for name, loader in loaders.items():
    print(f"{name}: {len(loader)} samples")

Found datasets: ['008-zeek-mixed', '009-zeek-malicious', '010-zeek-mixed', '011-zeek-mixed', '012-zeek-mixed', '013-zeek-mixed', '014-zeek-malicious', '015-zeek-malicious']
008-zeek-mixed: 5603 samples
009-zeek-malicious: 79401 samples
010-zeek-mixed: 5310 samples
011-zeek-mixed: 8722 samples
012-zeek-mixed: 3810 samples
013-zeek-mixed: 26241 samples
014-zeek-malicious: 26738 samples
015-zeek-malicious: 197722 samples


In [5]:
loaders["008-zeek-mixed"].next_batch()

[{'ts': '1655071198.444458',
  'uid': 'CHLXQF2OBY4onAR0Ug',
  'id.orig_h': '147.32.81.167',
  'id.orig_p': 62771,
  'id.resp_h': '40.74.108.123',
  'id.resp_p': 443,
  'proto': 'tcp',
  'service': 'ssl',
  'duration': '1.135424',
  'orig_bytes': 1596,
  'resp_bytes': 4039,
  'conn_state': 'SF',
  'local_orig': True,
  'local_resp': False,
  'missed_bytes': 5138,
  'history': 'ShADdaFf',
  'orig_pkts': 14,
  'orig_ip_bytes': 1016,
  'resp_pkts': 16,
  'resp_ip_bytes': 1226,
  'tunnel_parents': '(empty)',
  'label': 'Benign',
  'detailedlabel': 'Microsoft'},
 {'ts': '1655071231.749548',
  'uid': 'CEL3aLvvNGvO6Ecoa',
  'id.orig_h': '193.201.9.232',
  'id.orig_p': 46563,
  'id.resp_h': '147.32.81.167',
  'id.resp_p': 3933,
  'proto': 'tcp',
  'service': None,
  'duration': '0.000010',
  'orig_bytes': 0,
  'resp_bytes': 0,
  'conn_state': 'S0',
  'local_orig': False,
  'local_resp': True,
  'missed_bytes': 0,
  'history': 'S',
  'orig_pkts': 2,
  'orig_ip_bytes': 80,
  'resp_pkts': 0,
  're

## Pipeline

In [None]:
#feature processing
feature_extraction = FeatureExtraction()

#preprocessing
scaler = StandardScaler() 
preprocessor = PreprocessingWrapper(experiment_name=experiment_name)
preprocessor.add_step("scaler", scaler)
pca = IncrementalPCA(n_components=7)
preprocessor.add_step("pca", pca)

#other steps here? add your own!

#classifier
model = SGDClassifier(loss='hinge', penalty='l2',random_state=seed) 
classifier = SKLearnClassifierWrapper(model)

In [None]:
# example of commands to run
# each command is either "train" or "test"
# dataset_prefix is 3 numbers always - which dataset to use (008, 009, 010, ...)
# validation = use validation portion when training
commands = [
    {"command": "train", "dataset_prefix": "008"},#, "validation": True},
    {"command": "test", "dataset_prefix": "008"},
    {"command": "test", "dataset_prefix": "009"},
    {"command": "test", "dataset_prefix": "015"},
    
    {"command": "train", "dataset_prefix": "009", "validation": True}, # evaluation
    # TODO: training error only from training part?
    {"command": "test", "dataset_prefix": "008"},
    {"command": "test", "dataset_prefix": "009"},
    {"command": "test", "dataset_prefix": "017"}, # skipped, does not exist. if it was first, would not be skipped, exit()
]

In [None]:

# train loop
    # call batch from dataset
    # process features
    # preprocessing (scaling)
    # train on model with validation, logger for metrics!
    # save model after the whole dataset is done
    # reporting, metrics, plots, etc.

np.random.seed(seed)
random.seed(seed)

# Ensure log experiment folder exists
experiment_folder = os.path.join(".", "logs", experiment_name)
os.makedirs(experiment_folder, exist_ok=True)

# Save config to configs.txt in the experiment folder (for reproducibility)
config_path = os.path.join(experiment_folder, "configs.txt")
with open(config_path, "w") as f:
    json.dump({
        "seed": seed,
        "validation_split": validation_split,
        "commands": commands,
        "experiment_name": experiment_name,
        "root": root
    }, f, indent=4)

# check if the model is fitted (now just to know if we can test from the start or need to train first)
try:
    dummy_input = np.zeros((1, model.n_features_in_))
    classifier.predict(dummy_input)
    is_fitted = True
except Exception:
    print("Model is not fitted.")
    is_fitted = False


# main loop doing commands one by one, and storing logs
for command_idx,command_dict in enumerate(commands):

    #find the dataset we wanted to use
    ds = command_dict["dataset_prefix"]
    try:    
        selected_dataset = next(name for name in found_datasets if ds in name)
        loader = loaders[selected_dataset]
    except StopIteration:
        print(f"No dataset found for {ds}, skipping")
        if command_idx == 0 and not is_fitted:
            print("No dataset for the first training command, exiting")
            exit(1)
        continue

    # based on the command specified, do the action
    command = command_dict["command"]
    if command == "train":
        loader.reset_epoch(batch_size=500)
        path_to_logfile = f"{command_idx}_train_{ds}"
        logger = Logger(experiment_name = f"{experiment_name}",path_to_logfile = path_to_logfile, overwrite=True)
        print(f"Training on dataset {selected_dataset}")
        do_validation = command_dict.get("validation", False)

        for i in range(loader.batches()):
            if i %50 == 0:
                print(f"Processing batch {i}")
            batch = loader.next_batch()
            X, y = feature_extraction.process_batch(batch)
            sum_labeled_flows = len(y)

            if do_validation:
                X_train, X_val, y_gt_train, y_gt_val = train_test_split(X, y, test_size=validation_split, random_state=seed)

                #preprocessor
                preprocessor.partial_fit(X_train)
                X_train_processed = preprocessor.transform(X_train)

                #classif
                classifier.partial_fit(X_train_processed, y_gt_train)
                y_pred_train = classifier.predict(X_train_processed)

                # predict on validation set
                X_val_processed = preprocessor.transform(X_val)
                y_pred_val = classifier.predict(X_val_processed)
                logger.save_training_results(
                    y_pred_train, y_gt_train, y_pred_val, y_gt_val, sum_labeled_flows
                )
            else:
                preprocessor.partial_fit(X)
                X_processed = preprocessor.transform(X)
                classifier.partial_fit(X_processed, y)
                y_pred_train = classifier.predict(X_processed)
                logger.save_training_results(
                    y_pred_train, y, None, None, sum_labeled_flows # None is for validation
                )

        # After training, plot the training performance using the external script, not here!


    elif command == "test":
        path_to_logfile = f"{command_idx}_test_{ds}"
        logger = Logger(experiment_name = f"{experiment_name}",path_to_logfile = path_to_logfile, overwrite=True)
        loader.reset_epoch(batch_size=1_000)
        print(f"Testing on dataset {selected_dataset}")
        for i in range(loader.batches()):
            batch = loader.next_batch()
            if i %50 == 0:
                print(f"Processing batch {i}")
            X, y = feature_extraction.process_batch(batch)
            if X.shape[0] == 0:
                continue
            X_processed = preprocessor.transform(X)
            y_pred = classifier.predict(X_processed)
            logger.save_test_results(y, y_pred)
        

    else:
        print(f"Unknown command {command}, skipping")
        continue

#save model, preprocessor steps and add config to the configs.txt
classifier.save_classifier(path = f"./models/{experiment_name}" ,name = "model_lin_SGD.bin")
preprocessor.save() #saves in models/<experiment_name>/preprocessing/<step_name>

#  append feature names from the first preprocessing step and model parameters to configs.txt
# get feature names from the first step in the preprocessor
first_preprocessor_step = preprocessor.steps[0][1]
if hasattr(first_preprocessor_step, 'get_feature_names_out'):
    feature_names = first_preprocessor_step.get_feature_names_out()
elif hasattr(first_preprocessor_step, 'feature_names_in_'):
    feature_names = first_preprocessor_step.feature_names_in_
else:
    feature_names = None

# Get model parameters
model_params = model.get_params()
model_info = {
    "class": type(model).__name__,
    "loss": getattr(model, "loss", None),
    "params": model_params
}

# Append to configs.txt
with open(config_path, "a") as f:
    f.write("\n\n# Feature names from first preprocessing step:\n")
    if feature_names is not None:
        f.write(json.dumps({"feature_names": list(feature_names)}, indent=4))
    else:
        f.write("# Feature names not available\n")
    f.write("\n\n# Model information:\n")
    f.write(json.dumps(model_info, indent=4))

Model is not fitted.
Training on dataset 008-zeek-mixed
Processing batch 0
Testing on dataset 008-zeek-mixed
Processing batch 0
Testing on dataset 009-zeek-malicious
Processing batch 0
Processing batch 50
Testing on dataset 015-zeek-malicious
Processing batch 0
Processing batch 50
Processing batch 100
Processing batch 150
Training on dataset 009-zeek-malicious
Processing batch 0
Processing batch 50
Processing batch 100
Processing batch 150
Testing on dataset 008-zeek-mixed
Processing batch 0
Testing on dataset 009-zeek-malicious
Processing batch 0
Processing batch 50
No dataset found for 017, skipping


In [None]:
# Go through experiment log folders and plot performance for each command
for idx, cmd in enumerate(commands):

    ds = command_dict["dataset_prefix"]
    try:    
        selected_dataset = next(name for name in found_datasets if ds in name)
        loader = loaders[selected_dataset]
    except StopIteration:
        print(f"No dataset found for {ds}, skipping")
        continue

    output_log = f"{idx}_{cmd['command']}_{cmd['dataset_prefix']}"
    log_dir = os.path.join("logs", experiment_name, output_log)
    save_folder = f"./results/{experiment_name}/"
    os.makedirs(save_folder, exist_ok=True)
    output_stdout = []
    if cmd["command"] == "train":
        subprocess.run([
            "python", "../plot_train_performance.py",
            "-f", log_dir,
            "-e", output_log,
            "--save_folder", save_folder
        ])

    elif cmd["command"] == "test":
        subprocess.run([
            "python", "../plot_testing_performance.py",
            "-f", log_dir,
            "-e", output_log,
            "--save_folder", save_folder
        ])



No dataset found for 017, skipping
No dataset found for 017, skipping
No dataset found for 017, skipping
No dataset found for 017, skipping
No dataset found for 017, skipping
No dataset found for 017, skipping
No dataset found for 017, skipping
No dataset found for 017, skipping


In [None]:
#PoC classifier loading from a location
classifier_loaded = SKLearnClassifierWrapper(classifier=None).load_classifier(path = f"models/{experiment_name}",name = "model_lin_SGD.bin")

#PoC loading preprocessing
preprocessor_loaded = PreprocessingWrapper(experiment_name=experiment_name).load()