In [16]:
# imports
import numpy as np
import random


%load_ext autoreload
%autoreload 2
from dataset_wrapper import find_and_load_datasets
from classifier_wrapper import SKLearnClassifierWrapper
from preprocessing_wrapper import PreprocessingWrapper
from logger import Logger
from features import FeatureExtraction


from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# other models, preprocessing, scalers etc.

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Notebook-wide configuration.
# Relative path of the directory that is searched for datasets.
root = "../../../dataset-private/"
# Value passed as `test_size` when a training batch is split for validation.
validation_split = 0.1

In [18]:
# Name of this run; it is handed to the Logger and embedded in the model file name.
experiment_name = "testing_of_pipeline_functionality_1"
# Pickle file the trained model is meant to be written to.
model_save_path = f"./models/model_{experiment_name}.pkl"

## Loading, init

In [19]:
# Build one loader per dataset found under `root`; the result is used as a
# mapping of dataset name -> loader throughout the notebook.
loaders = find_and_load_datasets(root, batch_size=1000)

found_datasets = list(loaders.keys())
print("Found datasets:", found_datasets)

# Optional sanity check, kept disabled. It used to live in a bare triple-quoted
# string, which Python still evaluates as a statement and which reads like live code.
# NOTE(review): `sample_n_from_each_dataset` is not imported in the visible import
# cell - import it before re-enabling this snippet.
# results = sample_n_from_each_dataset(loaders, n=3)
# for ds_name, info in results.items():
#     print(f"Dataset: {ds_name}  (file used: {info['file']})  samples: {len(info['samples'])}")
#     display(info['df'])

# Report the size of every dataset as given by len() of its loader.
for name, loader in loaders.items():
    print(f"{name}: {len(loader)} samples")

Found datasets: ['008-zeek-mixed', '009-zeek-malicious', '010-zeek-mixed', '011-zeek-mixed', '012-zeek-mixed', '013-zeek-mixed', '014-zeek-malicious', '015-zeek-malicious']
008-zeek-mixed: 5603 samples
009-zeek-malicious: 79401 samples
010-zeek-mixed: 5310 samples
011-zeek-mixed: 8722 samples
012-zeek-mixed: 3810 samples
013-zeek-mixed: 26241 samples
014-zeek-malicious: 26738 samples
015-zeek-malicious: 197722 samples


## Pipeline

In [20]:
# --- feature extraction ------------------------------------------------------
feature_extraction = FeatureExtraction()

# --- preprocessing: a single standard-scaling step for now -------------------
scaler = StandardScaler()
preprocessor = PreprocessingWrapper()
preprocessor.add_step("scaler", scaler)
# (further steps, e.g. PCA, would be registered here)

# --- classifier: SGD-trained linear model with hinge loss and L2 penalty -----
model = SGDClassifier(penalty="l2", loss="hinge")
classifier = SKLearnClassifierWrapper(model)

# --- reporting ---------------------------------------------------------------
logger = Logger(overwrite=True, experiment_name=experiment_name)

In [21]:
# Example execution plan consumed by the training/testing loop.
#   "command"        - either "train" or "test"
#   "dataset_prefix" - always three digits selecting the dataset (008, 009, 010, ...)
#   "validation"     - train commands only: hold out part of each batch for validation
# Plan: train on 008, then on 009, and evaluate on both datasets after each training.
commands = [
    step
    for train_prefix in ("008", "009")
    for step in (
        {"command": "train", "dataset_prefix": train_prefix, "validation": True},
        {"command": "test", "dataset_prefix": "008"},
        {"command": "test", "dataset_prefix": "009"},
    )
]

In [None]:

# Run the train/test plan from `commands`, streaming each dataset batch by batch:
#   1. pick the dataset whose name starts with the requested prefix
#   2. extract features and labels from the batch
#   3. scale them (the preprocessor is only ever fitted on training data)
#   4. "train": incremental fit, optionally with a held-out validation split
#      "test":  predict with the current model
#   5. write the results to a log file dedicated to that command
# TODO: the model is never saved once a dataset is finished, although
#       `model_save_path` is defined for that purpose.

np.random.seed(1111)
random.seed(1111)

for command_idx, command_dict in enumerate(commands):
    # Find the dataset we wanted to use. `next` gets an explicit default so that a
    # missing dataset is skipped instead of raising StopIteration, and the match is
    # anchored to the start of the name because the key is a *prefix*.
    ds = command_dict["dataset_prefix"]
    selected_dataset = next(
        (name for name in found_datasets if name.startswith(ds)), None
    )
    if selected_dataset is None:
        print(f"No dataset found for {ds}, skipping")
        continue
    loader = loaders[selected_dataset]

    # Based on the command specified, do the action.
    command = command_dict["command"]
    if command == "train":
        loader.reset_epoch(batch_size=1111)
        # One log file per command, named after the command's position in the plan.
        logger = Logger(
            experiment_name=experiment_name,
            path_to_logfile=f"{command_idx}_train_{ds}",
            overwrite=True,
        )
        print(f"Training on dataset {selected_dataset}")
        do_validation = command_dict.get("validation", False)

        # The batch counter has its own name so it no longer shadows the command index.
        for batch_idx in range(loader.batches()):
            print(f"Processing batch {batch_idx+1}")
            batch = loader.next_batch()
            X, y = feature_extraction.process_batch(batch)
            if X.shape[0] == 0:
                # Nothing usable in this batch; same guard as in the test branch.
                continue
            sum_labeled_flows = len(y)

            # train_test_split raises when the resulting train set would be empty,
            # which happens for a single-sample batch; such a batch is used for
            # training only.
            if do_validation and X.shape[0] >= 2:
                X_train, X_val, y_gt_train, y_gt_val = train_test_split(
                    X, y, test_size=validation_split, random_state=42
                )
                # Fit the scaler on the training part only, then reuse it for validation.
                preprocessor.partial_fit(X_train)
                X_train_processed = preprocessor.transform(X_train)
                # The wrapper's partial_fit return value is logged as the training predictions.
                y_pred_train = classifier.partial_fit(X_train_processed, y_gt_train)

                # Predict on the validation set.
                X_val_processed = preprocessor.transform(X_val)
                y_pred_val = classifier.predict(X_val_processed)
                logger.save_training_results(
                    y_pred_train, y_gt_train, y_pred_val, y_gt_val, sum_labeled_flows
                )
            else:
                preprocessor.partial_fit(X)
                X_processed = preprocessor.transform(X)
                y_pred_train = classifier.partial_fit(X_processed, y)
                logger.save_training_results(
                    y_pred_train, y, None, None, sum_labeled_flows
                )

    elif command == "test":
        logger = Logger(
            experiment_name=experiment_name,
            path_to_logfile=f"{command_idx}_test_{ds}",
            overwrite=True,
        )
        loader.reset_epoch(batch_size=10)
        print(f"Testing on dataset {selected_dataset}")
        for _ in range(loader.batches()):
            batch = loader.next_batch()
            X, y = feature_extraction.process_batch(batch)
            if X.shape[0] == 0:
                continue
            # Only transform here: the scaler must not be refitted on test data.
            X_processed = preprocessor.transform(X)
            y_pred = classifier.predict(X_processed)
            logger.save_test_results(y, y_pred)

    else:
        print(f"Unknown command {command}, skipping")

Training on dataset 008-zeek-mixed
Processing batch 1
Processing batch 2
Processing batch 3
Processing batch 4
Processing batch 5
Processing batch 6
Testing on dataset 008-zeek-mixed
   sport  dport  proto       dur  sbytes  dbytes  state  spkts  dpkts  bytes  \
0  62789    443    0.0  0.143547    1022    4048    1.0     24     16   5070   
1  31545   8140    0.0  0.000004       0       0    0.0      2      0      0   
2  52496     53    0.0  0.000000       0       0    0.0      1      0      0   
3  53559   2077    0.0  0.000009       0       0    0.0      2      0      0   
4  52496   5608    0.0  0.000002       0       0    0.0      2      0      0   
5  53116  12558    0.0  0.000003       0       0    0.0      2      0      0   
6  61162     23    0.0  0.000007       0       0    0.0      2      0      0   
7  36985   8100    0.0  0.000010       0       0    0.0      2      0      0   
8  51575  49501    0.0  0.000003       0       0    0.0      2      0      0   

   pkts  
0    4

In [None]:
# TODO: run a final test?
# TODO: evaluate on an unrelated dataset that was not used for training?
