# Static Testing with the UNSW NB15 Dataset

The UNSW NB15 Dataset is a dataset containing real captured data from the University of New South Wales in Sydney, Australia, from the year 2015. First, we load the training set.

In [None]:
import numpy as np

# Read the whole training CSV as strings. The title row and the id/label
# columns are still present here (they are stripped in the next cell), so the
# counts printed below include them.
data = np.loadtxt("UNSW_NB15_training-set.csv", delimiter=",", dtype=str)
print(f'loaded {len(data)} training samples with {len(data[0])} features')

In [None]:
def remove_unnecessary_data(data):
    """Return a copy of ``data`` without the title row, id column and last column.

    :param data: 2-D array as read from the CSV file (titles in row 0, id in
        column 0, label in the last column).
    :return: a new array; the array passed in is left untouched.
    """
    # (index, axis) pairs, applied strictly in this order.
    removals = (
        (0, 0),   # first line with the titles
        (0, 1),   # id
        (-1, 1),  # label (the column before it is kept)
    )
    for index, axis in removals:
        data = np.delete(data, index, axis=axis)
    return data


# Strip the title row, the id column and the label column from the training set.
data = remove_unnecessary_data(data)

Then, we calculate the correct feature extraction methods

In [None]:
from feature_extraction.feature_extractor import FeatureExtractor, calculate_best_numeric_normalization

# File used to persist the extraction methods between notebook runs.
feature_methods_path = 'extraction_methods.csv'
feature_extractor = FeatureExtractor()
# When load_extraction_methods returns a falsy value, the methods are
# calculated from the training data and written to the file instead.
if not feature_extractor.load_extraction_methods(feature_methods_path):
    methods = feature_extractor.calculate_extraction_methods(data)
    feature_extractor.save_extraction_methods(feature_methods_path)
    print('methods calculated')


    # Change some erroneous rows
    def get_numeric_row(data: list, i: int) -> list:
        """Return entry ``i`` of every row in ``data``, converted to float."""
        return [float(d[i]) for d in data]


    # NOTE: the manual overrides below are commented out, so get_numeric_row
    # is currently unused and the second save rewrites unchanged methods.
    # changes = [24, 25, 26, 27, 28, 29, 30, 33, 34, 35, 36, 37, 38, 39, 40]
    # for i in changes:
    #     m = calculate_best_numeric_normalization(get_numeric_row(data, i))
    #     feature_extractor.change_method(i, m)
    feature_extractor.save_extraction_methods(feature_methods_path)
else:
    print('methods loaded')
# Run every raw row through the extractor; data is a plain Python list afterwards.
data = [feature_extractor.transform(d) for d in data]
print('data successfully extracted')

In [None]:
from feature_selection.genetic_algorithm_selector import reduce_bulk_features
from feature_selection.correlation_selector import CorrelationSelector

# Train the correlation selector on the extracted data. The 2..23 list is the
# same one passed as positive_labels to train_model later in the notebook.
correlation_selector = CorrelationSelector()
correlation_selector.train(data, list(range(2, 24)))
# Feature indices returned for a threshold of 0.4.
features = correlation_selector.get_with_threshold(0.4)

print(f'features = {features}')
# features = [0, 3, 9, 19, 23, 24, 25, 33, 34]
# Also keep the last column, which the evaluation code reads as the label (d[-1]).
features.append(len(data[0]) - 1)
# Reduced copy of the training data used by the hyperparameter-search cells.
corr_data = reduce_bulk_features(data, features)

In [None]:
# Load the testing set and put it through the same steps as the training
# data: strip title row / id / label column, then apply the feature extractor.
validation_data = np.loadtxt("UNSW_NB15_testing-set.csv", delimiter=",", dtype=str)
validation_data = remove_unnecessary_data(validation_data)
validation_data = [feature_extractor.transform(d) for d in validation_data]


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def validate(classifier):
    """Print accuracy, detection rate and FAR of ``classifier`` per threshold.

    The classifier is evaluated on the module-level ``validation_data``. Each
    sample is classified once; the resulting scores are then compared against
    the thresholds 0.1 .. 0.9. A sample whose label differs from 1 counts as
    positive, and a score at or above the threshold counts as a detection.

    Rates whose denominator is zero (no positive or no negative samples) are
    printed as ``nan`` instead of raising ``ZeroDivisionError``.

    :param classifier: object providing ``classify(sample)`` that returns a
        value comparable with a float threshold.
    """
    labels = [sample[-1] for sample in validation_data]
    # NOTE(review): `features` also contains the index of the last column
    # (appended after the correlation selection), yet that column is deleted
    # here before the reduction - confirm reduce_bulk_features tolerates it.
    test_data = reduce_bulk_features(np.delete(validation_data, -1, axis=1), features)
    # Classify once; only the threshold comparison changes per iteration.
    classifications = [classifier.classify(sample) for sample in test_data]
    for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        tp = tn = fp = fn = 0
        for score, label in zip(classifications, labels):
            detected = score >= threshold
            if label != 1:
                if detected:
                    tp += 1
                else:
                    fn += 1
            elif detected:
                fp += 1
            else:
                tn += 1
        print(f'{classifier}:')
        print(f'threshold: {threshold}')
        print(f'Accuracy: {_safe_ratio(tp + tn, tp + tn + fp + fn)}')
        print(f'Detection Rate: {_safe_ratio(tp, tp + fn)}')
        print(f'FAR: {_safe_ratio(fp, fp + tn)}')

Find optimal hyperparameters for IFTSVM Classifier

In [None]:
from numpy import array_split
from classification.iftsvm_classifier import IFTSVMClassifier
from training_process.iftsvm_training import IFTSVMTrainer

# Start from a classifier with fixed hyperparameters and try to restore a
# previously saved one from 'iftsvm.csv'.
iftsvm_classifier = IFTSVMClassifier(alpha=50, delta=0, C_1=0.1, C_2=1, C_3=0.1, C_4=0.5, kernel_size=10)
if not iftsvm_classifier.load('iftsvm.csv'):
    # Loading failed: replace the classifier with the one returned by the
    # trainer, train it on the first of ten chunks of corr_data, and save it.
    iftsvm_trainer = IFTSVMTrainer(corr_data, list(range(2, 24)), 'non-linear')
    iftsvm_classifier = iftsvm_trainer.find_best_coefficients()
    iftsvm_classifier.train(array_split(corr_data, 10)[0], list(range(2, 24)))
    iftsvm_classifier.save('iftsvm.csv')
# validate(iftsvm_classifier)

Find optimal hyperparameters for ABC Classifier

In [None]:
from classification.abc_classifier import ABCClassifier
from training_process.abc_training import ABCTrainer

# Start from a classifier with fixed hyperparameters and try to restore a
# previously saved one from 'abc.csv'.
abc_classifier = ABCClassifier(method='non-linear',
                               generations=100, population_size=1000, cycle_numbers=1000,
                               chosen_number=10, kernel_size=7, sight=10, fit_function='sight')
if not abc_classifier.load('abc.csv'):
    # Loading failed: replace the classifier with the one returned by the
    # trainer, train it on the first of ten chunks of corr_data, and save it.
    # array_split is the name imported from numpy in the IFTSVM cell.
    abc_trainer = ABCTrainer(corr_data, list(range(2, 24)))
    abc_classifier = abc_trainer.find_best_coefficients()
    abc_classifier.train(array_split(corr_data, 10)[0], list(range(2, 24)))
    abc_classifier.save('abc.csv')
# validate(abc_classifier)

Training the model with a complete model trainer

In [None]:
import warnings
import datetime, time
from training_process.model_trainer import train_model

# Wall-clock start, used for the duration printed at the end of the cell.
start = time.time()
# The two classifiers prepared in the previous cells.
classifiers = [
    iftsvm_classifier,
    abc_classifier
]
# Suppress UserWarning output for the rest of the session.
warnings.simplefilter("ignore", UserWarning)
# Trains on the full extracted data (not the reduced corr_data).
trained_model = train_model(classifiers=classifiers, data=data, positive_labels=list(range(2, 24)), selector_threshold=0.4, iterations=3)
# Elapsed time, truncated to whole seconds.
print(f'finished after {datetime.timedelta(seconds=int(time.time() - start))}')

Start the validation process with the UNSW NB15 testing set

In [None]:
# Reload the testing set from disk; this replaces the validation_data built in
# the earlier validation cell with a freshly preprocessed copy.
validation_data = remove_unnecessary_data(np.loadtxt("UNSW_NB15_testing-set.csv", delimiter=",", dtype=str))
print(f'loaded {len(validation_data)} test samples with {len(validation_data[0])} features')
# Apply the same feature extractor that was used for the training data.
validation_data = [feature_extractor.transform(d) for d in validation_data]
print('data successfully extracted')

In [None]:
# Split off the last column as ground truth and classify the remaining columns.
labels = [row[-1] for row in validation_data]
test_data = np.delete(validation_data, -1, axis=1)
classifications = [trained_model.classify(sample) for sample in test_data]

# Each list collects the membership degrees of the samples in that outcome.
true_positive, true_negative = [], []
false_positive, false_negative = [], []
for (label, mem_deg), true_label in zip(classifications, labels):
    # A predicted label of 0 is counted as a detection.
    detected = label == 0
    if true_label != 1:
        bucket = true_positive if detected else false_negative
    else:
        bucket = false_positive if detected else true_negative
    bucket.append(mem_deg)

tp, tn = len(true_positive), len(true_negative)
fp, fn = len(false_positive), len(false_negative)
print('result:')
print(f'Accuracy: {(tp + tn) / (tp + tn + fp + fn)}')
print(f'Detection Rate: {tp / (tp + fn)}')
print(f'FAR: {fp / (fp + tn)}')