# Static Testing with the CICFlow Dataset

This dataset consists of real live traffic data collected over a few days (the file names below are dated February and March 2018). First, we load the training set.

In [None]:
import numpy as np

# Load the training CSV with every cell kept as a string; the header row is
# still part of the array at this point.
training_path = "Friday-23-02-2018_TrafficForML_CICFlowMeter.csv"
training_data = np.loadtxt(training_path, dtype=str, delimiter=",")
print(f'loaded {len(training_data)} training samples with {len(training_data[0])} features')

# CSV files that are evaluated later on as validation data.
file_paths = [
    'Friday-02-03-2018_TrafficForML_CICFlowMeter.csv',
    'Friday-16-02-2018_TrafficForML_CICFlowMeter.csv',
    'Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv',
    'Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv',
    'Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv',
    'Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv',
    'Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv',
    'Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv',
    'Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv',
]
# Twenty numbered part files (1..20) stored in their own sub-directory.
file_paths.extend(
    f'Thursday-20-02-2018/Thursday-20-02-2018_TrafficForML_CICFlowMeter{i}.csv'
    for i in range(1, 21)
)

Next, remove the first row, which contains the column names, and the timestamp column (column index 2).

In [None]:
def remove_unnecessary_data(data):
    """Return a copy of ``data`` without its first row and without column 2.

    According to the notebook text, row 0 holds the column names and column 2
    holds the timestamp. The input itself is not modified.

    :param data: two-dimensional array-like of samples
    :return: new numpy array with one row and one column fewer
    """
    body_rows = np.delete(data, 0, axis=0)
    return np.delete(body_rows, 2, axis=1)


# Strip the first row and column 2 from the training set, then show the
# first remaining sample as a quick sanity check.
training_data = remove_unnecessary_data(data=training_data)
print(training_data[0])

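Then we determine the feature extraction methods: previously saved methods are loaded if possible; otherwise they are calculated from the training data and saved. Afterwards, every training sample is transformed with them.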

In [None]:
from feature_extraction.feature_extractor import FeatureExtractor

feature_methods_path = 'cic_flow_extraction_methods.csv'
feature_extractor = FeatureExtractor()
# Reuse previously saved extraction methods when they can be loaded; otherwise
# derive them from the training data and save them exactly once (the original
# cell issued the same save call twice in a row).
if feature_extractor.load_extraction_methods(feature_methods_path):
    print('methods loaded')
else:
    methods = feature_extractor.calculate_extraction_methods(training_data)
    feature_extractor.save_extraction_methods(feature_methods_path)
    print('methods calculated')
# Run every training sample through the extractor.
training_data = [feature_extractor.transform(d) for d in training_data]
print('data successfully extracted')

In [None]:
from feature_selection.genetic_algorithm_selector import reduce_bulk_features
from feature_selection.correlation_selector import CorrelationSelector

# Fit the correlation selector on the transformed training data.
correlation_selector = CorrelationSelector()
correlation_selector.train(training_data, list(range(2, 24)))

# Take the features returned for threshold 0.4, but fall back to the six
# highest-ranked features when fewer than six are returned.
features = correlation_selector.get_with_threshold(0.4)
if len(features) < 6:
    features = correlation_selector.get_highest_ranked_features(6)
# Disabled manual override left in the notebook:
# features = [8, 9, 13, 18, 23, 7]
print(f'features = {features}')

# Also keep the last column of each sample (read elsewhere via d[-1] as the
# label), then reduce the training data to the chosen columns.
last_column_index = len(training_data[0]) - 1
features.append(last_column_index)
corr_data = reduce_bulk_features(training_data, features)

Find optimal hyperparameters for IFTSVM Classifier

In [None]:
def calculate_measures(classifier, test_data, threshold):
    """Classify ``test_data`` and sort the scores into confusion-matrix buckets.

    The last entry of every sample is taken as its label. Samples whose label
    is not 1 are treated as positives. A sample counts as detected when its
    classifier score is at least ``threshold``.

    NOTE(review): the module-level ``features`` list had the label index
    appended when ``corr_data`` was built, whereas here the label column is
    removed before ``reduce_bulk_features`` is applied - confirm that this
    index is still valid for the reduced rows.

    :param classifier: object offering ``classify(sample)``
    :param test_data: samples with the label in the last position
    :param threshold: minimum score that counts as a detection
    :return: tuple of score lists (true_positive, true_negative,
        false_positive, false_negative)
    """
    labels = [row[-1] for row in test_data]
    unlabeled = np.delete(test_data, -1, axis=1)
    reduced = reduce_bulk_features(unlabeled, features)
    scores = [classifier.classify(sample) for sample in reduced]

    true_positive, true_negative = [], []
    false_positive, false_negative = [], []
    for index, score in enumerate(scores):
        detected = score >= threshold
        if labels[index] != 1:
            bucket = true_positive if detected else false_negative
        else:
            bucket = false_positive if detected else true_negative
        bucket.append(score)
    return true_positive, true_negative, false_positive, false_negative


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def validate(classifier):
    """Evaluate ``classifier`` on all validation files for thresholds 0.1-0.9.

    Each file in ``file_paths`` is loaded, cleaned with
    ``remove_unnecessary_data`` (the same preprocessing the training data
    receives) and transformed once; the result is then scored for every
    threshold. Accuracy, detection rate and false-alarm rate are printed per
    threshold after all files have been processed. A ratio whose denominator
    is zero is reported as ``nan`` instead of raising ``ZeroDivisionError``.

    NOTE: this relies on the three-argument ``calculate_measures``; a later
    cell redefines that name with a different signature, so run this first.

    :param classifier: object offering ``classify(sample)``
    """
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    # Per threshold: running counts in the order tp, tn, fp, fn.
    counts = {threshold: [0, 0, 0, 0] for threshold in thresholds}
    for path in file_paths:
        print(f'add {path}')
        validation_data = remove_unnecessary_data(np.loadtxt(path, delimiter=",", dtype=str))
        print(f'length data: {len(validation_data)}')
        validation_data = [feature_extractor.transform(d) for d in validation_data]
        # Loading and transforming do not depend on the threshold, so they
        # happen once per file rather than once per (threshold, file) pair.
        for threshold in thresholds:
            measures = calculate_measures(classifier, validation_data, threshold)
            for slot, bucket in enumerate(measures):
                counts[threshold][slot] += len(bucket)
    for threshold in thresholds:
        tp, tn, fp, fn = counts[threshold]
        print(f'result for {classifier}:')
        print(f'threshold: {threshold}')
        print(f'Accuracy: {_safe_ratio(tp + tn, tp + tn + fp + fn)}')
        print(f'Detection Rate: {_safe_ratio(tp, tp + fn)}')
        print(f'FAR: {_safe_ratio(fp, fp + tn)}')

In [None]:
from training_process.iftsvm_training import IFTSVMTrainer

# Search for the best IFTSVM coefficients on the reduced training data,
# evaluate the resulting classifier on the validation files, then store it.
label_range = list(range(2, 24))
iftsvm_trainer = IFTSVMTrainer(corr_data, label_range, 'non-linear')
iftsvm_classifier = iftsvm_trainer.find_best_coefficients()
validate(iftsvm_classifier)
iftsvm_classifier.save('iftsvm.csv')

Find optimal hyperparameters for ABC Classifier

In [None]:
from training_process.abc_training import ABCTrainer

# Search for the best ABC coefficients on the reduced training data,
# evaluate the resulting classifier on the validation files, then store it.
label_range = list(range(2, 24))
abc_trainer = ABCTrainer(corr_data, label_range)
abc_classifier = abc_trainer.find_best_coefficients()
validate(abc_classifier)
abc_classifier.save('abc_classifier.csv')

Training the model with a complete model trainer

In [None]:
import warnings
import datetime, time
from training_process.model_trainer import train_model

# Time the complete training run.
start = time.time()
# NOTE(review): this list is built but never passed to train_model or used
# anywhere else in the notebook - confirm whether it was meant to be an argument.
classifiers = [iftsvm_classifier, abc_classifier]
# Silence UserWarning messages from here on.
warnings.simplefilter("ignore", UserWarning)
trained_model = train_model(
    data=training_data,
    positive_labels=list(range(2, 24)),
    training_data_length=10000,
)
elapsed = datetime.timedelta(seconds=int(time.time() - start))
print(f'finished after {elapsed}')

Start the validation process with the dataset of the other days

In [None]:
from matplotlib import pyplot as plt


def create_hist(hist_data, name):
    """Show a histogram of ``hist_data`` with ten equal bins covering [0, 1].

    The bin edges are 0.0, 0.1, ..., 1.0. The previous edges came from
    ``np.arange(0, 1, 0.1)``, which ends at 0.9, so every value above 0.9 fell
    outside the binned range and was silently left out of the plot.

    :param hist_data: sequence of values to plot (x-axis is labelled
        'membership degree')
    :param name: title of the plot
    """
    bin_edges = np.linspace(0, 1, 11)
    plt.hist(hist_data, bins=bin_edges)
    plt.title(name)
    plt.xlabel('membership degree')
    plt.show()


def calculate_measures(test_data):
    """Classify ``test_data`` with ``trained_model`` and bucket the results.

    This definition replaces the earlier three-argument ``calculate_measures``.
    The last entry of every sample is taken as its true label; samples whose
    true label is not 1 are treated as positives. ``trained_model.classify``
    is unpacked as ``(label, membership_degree)``, and a predicted label of 0
    is counted as a detection.

    :param test_data: samples with the true label in the last position
    :return: tuple of membership-degree lists (true_positive, true_negative,
        false_positive, false_negative)
    """
    true_labels = [row[-1] for row in test_data]
    unlabeled = np.delete(test_data, -1, axis=1)
    predictions = [trained_model.classify(sample) for sample in unlabeled]

    true_positive, true_negative = [], []
    false_positive, false_negative = [], []
    for index, prediction in enumerate(predictions):
        predicted_label, mem_deg = prediction
        detected = predicted_label == 0
        if true_labels[index] != 1:
            bucket = true_positive if detected else false_negative
        else:
            bucket = false_positive if detected else true_negative
        bucket.append(mem_deg)
    return true_positive, true_negative, false_positive, false_negative


# Accumulate confusion-matrix counts over every validation file.
tp = tn = fp = fn = 0
for path in file_paths:
    print(f'add {path}')
    raw_rows = np.loadtxt(path, delimiter=",", dtype=str)
    validation_data = remove_unnecessary_data(raw_rows)
    print(f'length data: {len(validation_data)}')
    validation_data = [feature_extractor.transform(d) for d in validation_data]
    ctp, ctn, cfp, cfn = calculate_measures(validation_data)
    tp, tn = tp + len(ctp), tn + len(ctn)
    fp, fn = fp + len(cfp), fn + len(cfn)

# Report the overall metrics across all files.
print('result:')
sample_count = tp + tn + fp + fn
print(f'Accuracy: {(tp + tn) / sample_count}')
print(f'Detection Rate: {tp / (tp + fn)}')
print(f'FAR: {fp / (fp + tn)}')