In [None]:
import numpy as np

# Read the comma-separated data file; dtype=str keeps every field as text
# so that nothing is parsed numerically at load time.
data = np.loadtxt(
    fname="kddcup.data_10_percent_corrected",
    dtype=str,
    delimiter=",",
)
print(f'loaded {len(data)} samples with {len(data[0])} features')

In [None]:
from feature_extraction.feature_extractor import FeatureExtractor

# Location of the cached per-feature extraction methods.
feature_methods_path = 'kdd_methods.csv'
feature_extractor = FeatureExtractor()
if feature_extractor.load_extraction_methods(feature_methods_path):
    print('methods loaded')
else:
    # No cached methods could be loaded: derive them from the data and
    # persist them right away.
    methods = feature_extractor.calculate_extraction_methods(data)
    feature_extractor.save_extraction_methods(feature_methods_path)
    print('methods calculated')

    from feature_extraction.feature_extractor import calculate_best_numeric_normalization


    def get_numeric_row(data, i: int) -> list:
        """Return the values found at index ``i`` of every sample, converted to float.

        Despite the name, this collects one *column* (feature ``i``) across
        all samples in ``data``.

        :param data: iterable of indexable samples
        :param i: index of the feature to collect
        :return: list of floats, one per sample
        :raises ValueError: if a value cannot be converted to float
        """
        return [float(d[i]) for d in data]


    # Feature columns (24-30 and 33-40) whose automatically calculated method
    # is overridden with the best numeric normalization for that column.
    changes = [*range(24, 31), *range(33, 41)]
    for i in changes:
        m = calculate_best_numeric_normalization(get_numeric_row(data, i))
        feature_extractor.change_method(i, m)
    # Persist again so that the overrides end up in the cache file as well.
    feature_extractor.save_extraction_methods(feature_methods_path)

# NOTE(review): `data` is rebound from the raw samples to the transformed
# samples, so re-running this cell alone would transform the data twice.
data = [feature_extractor.transform(d) for d in data]
print('data successfully extracted')
# Label values that are handed to the selectors and trainers as "positive".
positive_labels = list(range(2, 24))

In [None]:
from feature_selection.genetic_algorithm_selector import reduce_bulk_features
from feature_selection.correlation_selector import CorrelationSelector

correlation_selector = CorrelationSelector()
# Use the notebook-wide `positive_labels` instead of repeating the literal
# range; a copy is passed so the shared list cannot be altered by the callee.
correlation_selector.train(data, list(positive_labels))
features = correlation_selector.get_with_threshold(0.4)
if len(features) < 6:
    # Too few features passed the 0.4 threshold: take the six highest ranked.
    features = correlation_selector.get_highest_ranked_features(6)
print(f'features = {features}')
# Add the index of the last column of `data` to the selection. A new list is
# built instead of appending in place, so the list object returned by the
# selector is left untouched.
features = [*features, len(data[0]) - 1]
corr_data = reduce_bulk_features(data, features)

In [None]:
# Read the validation file as text, exactly like the training file.
validation_data = np.loadtxt("kddcup.data/kddcup.data", delimiter=",", dtype=str)
# Report the size of the set that was just loaded (not of the training `data`).
print(f'loaded {len(validation_data)} test samples with {len(validation_data[0])} features')
# NOTE(review): `validation_data` is rebound from raw to transformed samples,
# so re-running this cell alone requires the file to be loaded again (it is).
validation_data = [feature_extractor.transform(d) for d in validation_data]
print('data successfully extracted')

In [None]:



def create_hist(hist_data, name):
    """Show a histogram of ``hist_data`` with ten equal-width bins over [0, 1].

    :param hist_data: sequence of numeric values to plot
    :param name: title of the figure
    """
    # Imported locally so the function also works when it is called before
    # the cell that imports pyplot at notebook level has been run.
    from matplotlib import pyplot as plt

    # Eleven edges 0.0, 0.1, ..., 1.0. The former np.arange(0, 1, 0.1) stopped
    # at 0.9, so every value above 0.9 was silently left out of the plot.
    plt.hist(hist_data, bins=np.linspace(0, 1, 11))
    plt.title(name)
    plt.xlabel('membership degree')
    plt.show()


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def validate(classifier, thresholds=(0.3, 0.4, 0.5, 0.6, 0.7), plot_histograms=False):
    """Print accuracy, detection rate and false-alarm rate of ``classifier``.

    Every sample of the notebook-level ``validation_data`` is classified once;
    the resulting scores are then compared against each threshold. A sample
    whose label (its last element) is not ``1`` counts as positive, and a score
    greater than or equal to the threshold counts as a positive prediction.

    :param classifier: object with a ``classify(sample)`` method returning a
        value that can be compared with a threshold
    :param thresholds: decision thresholds to evaluate, in order
    :param plot_histograms: when true, show a histogram of the scores in each
        of the four outcome groups via ``create_hist`` for every threshold
    """
    labels = [d[-1] for d in validation_data]
    # NOTE(review): `features` had the index of the last column appended in the
    # feature-selection cell, while that column is removed here before the
    # reduction - verify that reduce_bulk_features copes with that index.
    test_data = reduce_bulk_features(np.delete(validation_data, -1, axis=1), features)
    classifications = [classifier.classify(sample) for sample in test_data]
    for threshold in thresholds:
        true_positive = []
        true_negative = []
        false_positive = []
        false_negative = []
        for test, label in zip(classifications, labels):
            predicted_positive = test >= threshold
            if label != 1:
                (true_positive if predicted_positive else false_negative).append(test)
            else:
                (false_positive if predicted_positive else true_negative).append(test)
        if plot_histograms:
            create_hist(true_positive, 'true positive')
            create_hist(true_negative, 'true negative')
            create_hist(false_positive, 'false positive')
            create_hist(false_negative, 'false negative')
        tp = len(true_positive)
        tn = len(true_negative)
        fp = len(false_positive)
        fn = len(false_negative)
        print(f'{classifier} with threshold {threshold}:')
        # The ratios are reported as nan instead of raising ZeroDivisionError
        # when the validation data contains no sample of the relevant class.
        print(f'Accuracy: {_safe_ratio(tp + tn, tp + tn + fp + fn)}')
        print(f'Detection Rate: {_safe_ratio(tp, tp + fn)}')
        print(f'FAR: {_safe_ratio(fp, fp + tn)}')

In [None]:
from classification.iftsvm_classifier import IFTSVMClassifier
from training_process.iftsvm_training import IFTSVMTrainer

# Start from a classifier with fixed coefficients; it is only kept when a
# previously saved model can be loaded from 'iftsvm.csv'.
iftsvm_classifier = IFTSVMClassifier(method='non-linear', alpha=50, delta=5, C_1=1, C_2=0.1, C_3=0.1, C_4=0.1,
                                     kernel_size=100)
if not iftsvm_classifier.load('iftsvm.csv'):
    # Nothing could be loaded: search for coefficients, train and save.
    # The trainer gets a copy of the shared `positive_labels` rather than a
    # second hand-written range, so both stay in agreement.
    iftsvm_trainer = IFTSVMTrainer(corr_data, list(positive_labels), 'non-linear')
    iftsvm_classifier = iftsvm_trainer.find_best_coefficients()
    iftsvm_classifier.train(corr_data, positive_labels)
    iftsvm_classifier.save('iftsvm.csv')
validate(iftsvm_classifier)

In [None]:
from classification.abc_classifier import ABCClassifier
from training_process.abc_training import ABCTrainer

# Start from a classifier with fixed parameters; it is only kept when a
# previously saved model can be loaded from 'abc.csv'.
abc_classifier = ABCClassifier(method='non-linear', population_size=2000, kernel_size=7, sight=10)
if not abc_classifier.load('abc.csv'):
    # Nothing could be loaded: search for coefficients, train and save.
    # The trainer gets a copy of the shared `positive_labels` rather than a
    # second hand-written range, so both stay in agreement.
    abc_trainer = ABCTrainer(corr_data, list(positive_labels))
    abc_classifier = abc_trainer.find_best_coefficients()
    abc_classifier.train(corr_data, positive_labels)
    abc_classifier.save('abc.csv')
validate(abc_classifier)

In [None]:
import warnings

import datetime, time
from training_process.model_trainer import train_model

# Ignore UserWarning from here on (the filter stays active for later cells).
warnings.simplefilter(action="ignore", category=UserWarning)

# The two previously prepared classifiers are handed to train_model together.
classifiers = [iftsvm_classifier, abc_classifier]

start = time.time()
trained_model = train_model(
    classifiers=classifiers,
    data=data,
    positive_labels=positive_labels,
)
# Elapsed wall-clock time, truncated to whole seconds.
print(f'finished after {datetime.timedelta(seconds=int(time.time() - start))}')

In [None]:
from matplotlib import pyplot as plt


def create_hist(hist_data, name):
    """Show a histogram of ``hist_data`` with ten equal-width bins over [0, 1].

    :param hist_data: sequence of numeric values to plot
    :param name: title of the figure
    """
    # Eleven edges 0.0, 0.1, ..., 1.0. The former np.arange(0, 1, 0.1) stopped
    # at 0.9, so every value above 0.9 was silently left out of the plot.
    plt.hist(hist_data, bins=np.linspace(0, 1, 11))
    plt.title(name)
    plt.xlabel('membership degree')
    plt.show()


# The last element of every validation sample is its label; the model only
# sees the remaining columns.
labels = [d[-1] for d in validation_data]
classifications = [trained_model.classify(sample) for sample in np.delete(validation_data, -1, axis=1)]
true_positive = []
true_negative = []
false_positive = []
false_negative = []
# Each classification is a (label, membership degree) pair. A predicted label
# of 0 counts as a positive prediction; a true label other than 1 counts as an
# actual positive.
for (label, mem_deg), true_label in zip(classifications, labels):
    predicted_positive = label == 0
    if true_label != 1:
        (true_positive if predicted_positive else false_negative).append(mem_deg)
    else:
        (false_positive if predicted_positive else true_negative).append(mem_deg)
create_hist(true_positive, 'true positive')
create_hist(true_negative, 'true negative')
create_hist(false_positive, 'false positive')
create_hist(false_negative, 'false negative')
tp = len(true_positive)
tn = len(true_negative)
fp = len(false_positive)
fn = len(false_negative)
# A ratio whose denominator is zero (no sample of the relevant class) is
# reported as nan instead of raising ZeroDivisionError.
total = tp + tn + fp + fn
nan = float('nan')
print('result:')
print(f'Accuracy: {(tp + tn) / total if total else nan}')
print(f'Detection Rate: {tp / (tp + fn) if tp + fn else nan}')
print(f'FAR: {fp / (fp + tn) if fp + tn else nan}')