In [1]:
# preparing data
from models import *
from methods import *
import pandas as pd
import torch
import os
from os import path
from sklearn.manifold import TSNE

# Data set selector; the loading code below recognises 'network_data',
# 'medical_data' and 'full_data'.
DATASET = 'full_data'

# ___________________Hyper Parameters________________________
# Every tuple lists candidate values for one setting.  The order inside a
# tuple matters: it is the order in which the values are iterated.

# --- model family ---
MODEL_NAME = ('VAE', 'DAE', 'AE')

# --- data handling / pre-processing ---
CROSS_VALIDATION_SHUFFLE = True  # not referenced in the cells visible here
TRIM_DATA = (True, False)
FILTER_CORRCOEF = (True, False)
REMOVE_NOISE = (True, False)
NOISE_THRESHOLD = (10, 3, 2)
DENOISE = (False, True)  # not referenced in the cells visible here
NOISE_FACTOR = (0.05, 0.2, 0.5)
NOISE_FRACTION = (0.1, 0.2, 0.5, 0.8)
PERPLEXITY = 10  # single value, handed to t-SNE
NORMALIZE_DATA = (True, False)

# --- network architecture ---
INITIALIZATION = ('xavier_normal', 'kaiming_normal')
ACTIVATION = ('tanh', 'leaky_relu', 'sigmoid')
SIGMA = (1e0, 1e-1, 1e-4)
LATENT_DIM = (8, 32, 128)
HIDDEN_DIM = (8, 64, 512)

# --- optimisation ---
GD_ALGORITHMS = ('SGD', 'Adam')
WEIGHT_DECAY = (0, 1e-6, 1e-4)
LEARNING_RATE = (1e-4, 1e-3)
BATCH_SIZE = (1024, 256, 128)
EPOCHS = (2000, 4000)
# ___________________________________________________________
# collapse
# Load Dataset
DATA_DIR = '../data'

# Maps every supported DATASET value to the CSV file-name prefixes whose
# columns are concatenated side by side (axis=1), in this order.
DATASET_SOURCES = {
    'network_data': ('network_flow',),
    'medical_data': ('medical',),
    'full_data': ('network_flow', 'medical'),
}


def _load_features(file_name):
    """Read ``DATA_DIR/file_name`` into a 2-D array.

    The first row of the file is skipped and the last column of every row is
    dropped, exactly as the previous inline ``np.loadtxt`` calls did.
    """
    return np.loadtxt(f'{DATA_DIR}/{file_name}', skiprows=1, delimiter=',')[:, :-1]


def load_dataset(name):
    """Return ``(regular, attack)`` arrays for the data set called ``name``.

    Parameters
    ----------
    name : str
        One of the keys of ``DATASET_SOURCES``.

    Returns
    -------
    tuple of numpy.ndarray
        The regular samples and the attack samples.  When a data set is built
        from several sources, their columns are concatenated along axis 1, so
        the sources must contain the same number of rows.

    Raises
    ------
    ValueError
        If ``name`` is not a known data set.  Previously an unknown name fell
        through silently and surfaced later as a ``NameError`` on ``raw_data``.
    """
    if name not in DATASET_SOURCES:
        raise ValueError(
            f"Unknown DATASET {name!r}; expected one of {sorted(DATASET_SOURCES)}"
        )
    prefixes = DATASET_SOURCES[name]
    regular = np.concatenate(
        [_load_features(f'{prefix}_regular_data.csv') for prefix in prefixes], axis=1)
    attack = np.concatenate(
        [_load_features(f'{prefix}_attack_data.csv') for prefix in prefixes], axis=1)
    return regular, attack


raw_data, anomalous_data = load_dataset(DATASET)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(raw_data.shape, anomalous_data.shape)

(13343, 20) (2046, 20)


In [None]:
import itertools
# Grid search: for every model family, pre-processing variant, architecture and
# optimiser setting, train a model, evaluate it on held-out regular data and on
# the attack data, and save the diagnostic plots.
#
# Reads the globals `raw_data`, `anomalous_data` and `device` from the loading
# cell and never rebinds them, so the cell can be re-run without reloading.

# The loss is the same for every configuration, so build it once.
criterion = torch.nn.MSELoss(reduction='sum')

# Different types of model
for model_name in MODEL_NAME:
    if model_name not in ('AE', 'DAE', 'VAE'):
        raise ValueError(f"Unknown model name {model_name!r}; expected 'AE', 'DAE' or 'VAE'")

    # A hyper-parameter that is never handed to the current model cannot change
    # the result, so sweep only its first value instead of retraining the very
    # same configuration once per candidate: sigma is only passed to the VAE,
    # noise_factor / noise_fraction only to the DAE.
    sigma_grid = SIGMA if model_name == 'VAE' else SIGMA[:1]
    noise_factor_grid = NOISE_FACTOR if model_name == 'DAE' else NOISE_FACTOR[:1]
    noise_fraction_grid = NOISE_FRACTION if model_name == 'DAE' else NOISE_FRACTION[:1]

    # Data Processing
    for trim_data, normalize_data, filter_corrcoef, remove_noise, noise_threshold in itertools.product(TRIM_DATA, NORMALIZE_DATA, FILTER_CORRCOEF, REMOVE_NOISE, NOISE_THRESHOLD):
        # Always pre-process from the untouched `raw_data` / `anomalous_data`.
        # The processed attack set gets its own name: rebinding `anomalous_data`
        # here would feed already-processed tensors back into the next
        # pre-processing round.
        # `filter_corrcoef` is now actually forwarded (it used to be swept while
        # the argument was hard-coded to True).
        train_data, validation_data, test_data, anomalous_processed = preProcessData_OneClass(
            raw_data, anomalous_data, trim=trim_data, trim_threshold=0.98, normalize=normalize_data,
            filterLinearDependencies=filter_corrcoef, filter_threshold=0.99,
            removeNoise=remove_noise, noise_threshold=noise_threshold)

        train_data, validation_data, test_data, anomalous_processed = toTorchTensor(
            device, train_data, validation_data, test_data, anomalous_processed)
        print(train_data.shape, validation_data.shape, test_data.shape, anomalous_processed.shape)
        NUM_FEATURE = train_data.shape[1]

        for latent_dim, hidden_dim, activation, initialization, sigma in itertools.product(LATENT_DIM, HIDDEN_DIM, ACTIVATION, INITIALIZATION, sigma_grid):
            for algo, learning_rate, weight_decay, epochs, batch_size, noise_factor, noise_fraction in itertools.product(GD_ALGORITHMS, LEARNING_RATE, WEIGHT_DECAY, EPOCHS, BATCH_SIZE, noise_factor_grid, noise_fraction_grid):
                # initialize model
                if model_name == 'AE':
                    model = AutoEncoder(num_feature=NUM_FEATURE, latent_dim=latent_dim, hidden_dim=hidden_dim, activation=activation, initialization=initialization).to(device)
                elif model_name == 'DAE':
                    model = DAE(num_feature=NUM_FEATURE, latent_dim=latent_dim, hidden_dim=hidden_dim, activation=activation, initialization=initialization, noise_factor=noise_factor, noise_fraction=noise_fraction).to(device)
                else:  # 'VAE' (validated at the top of the outer loop)
                    model = VAE(num_feature=NUM_FEATURE, latent_dim=latent_dim, hidden_dim=hidden_dim, activation=activation, initialization=initialization, sigma=sigma).to(device)

                # train -- the model reconstructs its own input, hence
                # train_data is passed twice.  The swept `algo` is now forwarded
                # (it used to be hard-coded to 'SGD').
                loss_array = train(model, algo, epochs, train_data, train_data, criterion,
                                   batch_size=batch_size, lr=learning_rate, weight_decay=weight_decay,
                                   grad_limit=1e3)
                # NOTE(review): every configuration of every model family is
                # written to this one path, so only the last run survives and
                # the file name says "AE" even for VAE/DAE weights.  Path left
                # unchanged because other code may read it -- confirm and rename.
                torch.save(model.state_dict(), '../model/param_AE')

                # test
                loss_test, loss_attack, kl_div_test, kl_div_attack, y_scores, y_scores_lcs, y_ground_truth, lcs_array_test, lcs_array_attack = test(model, criterion, train_data, test_data, anomalous_processed)

                # t-SNE analysis of code distribution
                tsne = TSNE(n_components=2, perplexity=PERPLEXITY)
                # .cpu() is required before .numpy(): the tensors live on
                # `device`, which is CUDA whenever a GPU is available.
                full_code = model.encode(torch.cat((test_data, anomalous_processed), dim=0)).detach().cpu().numpy()
                tsne_code = tsne.fit_transform(full_code)

                # plot
                # parameters for visualizer to print
                # NOTE(review): `algo` is not part of this dict.  If the
                # visualisers derive file names from it, SGD and Adam runs will
                # overwrite each other -- add "algo" once it is confirmed that
                # the visualisers accept arbitrary keys.
                parameters = {"trim_data": trim_data, "normalize_data": normalize_data,
                    "filter_corrcoef": filter_corrcoef, "remove_noise": remove_noise, "noise_threshold": noise_threshold,

                    "latent_dim": latent_dim, "hidden_dim": hidden_dim,
                    "activation": activation, "initialization": initialization,

                    "weight_decay": weight_decay,
                    "learning_rate": learning_rate,
                    "batch_size": batch_size,
                    "epochs": epochs,
                    }
                if model.name == 'VAE':
                    parameters["sigma"] = sigma
                if model.name == 'DAE':
                    parameters["denoise"] = True
                    parameters["noise_factor"] = noise_factor
                    parameters["noise_fraction"] = noise_fraction
                visualize_convergence(loss_array, model.name, save=True, **parameters)
                visualize_loss(loss_test, loss_attack, model.name, save=True, **parameters)
                visualize_tSNE(tsne_code, len(test_data), len(anomalous_processed), PERPLEXITY, model.name, save=True, **parameters)
                visualize_kl(kl_div_test, kl_div_attack, model.name, save=True, **parameters)
                # Only consumed by the (currently disabled) ROC plot below.
                scores = {"y_scores": y_scores, "y_scores_lcs": y_scores_lcs, "kl_scores": np.concatenate((kl_div_test, kl_div_attack), axis=None)}
                #visualize_ROC(y_ground_truth, model.name, True, scores ,**parameters)
                # Release all figures so memory does not grow across the sweep.
                plt.close('all')


In [None]:
# Combined detection rate of two separately trained models: one is evaluated on
# the network-flow attack samples, the other on the medical attack samples, and
# a row counts as detected when at least one of them flags it.
#
# FIXME(review): this cell cannot run as written.
#   * `trim_data`, `normalize_data`, `remove_noise` and `noise_threshold` are
#     not defined here; the only place this notebook binds them is as loop
#     variables of the grid-search cell, so the pre-processing below silently
#     uses whatever values that loop finished with (or fails on a fresh
#     kernel).  They should be explicit constants matching the settings the
#     saved models were trained with.
#   * `model` is set to None and then used, which raises AttributeError.  The
#     matching model class must be instantiated first (class not visible here).
#   * If the model is a torch.nn.Module, `load_state_dict` expects a state-dict
#     mapping (e.g. the result of `torch.load(path)`), not a file path.
#   * `raw_data` and `anomalous_data` from the loading cell are overwritten.

# load network data
raw_data, anomalous_data = np.loadtxt('../data/network_flow_regular_data.csv', skiprows=1, delimiter=',')[:,:-1], np.loadtxt('../data/network_flow_attack_data.csv', skiprows=1, delimiter=',')[:,:-1]
# Only the processed attack samples are kept; the three other return values
# (train / validation / test splits in the grid-search cell) are discarded.
_, _, _, anomalous_data = preProcessData_OneClass(raw_data, anomalous_data, trim=trim_data, trim_threshold=0.98, normalize=normalize_data,
            filterLinearDependencies=True, filter_threshold=0.98, removeNoise=remove_noise, noise_threshold=noise_threshold)
# import network optimal model
# FIXME(review): placeholder -- see the notes at the top of this cell.
model = None
model.load_state_dict('../model/param_DOCAE_network_optimal')
# indices that are detected
# True where the squared distance between a sample's code and `model.center`
# exceeds `model.R ** 2`.
# NOTE(review): the grid-search cell passes data through `toTorchTensor` before
# handing it to a model; presumably the same conversion is needed here -- TODO confirm.
idc1 = torch.sum((model.encode(anomalous_data) - model.center)**2, dim=1) > model.R**2

#_______________________________________________________________________________
# load medical data
raw_data, anomalous_data = np.loadtxt('../data/medical_regular_data.csv', skiprows=1, delimiter=',')[:,:-1], np.loadtxt('../data/medical_attack_data.csv', skiprows=1, delimiter=',')[:,:-1]
# Same pre-processing call as for the network data (same leaked settings).
_, _, _, anomalous_data = preProcessData_OneClass(raw_data, anomalous_data, trim=trim_data, trim_threshold=0.98, normalize=normalize_data,
            filterLinearDependencies=True, filter_threshold=0.98, removeNoise=remove_noise, noise_threshold=noise_threshold)
# import medical optimal model
# FIXME(review): placeholder -- see the notes at the top of this cell.
model = None
model.load_state_dict('../model/param_DOCAE_medical_optimal')
# indices that are detected
# Same criterion as idc1, applied to the medical attack samples.
idc2 = torch.sum((model.encode(anomalous_data) - model.center)**2, dim=1) > model.R**2

# Fraction of rows flagged by at least one model, relative to len(idc1).
# NOTE(review): assumes idc1 and idc2 have the same length and that row i of
# both attack files describes the same event -- TODO confirm.  Since only attack
# samples are scored, the printed value is a detection rate over the attack set.
print('Accuracy = ', torch.logical_or(idc1, idc2).sum() / len(idc1))
