In [1]:
# libraries
import importlib
import os

import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np
from numpy.random import seed

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import VarianceThreshold

from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.combination import maximization
from pyod.utils.utility import standardizer

import keras
from keras import backend as bkend
from keras.layers import Input, Dense, BatchNormalization, Dropout
from keras.models import Model


# Select TensorFlow as the Keras backend through the environment variable and
# reload the already-imported backend module.
# NOTE(review): presumably the reload is needed because `keras` was imported
# before the variable was set -- confirm.
os.environ["KERAS_BACKEND"] = "tensorflow"
importlib.reload(bkend)


# Let pandas print up to 4000 rows before truncating displayed frames/series.
pd.options.display.max_rows = 4000


Using TensorFlow backend.
Using TensorFlow backend.


In [2]:
# useful methods
def variance_threshold_selector(data, threshold=0):
    """Drop low-variance columns from a dataframe.

    A ``VarianceThreshold`` selector is fitted on ``data`` and only the
    columns it marks as supported are kept.

    Parameters
    ----------
    data : `dataframe`
        Dataframe whose columns are filtered.

    threshold : `float`
        Variance threshold handed to ``VarianceThreshold``. With the default
        of 0, columns holding a single distinct value are removed.

    Returns
    -------
    data : `dataframe`
        The surviving columns of ``data``, with their column names.
    """
    fitted_selector = VarianceThreshold(threshold).fit(data)
    kept_positions = fitted_selector.get_support(indices=True)
    kept_columns = data.columns[kept_positions]
    return data[kept_columns]

def min_max_scaler(data):
    """Scale every column of a dataframe with min-max scaling.

    Parameters
    ----------
    data : `dataframe`
        Dataframe to be scaled.

    Returns
    -------
    data : `dataframe`
        New dataframe holding the scaled values, with the same column names
        and the same row index as ``data``.
    """
    scaled_data = preprocessing.minmax_scale(data)
    # Keep the caller's row index: rows may have been dropped upstream, and
    # later lookups against the labelled frame rely on the original labels.
    return pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

def binning_anomally_score(data, n_bins=6, encode='ordinal', strategy='kmeans'):
    """Discretize anomaly scores into bins.

    Parameters
    ----------
    data : `array`
        1D array-like of anomaly scores (numpy array, list or pandas Series).

    n_bins : `int`
        Number of bins.

    encode : `str`
        Encoding of the result, passed to ``KBinsDiscretizer``. See: sklearn.

    strategy : `str`
        Binning strategy, passed to ``KBinsDiscretizer``. See: sklearn.

    Returns
    -------
    res : `array`
        Flattened output of the discretizer; with ``encode='ordinal'`` this
        is one bin identifier per input score.
    """
    est = preprocessing.KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy)
    # The discretizer expects a 2D (n_samples, n_features) input; np.asarray
    # lets callers pass any array-like, not only an ndarray.
    score_column = np.asarray(data).reshape(-1, 1)
    return est.fit_transform(score_column).ravel()

def plt_fig(x, y, xlabel="Cluster Number", ylabel="Number of connections"):
    """Draw a bar plot on a new 10x8 figure.

    Parameters
    ----------
    x : `array`
        1D numpy array with the bar positions / categories.

    y : `array`
        1D numpy array with the bar heights.

    xlabel : `str`
        Label of the x axis. Defaults to the label used for cluster plots.

    ylabel : `str`
        Label of the y axis.

    Returns
    -------
    No return values given.
    """
    plt.figure(figsize=(10, 8))
    axis = sns.barplot(x=x, y=y)
    # Results of the setters are discarded instead of rebinding the `x` argument.
    axis.set_xlabel(xlabel)
    axis.set_ylabel(ylabel)

### **The dataset given on Kaggle was not available to foreigners; therefore, a similar dataset was selected. The link to the dataset collection and its description are in [1]. This particular dataset instance was selected because various anomalous events occur in it.**

In [3]:
# load the raw capture; low_memory=False makes pandas read the file in one pass
df = pd.read_csv("../data/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv", low_memory=False)
# column names carry stray whitespace -> strip every name
df = df.rename(columns=str.strip)
# unsupervised view of the data: everything except the label column
df_u = df.drop(columns='Label')

In [4]:
# preview the first rows (up to 100) of the unlabelled dataset
df_u.head(100)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,80,38308,1,1,6,6,6,6,6.0,0.0,...,0,20,0.0,0.0,0,0,0.0,0.0,0,0
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,4,32,0.0,0.0,0,0,0.0,0.0,0,0
2,88,1095,10,6,3150,3150,1575,0,315.0,632.561635,...,3,32,0.0,0.0,0,0,0.0,0.0,0,0
3,389,15206,17,12,3452,6660,1313,0,203.058824,425.778474,...,10,32,0.0,0.0,0,0,0.0,0.0,0,0
4,88,1092,9,6,3150,3152,1575,0,350.0,694.509719,...,2,32,0.0,0.0,0,0,0.0,0.0,0,0
5,389,433,11,4,172,326,79,0,15.636364,31.449238,...,4,32,0.0,0.0,0,0,0.0,0.0,0,0
6,88,1088,9,6,3150,3152,1575,0,350.0,694.509719,...,2,32,0.0,0.0,0,0,0.0,0.0,0,0
7,80,579225,132,150,160,320799,160,0,1.212121,13.926212,...,1,32,0.0,0.0,0,0,0.0,0.0,0,0
8,49666,3,2,0,12,0,6,6,6.0,0.0,...,1,20,0.0,0.0,0,0,0.0,0.0,0,0
9,49413,4,3,0,18,0,6,6,6.0,0.0,...,2,20,0.0,0.0,0,0,0.0,0.0,0,0


In [None]:
# check features and column types (dtype and non-null count per column)
df_u.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692703 entries, 0 to 692702
Data columns (total 78 columns):
Destination Port               692703 non-null int64
Flow Duration                  692703 non-null int64
Total Fwd Packets              692703 non-null int64
Total Backward Packets         692703 non-null int64
Total Length of Fwd Packets    692703 non-null int64
Total Length of Bwd Packets    692703 non-null int64
Fwd Packet Length Max          692703 non-null int64
Fwd Packet Length Min          692703 non-null int64
Fwd Packet Length Mean         692703 non-null float64
Fwd Packet Length Std          692703 non-null float64
Bwd Packet Length Max          692703 non-null int64
Bwd Packet Length Min          692703 non-null int64
Bwd Packet Length Mean         692703 non-null float64
Bwd Packet Length Std          692703 non-null float64
Flow Bytes/s                   691695 non-null object
Flow Packets/s                 692703 non-null object
Flow IAT Mean                  

In [None]:
# check summary statistics (count, mean, std, min, quartiles, max) of each feature
df_u.describe()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,...,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0
mean,5686.869462,28001680.0,9.556261,10.214079,555.093,16996.44,233.593936,15.022183,60.55544,82.895863,...,6.121279,26.761141,92244.78,47608.52,162736.3,63151.86,22111220.0,474374.4,22521740.0,21733730.0
std,15727.42356,42766800.0,747.197814,984.204633,6163.663,2241175.0,603.751856,51.068835,157.643794,226.126084,...,715.155068,6.322368,700704.9,474208.1,1094616.0,605102.3,38124150.0,4488512.0,38482920.0,38077250.0
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,201.0,2.0,1.0,12.0,0.0,6.0,0.0,6.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,61437.0,2.0,2.0,82.0,188.0,46.0,0.0,41.0,0.0,...,1.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,443.0,83024370.0,7.0,6.0,365.0,11595.0,341.0,32.0,56.666667,128.916917,...,2.0,32.0,991.0,0.0,991.0,988.0,15900000.0,0.0,16500000.0,10000000.0
max,65487.0,120000000.0,203943.0,272353.0,1224076.0,627000000.0,24820.0,2065.0,4640.757576,6429.190773,...,197124.0,60.0,100000000.0,74200000.0,105000000.0,100000000.0,120000000.0,76900000.0,120000000.0,120000000.0


In [None]:
# check the number of distinct non-NaN values of each feature;
# a feature with a single distinct value has zero variance
df_u.nunique()

Destination Port                30094
Flow Duration                  363185
Total Fwd Packets                 685
Total Backward Packets            859
Total Length of Fwd Packets      9380
Total Length of Bwd Packets     23483
Fwd Packet Length Max            3728
Fwd Packet Length Min             220
Fwd Packet Length Mean          31471
Fwd Packet Length Std           63498
Bwd Packet Length Max            3836
Bwd Packet Length Min             429
Bwd Packet Length Mean          41095
Bwd Packet Length Std           55643
Flow Bytes/s                   450841
Flow Packets/s                 387031
Flow IAT Mean                  370810
Flow IAT Std                   222370
Flow IAT Max                   148528
Flow IAT Min                    47743
Fwd IAT Total                  100832
Fwd IAT Mean                   162817
Fwd IAT Std                    142925
Fwd IAT Max                     98682
Fwd IAT Min                     37400
Bwd IAT Total                  111743
Bwd IAT Mean

## Feature engineering

In [None]:
# Destination port is a nominal categorical feature, so it is count-encoded:
# every port is replaced by the number of rows that use it.
# The counts are taken from the untouched labelled frame `df`, so re-running
# this cell gives the same result instead of encoding the already-encoded column.
port_counts = df['Destination Port'].value_counts()
df_u['Destination Port'] = df['Destination Port'].map(port_counts)

In [None]:
# the two rate columns are read as `object`; cast them to float, then turn
# +/- infinity into NaN so both kinds of bad value can be handled together
df_u = (
    df_u
    .astype({'Flow Bytes/s': float, 'Flow Packets/s': float})
    .replace([np.inf, -np.inf], np.nan)
)

# number of NaN values per column
display(df_u.isna().sum())

Destination Port                  0
Flow Duration                     0
Total Fwd Packets                 0
Total Backward Packets            0
Total Length of Fwd Packets       0
Total Length of Bwd Packets       0
Fwd Packet Length Max             0
Fwd Packet Length Min             0
Fwd Packet Length Mean            0
Fwd Packet Length Std             0
Bwd Packet Length Max             0
Bwd Packet Length Min             0
Bwd Packet Length Mean            0
Bwd Packet Length Std             0
Flow Bytes/s                   1297
Flow Packets/s                 1297
Flow IAT Mean                     0
Flow IAT Std                      0
Flow IAT Max                      0
Flow IAT Min                      0
Fwd IAT Total                     0
Fwd IAT Mean                      0
Fwd IAT Std                       0
Fwd IAT Max                       0
Fwd IAT Min                       0
Bwd IAT Total                     0
Bwd IAT Mean                      0
Bwd IAT Std                 

In [None]:
# drop every row that contains a NaN value
# (other situations may call for a more careful treatment; for this task
# simply dropping the rows is sufficient)
df_u = df_u.dropna(axis=0)

In [None]:
# drop all columns that have only one distinct value (zero variance),
# then re-check the number of distinct values per remaining column
df_u = variance_threshold_selector(df_u)
df_u.nunique()

Destination Port                   37
Flow Duration                  363184
Total Fwd Packets                 685
Total Backward Packets            859
Total Length of Fwd Packets      9380
Total Length of Bwd Packets     23483
Fwd Packet Length Max            3728
Fwd Packet Length Min             220
Fwd Packet Length Mean          31471
Fwd Packet Length Std           63498
Bwd Packet Length Max            3836
Bwd Packet Length Min             429
Bwd Packet Length Mean          41095
Bwd Packet Length Std           55643
Flow Bytes/s                   450840
Flow Packets/s                 387030
Flow IAT Mean                  370809
Flow IAT Std                   222370
Flow IAT Max                   148527
Flow IAT Min                    47743
Fwd IAT Total                  100832
Fwd IAT Mean                   162817
Fwd IAT Std                    142925
Fwd IAT Max                     98682
Fwd IAT Min                     37400
Bwd IAT Total                  111743
Bwd IAT Mean

In [None]:
# min-max normalize every feature of the dataframe
# (previous variant, kept for reference:)
# df_unsupervised = pd.DataFrame(m_m_scaler.fit_transform(df_unsupervised.values), 
#                                columns=df_unsupervised.columns)
df_u = min_max_scaler(df_u)
df_u.head(10)



Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,1.0,0.0003192417,0.0,4e-06,5e-06,9.569378e-09,0.000242,0.002906,0.001293,0.0,...,0.0,0.344262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.003765,4e-06,4.9e-05,1.8e-05,0.000141,5.199362e-07,0.003183,0.0,0.003369,0.004892,...,2e-05,0.540984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.003089,9.133333e-06,4.4e-05,2.2e-05,0.002573,5.023923e-06,0.063457,0.0,0.067877,0.098389,...,1.5e-05,0.540984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.003765,0.000126725,7.8e-05,4.4e-05,0.00282,1.062201e-05,0.052901,0.0,0.043756,0.066226,...,5.1e-05,0.540984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.003089,9.108333e-06,3.9e-05,2.2e-05,0.002573,5.027113e-06,0.063457,0.0,0.075419,0.108024,...,1e-05,0.540984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.003765,3.616667e-06,4.9e-05,1.5e-05,0.000141,5.199362e-07,0.003183,0.0,0.003369,0.004892,...,2e-05,0.540984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.003089,9.075e-06,3.9e-05,2.2e-05,0.002573,5.027113e-06,0.063457,0.0,0.075419,0.108024,...,1e-05,0.540984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.004826883,0.000642,0.000551,0.000131,0.0005116411,0.006446,0.0,0.000261,0.002166,...,5e-06,0.540984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.000418,3.333333e-08,5e-06,0.0,1e-05,0.0,0.000242,0.002906,0.001293,0.0,...,5e-06,0.344262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.7e-05,4.166667e-08,1e-05,0.0,1.5e-05,0.0,0.000242,0.002906,0.001293,0.0,...,1e-05,0.344262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### **The large number of features needs to be reduced in order to improve the performance of the unsupervised algorithms. Different dimensionality-reduction techniques can be used here. I have chosen a basic vanilla autoencoder to compress the data, considering its performance / ease-of-implementation (complexity) ratio.**


In [None]:
# split the data into train (90%) and test (10%) parts
# `seed()` returns None, so it must not be passed as `random_state`.
# The global numpy RNG is still seeded here for the later cells that draw
# from it, and the split receives the seed value explicitly.
seed(2018)
df_u_train, df_u_test = train_test_split(df_u, train_size=0.9,
                                         random_state=2018)




In [None]:
# --- autoencoder hyper-parameters ---
n_hidden_units, encoding_dim = 100, 10   # hidden layer width / bottleneck width
batch_size, n_epoch = 100, 50
dropout = 0.2
n_features = df_u_train.shape[1]         # one input unit per feature column

# --- training callbacks, both watching the validation loss ---
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)
reduce_learn_rate = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.1, patience=5
)
callbacks_list = [early_stop, reduce_learn_rate]


In [None]:
# model definition
def _dense_block(tensor, units, dropout_rate=None):
    """Apply BatchNormalization -> Dense(relu) -> optional Dropout to `tensor`."""
    tensor = BatchNormalization()(tensor)
    tensor = Dense(units=units, activation="relu")(tensor)
    if dropout_rate is not None:
        tensor = Dropout(rate=dropout_rate)(tensor)
    return tensor


# *** encoding part ***
input_dim = Input(shape=(n_features,))

# embedding layer + two hidden layers, each followed by dropout
encoded = input_dim
for _ in range(3):
    encoded = _dense_block(encoded, n_hidden_units, dropout)

# last hidden layer: dropout not applied
# https://stackoverflow.com/questions/38125657/what-layers-should-experience-dropout-when-training-a-neural-network
encoded = _dense_block(encoded, n_hidden_units)

# output layer of the encoder (the compressed representation)
encoded_last = _dense_block(encoded, encoding_dim)

# *** decoding part ***
# embedding layer of the decoder + two hidden layers, each followed by dropout
decoded = encoded_last
for _ in range(3):
    decoded = _dense_block(decoded, n_hidden_units, dropout)

# last hidden layer: dropout not applied
decoded = _dense_block(decoded, n_hidden_units)

# output layer: reconstruct all input features
decoded = _dense_block(decoded, n_features)

autoencoder = Model(inputs=input_dim, outputs=decoded)
# NOTE(review): 'accuracy' has no clear meaning for a reconstruction
# (regression) objective; it is kept only so the training log is unchanged.
autoencoder.compile(optimizer='adam', loss='mean_squared_logarithmic_error', metrics=['accuracy'])

# summary() prints by itself and returns None, so it is not wrapped in print()
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 68)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 68)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               6900      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 100)               400       
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
__________

In [None]:
# # second model
# input_dim = Input(shape = (n_features, ))

# # Encoder Layers
# encoded = Dense(n_hidden_units, activation = 'relu')(input_dim)
# encoded = Dropout(rate=dropout)(encoded)
# encoded = Dense(n_hidden_units, activation = 'relu')(encoded)
# encoded = Dropout(rate=dropout)(encoded)
# encoded = Dense(n_hidden_units, activation = 'relu')(encoded)
# encoded = Dropout(rate=dropout)(encoded)
# encoded = Dense(n_hidden_units, activation = 'relu')(encoded)
# encoded_last = Dense(encoding_dim, activation = 'relu')(encoded)

# # Decoder Layers
# decoded = Dense(n_hidden_units, activation = 'relu')(encoded_last)
# decoded = Dropout(rate=dropout)(decoded)
# decoded = Dense(n_hidden_units, activation = 'relu')(decoded)
# decoded = Dropout(rate=dropout)(decoded)
# decoded = Dense(n_hidden_units, activation = 'relu')(decoded)
# decoded = Dropout(rate=dropout)(decoded)
# decoded = Dense(n_hidden_units, activation = 'relu')(decoded)
# decoded = Dense(n_features, activation = 'sigmoid')(decoded)

# # Combine Encoder and Decoder layers
# autoencoder = Model(inputs = input_dim, outputs = decoded)

# # Compile the Model
# # accuracy metric might be redundant for this task because it is unclear of meaning in regression
# autoencoder.compile(optimizer = 'adam', loss = 'mean_squared_logarithmic_error', metrics=['accuracy'])
# # autoencoder.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])
# print(autoencoder.summary())

In [None]:
# the autoencoder learns to map the train data onto itself;
# 20% of the train data is held out for the validation loss the callbacks monitor
# batch_size comes from the parameter cell instead of a repeated literal
history = autoencoder.fit(df_u_train, df_u_train,
                          validation_split=0.2,
                          epochs=n_epoch,
                          batch_size=batch_size,
                          shuffle=True,
                          callbacks=callbacks_list)

Train on 497812 samples, validate on 124453 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50

In [None]:
# extract the encoder half (input -> bottleneck output) as its own model;
# it reuses the layers of the autoencoder graph and is used for feature extraction
encoder = Model(inputs = input_dim, outputs = encoded_last)

In [None]:
# use the extracted encoder to get the compressed, non-linear features
enc_df_u_train = pd.DataFrame(encoder.predict(df_u_train)).add_prefix('new_feature_')
enc_df_u_test = pd.DataFrame(encoder.predict(df_u_test)).add_prefix('new_feature_')

# drop all features with zero variance: the decision is made on the train set
# and the same columns are kept in the test set, so both frames always share
# one feature layout
enc_df_u_train = variance_threshold_selector(enc_df_u_train)
enc_df_u_test = enc_df_u_test[enc_df_u_train.columns]

# normalize data for further tasks: the scaler is fitted on train only and
# then applied to both parts, so train and test live on the same scale
# (test values may therefore fall slightly outside [0, 1])
encoded_scaler = preprocessing.MinMaxScaler().fit(enc_df_u_train)
enc_df_u_test = pd.DataFrame(encoded_scaler.transform(enc_df_u_test),
                             columns=enc_df_u_test.columns)
enc_df_u_train = pd.DataFrame(encoded_scaler.transform(enc_df_u_train),
                              columns=enc_df_u_train.columns)

print(enc_df_u_train.shape)
enc_df_u_train.head(20)


## Anomaly detection

### Firstly, we can use a simpler model to cluster data points. Start with K-MEANS:

In [None]:
# plot the k-means score against the number of clusters to find a suitable value
kmean_df = enc_df_u_train.copy()
kmean_test_df = enc_df_u_test.copy()
n_cluster = range(1, 10)
# one model per candidate cluster count, fitted on the train features
kmeans = [KMeans(n_clusters=k).fit(kmean_df) for k in n_cluster]
# score every fitted model on the held-out features
scores = [model.score(kmean_test_df) for model in kmeans]
fig, ax = plt.subplots()
ax.plot(n_cluster, scores)
ax.set_xlabel("Number of clusters")
ax.set_ylabel("K-means score on test data")
plt.show()

**We can choose a number between 5 and 10 as the elbow point. For efficiency, I am choosing 6.**

In [None]:
# fitting k-means cluster
num_of_clusters = 6
# A fresh model is built instead of indexing into the list of elbow models:
# rebinding `kmeans` from that list to one of its elements made this cell
# fail when run a second time. Fitting replaces any earlier fit, so
# starting from a new estimator gives the same kind of result.
# NOTE(review): the model is fitted on the test features (as before), which
# is what makes `labels_` line up with the rows of `kmean_test_df`.
kmeans = KMeans(n_clusters=num_of_clusters)
kmeans.fit(kmean_test_df)

In [None]:
# add the cluster number assigned to each row as a new column
kmean_test_df['cluster'] = kmeans.labels_
kmean_test_df.head()

In [None]:
# check the number of samples in each cluster by plotting
# value_counts() orders by frequency, so sort by cluster id and take the x
# positions from that same index; otherwise the bars are labelled with the
# wrong cluster numbers
freq_of_cluster = kmean_test_df['cluster'].value_counts().sort_index()
cluster_num_arr = freq_of_cluster.index.values
plt_fig(cluster_num_arr, freq_of_cluster.values)
display(kmean_test_df['cluster'].value_counts())

In [None]:
# compare against the ground truth
# pick the labelled rows whose index labels occur in the test split
# NOTE(review): this is only correct if df_u_test still carries df's original
# row labels -- confirm that the preprocessing steps upstream preserve the index.
in_test_split = df.index.isin(df_u_test.index)
ground_truth = df[in_test_split]
# occurrences of every label among those rows
label_counts = ground_truth['Label'].value_counts()
# bar plot of the label frequencies, then the raw numbers
plt_fig(np.arange(len(label_counts)), label_counts)
display(label_counts)

### Let's check another method for clustering connections: Isolation Forest [2]

In [None]:
iso_df = enc_df_u_train.copy()
iso_test_df = enc_df_u_test.copy()
contamination = 0.3  # the proportion of outliers in the data set

# train the forest on the train features ...
clf = IsolationForest(contamination=contamination).fit(iso_df)
# ... score the test features and discretize those scores into bins
res = binning_anomally_score(clf.decision_function(iso_test_df))

# distinct bins and how many samples fell into each
unique_elements, counts_elements = np.unique(res, return_counts=True)
# plot and show the frequencies
plt_fig(unique_elements, counts_elements)
display(counts_elements)

### **According to paper [3], different approaches can achieve better accuracy. The KDD99 task studied in the paper is very similar to ours, so we can take several of its better-performing methods and apply them to our problem. Two of them have been selected.**

### Thirdly, we can use Clustering-Based Local Outlier Factor(CBLOF)[4]:

In [None]:
cblof_df = enc_df_u_train.copy()
minimum_n_cluster = 2
max_n_cluster = 9
n_cluster = max_n_cluster - minimum_n_cluster
contamination = 0.3  # the proportion of outliers in the data set
# one column of test anomaly scores per cluster count
test_scores = np.zeros([enc_df_u_test.shape[0], n_cluster])
fitted_cols = []
# fit CBLOF with different cluster counts and save the anomaly score of each test sample
for col, k in enumerate(range(minimum_n_cluster, max_n_cluster)):
    clf = CBLOF(n_clusters=k, contamination=contamination)
    try:
        # fitting (not construction) is the step that can reject a cluster count
        clf.fit(cblof_df)
    except ValueError as err:
        print("CBLOF with n_clusters={} skipped: {}".format(k, err))
        continue
    test_scores[:, col] = clf.decision_function(enc_df_u_test)
    fitted_cols.append(col)

if not fitted_cols:
    raise RuntimeError("CBLOF could not be fitted for any cluster count")
# keep only the columns that were really filled; an all-zero column from a
# skipped model would distort the standardization and the ensemble
test_scores = test_scores[:, fitted_cols]

# normalize the scores for evaluation
test_scores_norm = standardizer(test_scores)
# ensemble the anomaly scores recorded by the different classifiers
comb_by_maximization = maximization(test_scores_norm)

In [None]:
# bin the combined anomaly scores using the discretizer helper
res = binning_anomally_score(comb_by_maximization)
# get the distinct bins and the number of samples in each
unique_elements, counts_elements = np.unique(res, return_counts=True)
# plot the frequencies and show the raw counts
plt_fig(unique_elements, counts_elements)
display(counts_elements)

### Lastly, we can use Histogram-based anomaly detection algorithm (HBOS)[5]:

In [None]:
hbos_df = enc_df_u_train.copy()
# NOTE: for HBOS these two bounds are numbers of histogram bins; the
# variable names are kept unchanged
minimum_n_cluster = 2
max_n_cluster = 9
n_cluster = max_n_cluster - minimum_n_cluster
contamination = 0.3  # the proportion of outliers in the data set
# one column of test anomaly scores per bin count
test_scores = np.zeros([enc_df_u_test.shape[0], n_cluster])
# fit HBOS with different bin counts and save the anomaly score of each test sample
for col, k in enumerate(range(minimum_n_cluster, max_n_cluster)):
    clf = HBOS(n_bins=k, contamination=contamination)
    # fit on this cell's own copy of the train features
    clf.fit(hbos_df)
    test_scores[:, col] = clf.decision_function(enc_df_u_test)

# normalize the scores for evaluation
test_scores_norm = standardizer(test_scores)
# ensemble the anomaly scores recorded by the different classifiers
comb_by_maximization = maximization(test_scores_norm)

In [None]:
# bin the combined anomaly scores using the discretizer helper
res = binning_anomally_score(comb_by_maximization)
# get the distinct bins and the number of samples in each
unique_elements, counts_elements = np.unique(res, return_counts=True)
# plot the frequencies and show the raw counts
plt_fig(unique_elements, counts_elements)
display(counts_elements)

# Definition of normal

### In this chapter, we will try to specify the characteristics of a normal connection. To do that, let's compare two clusters to arrive at a definition of a normal connection. The intuitive way is to select the most frequent cluster (which probably represents normal connections) and the least frequent one (which probably contains severe attacks).

# References

[1] https://www.unb.ca/cic/datasets/ids-2017.html <br>
[2] https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf <br>
[3] https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0152173&type=printable <br>
[4] He, Z., Xu, X. and Deng, S., 2003. Discovering cluster-based local outliers. Pattern Recognition Letters, 24(9-10), pp.1641-1650.  <br>
[5] https://pdfs.semanticscholar.org/5cf8/81d1db19834f123fcfc79ad32097aeafe17f.pdf