# Basic Classification Analysis of Static Classifiers 

This script perform the experiments for static analysis of classifiers on the UNSW_NB15 dataset. These experiments are used to form a baseline for the best case scenario when all of the training data are available to learn from when they are available. We use the off-the-shelf classifiers from [Sklearn](https://scikit-learn.org/) to benchmark. We extend the previous results from the author's that presented in some of their previous work.   

In [1]:
import numpy as np 
import scipy as sp
import pandas as pd 
import matplotlib.pylab as plt
plt.style.use('ggplot')

# load static classifiers 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# setup the global variables and paths that will be used throughout this document 
data_path = '../data/'      # path to the dataset (should be ../data/)
K = 5                       # number of randomized trials to run 
binary_classes = True       # do you want to run classifiers 

# Functions 

In [3]:
def dataframe2dataset(df, training = False, label_dict = {}): 
    """
    X, y, label_dict = dataframe2dataset(df, training = False, label_dict = {})
    """

    drop_cols = ['id', 'proto', 'service', 'state', 'attack_cat', 'label', 'is_sm_ips_ports']
    df_feat = df.drop(drop_cols, axis = 1)
    
    X = df_feat.values
    labels = df['attack_cat'].values
    y = np.zeros((len(X),))
    
    if training: 
        uni_labels = np.unique(labels)
        label_dict = {}
        for lbl, n in zip(uni_labels, range(len(uni_labels))):
            label_dict[lbl] = n
            
        # swap the index of the labels for normal and analysis to make the label of the normal 
        # samples be equal to one. this is a bit more standard than having the normal data 
        # be nonzero. 
        label_dict['Analysis'] = 6
        label_dict['Normal'] = 0
        
    for n in range(len(y)): 
        y[n] = label_dict[labels[n]]
        
    return X, y, label_dict

def calc_binary_results(y, yhat):
    """
    tpr, tnr, fpr, fnr, f1s, acc, mcc = calc_binary_results(y, yhat)
    """
    tp, tn, fp, fn = 0., 0., 0., 0.
    for i in range(len(y)): 
        if y[i] == 1. and yhat[i] == 1.: 
            tp += 1.
        elif y[i] == 0. and yhat[i] == 0.:
            tn += 1.
        elif y[i] == 0. and yhat[i] == 1.: 
            fp += 1.
        elif y[i] == 1. and yhat[i] == 0.: 
            fn += 1.
        else: 
            print('Should not be here.')
            
    tpr = tp/(tp+fn)
    tnr = tn/(tn+fp)
    fpr = fp/(fp+tn)
    fnr = fn/(fn+tp)
    f1s = 2*tp/(2*tp+tn+tp)
    acc = (tp+tn)/(tp+tn+fp+fn)
    mcc = (tp*tn-fp*fn)/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    
    return tpr, tnr, fpr, fnr, f1s, acc, mcc

# Binary Classification Results 

In [4]:
# parse out the training and testing data
df_tr = pd.read_csv(data_path + 'UNSW_NB15_training-set.csv')
df_te = pd.read_csv(data_path + 'UNSW_NB15_testing-set.csv')

Xtr, ytr, label_dict = dataframe2dataset(df_tr, training=True)
Xte, yte, _ = dataframe2dataset(df_te, training=False, label_dict=label_dict)

scaler = StandardScaler().fit(Xtr)
Xtr = scaler.transform(Xtr)
Xte = scaler.transform(Xte)

if binary_classes: 
    i, j = np.where(ytr!=0)[0], np.where(yte!=0)[0]
    ytr[i] = 1.
    yte[j] = 1.

In [5]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=25, max_features=10),
    AdaBoostClassifier(n_estimators=25),
    GaussianNB()]

classifiers = [GaussianNB(), 
               DecisionTreeClassifier(max_depth=5), 
               AdaBoostClassifier(), 
               RandomForestClassifier(max_depth=5, n_estimators=25, max_features=10), 
               GradientBoostingClassifier(), 
               LogisticRegression()
              ]
classifier_name = ['naive Bayes', 'CART', 'Adaboost', 'RandomForest', 'GradientBoosting', 'LogisticRegression']

tprs, tnrs, fprs, fnrs, f1ss, accs, mccs = np.zeros((len(classifiers),)), np.zeros((len(classifiers),)), np.zeros((len(classifiers),)), np.zeros((len(classifiers),)), np.zeros((len(classifiers),)), np.zeros((len(classifiers),)), np.zeros((len(classifiers),))

n = 0
for clf in classifiers:
    print('Running ' + classifier_name[n])
    for k in range(K): 
        clf_k = clf
        j = np.random.randint(0, len(ytr), len(ytr))
        yhat = clf_k.fit(Xtr[j], ytr[j]).predict(Xte)
        tpr, tnr, fpr, fnr, f1s, acc, mcc = calc_binary_results(yte, yhat)
        
        tprs[n] += tpr
        tnrs[n] += tnr
        fprs[n] += fpr
        fnrs[n] += fnr
        f1ss[n] += f1s
        accs[n] += acc
        mccs[n] += mcc
        
    n += 1

tprs, tnrs, fprs, fnrs, f1ss, accs, mccs = tprs/K, tnrs/K, fprs/K, fnrs/K, f1ss/K, accs/K, mccs/K

Running naive Bayes
Running CART
Running Adaboost
Running RandomForest
Running GradientBoosting
Running LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [7]:
tprs, tnrs, fprs, fnrs, f1ss, accs, mccs

(array([0.65813553, 0.99423365, 0.96920939, 0.99969999, 0.98619077,
        0.94124239]),
 array([0.84864865, 0.59869189, 0.70718919, 0.57863243, 0.69067027,
        0.57061622]),
 array([0.15135135, 0.40130811, 0.29281081, 0.42136757, 0.30932973,
        0.42938378]),
 array([3.41864467e-01, 5.76634607e-03, 3.07906115e-02, 3.00008824e-04,
        1.38092297e-02, 5.87576105e-02]),
 array([0.49347569, 0.57282573, 0.55624348, 0.57596709, 0.55997037,
        0.57227258]),
 array([0.74375213, 0.81647719, 0.85145751, 0.81047223, 0.85338386,
        0.77468299]),
 array([0.50922656, 0.66271292, 0.71301259, 0.65569315, 0.72298866,
        0.56189136]))