# Initial Classification Modeling for UNSW NB-15 cyberattack dataset

In [3]:
# Custom modules
from data_prep import load_csv_data
import model_abstraction as moda

# Data Structures
import pandas as pd
import numpy as np

# Preprocessing or data manipulation methods
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder

# Modeling methods and selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import OneClassSVM, LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

# Model assessment
from sklearn.metrics import confusion_matrix, roc_auc_score

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Modeling Using only Numeric Features


In [4]:



X_train, y_train = load_csv_data('/content/UNSW_training_30_50.csv')
#X_train, X_hold, y_train, y_hold  = train_test_split(X_train,y_train, test_size = 0.25,
      #                                               random_state = 42, stratify=y_train)
X_test, y_test = load_csv_data('/content/UNSW_testing_30_10.csv')

In [5]:
# Number of numeric features remaining
len(X_train.columns)

39

In [None]:
# Provide classifiers to test for a "first pass" assessment using only vanilla models
classifiers = {
    'knn': KNeighborsClassifier,
    'lgr': LogisticRegression,
    'gnb': GaussianNB,
    'mnb': MultinomialNB,
    'dtc': DecisionTreeClassifier,
    'rfc': RandomForestClassifier,
    'gbc': GradientBoostingClassifier,
    'lsvc': LinearSVC
}

default_parameters = {
    'knn': {'n_neighbors':9},
    'lgr': {'solver':'lbfgs'},
    'gnb': {},
    'mnb': {},
    'dtc': {},
    'rfc': {'n_estimators':100},
    'gbc': {},
    'lsvc': {}
}

In [None]:
results = moda.cross_val_models(classifiers, X_train, y_train, params=default_parameters, verbose = True)

Model: knn Metric: roc_auc 0.9218877394959154
Model: lgr Metric: roc_auc 0.8424542994211042
Model: gnb Metric: roc_auc 0.8557519940812671
Model: mnb Metric: roc_auc 0.7812905471666495
Model: dtc Metric: roc_auc 0.8898225178385308
Model: rfc Metric: roc_auc 0.9838088265929693
Model: gbc Metric: roc_auc 0.9841299172779582




Model: lsvc Metric: roc_auc 0.6331180719164315




In [6]:
## Determine how much the classifier is over-fitting by comparing test auc with
## training cross-validation from above
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)

y_pred=gbc.predict(X_test)
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
#print('Train: ', roc_auc_score(y_train, gbc.predict(X_train)))
#print('Holdout: ', roc_auc_score(y_hold, gbc.predict(X_hold)))
#print('Test:', roc_auc_score(y_test, gbc.predict(X_test)))

Accuracy: 0.9003838833510759


In [7]:

## Determine how much the classifier is over-fitting by comparing test auc with
## training cross-validation from above
rfc = RandomForestClassifier(n_estimators=100,random_state=42)
rfc.fit(X_train, y_train)
#print('Train: ', roc_auc_score(y_train, rfc.predict(X_train)))
#print('Holdout: ', roc_auc_score(y_hold, rfc.predict(X_hold)))
#print('Test:', roc_auc_score(y_test, rfc.predict(X_test)))

#clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
#clf.fit(X_train,y_train)

y_pred=rfc.predict(X_test)

from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy for RF:",metrics.accuracy_score(y_test, y_pred))

Accuracy for RF: 0.9191909416432789


In [8]:
lgr = LogisticRegression(solver = 'liblinear', penalty = 'l1', random_state=42)
lgr.fit(X_train, y_train)
y_pred=lgr.predict(X_test)

from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.796445046614407




In [9]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=10)
classifier.fit(X_train, y_train)
y_pred=classifier.predict(X_test)
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy for KNN:",metrics.accuracy_score(y_test, y_pred))

Accuracy for KNN: 0.795993419142553


In [10]:
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
#model.fit(features,label)
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy for Naive Bayes:",metrics.accuracy_score(y_test, y_pred))

Accuracy for Naive Bayes: 0.6541178747701539


In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
dt = DecisionTreeClassifier()

# Train Decision Tree Classifer
dt.fit(X_train,y_train)
y_pred=dt.predict(X_test)
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy for DT:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.908228871554055


In [None]:
## Determine how much the classifier is over-fitting by comparing test auc with
## training cross-validation from above
lgr = LogisticRegression(solver = 'liblinear', penalty = 'l1', random_state=42)
lgr.fit(X_train, y_train)
print('Train: ', roc_auc_score(y_train, lgr.predict(X_train)))
print('Holdout: ', roc_auc_score(y_hold, lgr.predict(X_hold)))
print('Test:    ', roc_auc_score(y_test, lgr.predict(X_test)))

Train:  0.8929929872233793
Holdout:  0.8955387211997012
Test:     0.7533345944992023




In the example below, I eliminate the values

In [None]:
lgr_coefs = pd.Series(lgr.coef_.ravel())[
(pd.Series(lgr.coef_.ravel())>1e-2)\
| (pd.Series(lgr.coef_.ravel())<-1e-2)]
x_red = lgr_coefs

In [None]:
lgr = LogisticRegression(solver = 'liblinear', penalty = 'l1', random_state=42)
lgr.fit(X_train.iloc[:,x_red], y_train)
print('Train: ', roc_auc_score(y_train, lgr.predict(X_train.iloc[:,x_red])))
print('Holdout: ', roc_auc_score(y_hold, lgr.predict(X_hold.iloc[:,x_red])))
print('Test:    ', roc_auc_score(y_test, lgr.predict(X_test.iloc[:,x_red])))

Train:  0.5267994924465512
Holdout:  0.5269865359201731
Test:     0.5130821315889259


In [None]:
best_k, best_score = moda.iterate_k_for_KNN(X_train, y_train, 80,81)

n_neighbors: 80 roc_auc 0.9454374383181573


## Grid-Searching Logistic Regression:


In [None]:
# Parameter search for a logistic regression using an l1 penalty
param_grid = {'solver':['liblinear','saga'], 
             'C':np.linspace(1e-3,1e3,12)
             }
grid = GridSearchCV(LogisticRegression(penalty='l1', random_state=42),param_grid,
                    scoring='roc_auc', cv=5).fit(X_train,y_train)







In [None]:
# Cross-validation AUC score improves from the initial multi-model exercise
grid.best_score_

0.9692200422429181

In [None]:
grid.best_params_

{'C': 545.455, 'solver': 'liblinear'}

In [None]:
lgr_optimized = grid.best_estimator_

In [None]:
# Despite improvements in the tuning score, the performance gains on the test set are marginal
roc_auc_score(y_test, lgr_optimized.predict(X_test))

0.7535152377295676

In [None]:
grid.best_estimator_.coef_

array([[ 3.44108227e-02, -1.94397201e-02, -1.35032449e-02,
         2.04231191e-05, -4.66065391e-05,  1.18592348e-06,
         1.10675768e-02,  5.40989102e-02, -5.64964484e-10,
        -9.83319671e-06,  4.08277437e-02,  1.61653126e-01,
        -1.37131493e-04, -4.28209509e-05,  1.28308153e-06,
        -2.39781690e-06, -5.23263140e-02, -1.19771758e-11,
         4.27559195e-12, -2.73774349e-03, -1.56992644e+00,
        -9.65802305e+00, -3.12065222e-01, -1.00788712e-03,
         1.41033932e-02,  1.14097754e+00, -1.73934138e-06,
        -7.84067021e-02,  1.27175491e+00,  6.96032417e-03,
         1.60333948e-02,  2.55658056e-01,  2.35714547e-01,
         1.34310684e+00,  3.89252015e-01, -2.89861001e-02,
         4.20569386e-03, -2.59115619e-01, -1.21370379e+01]])

In [None]:
# Parameter grid search using an l2 penalty. This version has more solvers available.
param_grid_2 = {'solver':['liblinear','saga', 'sag', 'lbfgs', 'newton-cg'], 
             'C':np.linspace(1e-3,1e3,12)
             }
grid_2 = GridSearchCV(LogisticRegression(penalty='l2', random_state=42),param_grid_2,
                    scoring='roc_auc', cv=5).fit(X_train,y_train)

































In [None]:
grid_2.best_score_

0.8978438674615145