In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [18]:
train = pd.read_csv('UNSW_NB15_training-set.csv')
test = pd.read_csv('UNSW_NB15_testing-set.csv')

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 175341 non-null  int64  
 1   dur                175341 non-null  float64
 2   proto              175341 non-null  object 
 3   service            175341 non-null  object 
 4   state              175341 non-null  object 
 5   spkts              175341 non-null  int64  
 6   dpkts              175341 non-null  int64  
 7   sbytes             175341 non-null  int64  
 8   dbytes             175341 non-null  int64  
 9   rate               175341 non-null  float64
 10  sttl               175341 non-null  int64  
 11  dttl               175341 non-null  int64  
 12  sload              175341 non-null  float64
 13  dload              175341 non-null  float64
 14  sloss              175341 non-null  int64  
 15  dloss              175341 non-null  int64  
 16  si

In [20]:
train = train.drop(['attack_cat','id'],axis = 1)
test = test.drop(['attack_cat','id'],axis = 1)

In [21]:
y_train = train['label']
x_train = train.drop(['label'],axis = 1)
y_test = test['label']
x_test = test.drop(['label'],axis = 1)

In [22]:
num = x_train.select_dtypes(include = 'number')
cat = x_train.select_dtypes(include = 'object')

In [23]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
x_train[num.columns] = scaler.fit_transform(x_train[num.columns])
x_test[num.columns] = scaler.transform(x_test[num.columns])

In [24]:
# one hot encode
x_test = pd.get_dummies(x_test)
x_train = pd.get_dummies(x_train)

In [28]:
# align
x_train, x_test = x_train.align(x_test, join = 'inner', axis = 1)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve,auc
from xgboost import XGBClassifier

In [26]:
def train_and_evaluate_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(roc_auc_score(y_test, y_pred))

In [29]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=7, n_estimators=300)
train_and_evaluate_model(xgb, x_train, y_train, x_test, y_test)

              precision    recall  f1-score   support

           0       0.97      0.74      0.84     37000
           1       0.82      0.98      0.89     45332

    accuracy                           0.87     82332
   macro avg       0.90      0.86      0.87     82332
weighted avg       0.89      0.87      0.87     82332

[[27334  9666]
 [  821 44511]]
0.8603229649838668


In [30]:
# balanced RF
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
train_and_evaluate_model(brf, x_train, y_train, x_test, y_test)

              precision    recall  f1-score   support

           0       0.96      0.81      0.88     37000
           1       0.86      0.97      0.91     45332

    accuracy                           0.90     82332
   macro avg       0.91      0.89      0.90     82332
weighted avg       0.91      0.90      0.90     82332

[[29928  7072]
 [ 1280 44052]]
0.8903143701364826


In [32]:
# try pca
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [33]:
print(x_train_pca.shape)
print(x_test_pca.shape)

(175341, 1)
(82332, 1)


In [34]:
# try again
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
train_and_evaluate_model(brf, x_train_pca, y_train, x_test_pca, y_test)

              precision    recall  f1-score   support

           0       0.79      0.59      0.68     37000
           1       0.72      0.87      0.79     45332

    accuracy                           0.75     82332
   macro avg       0.76      0.73      0.73     82332
weighted avg       0.75      0.75      0.74     82332

[[21924 15076]
 [ 5766 39566]]
0.7326728115214836


In [35]:
# voting classifier
from sklearn.ensemble import VotingClassifier
xgb = XGBClassifier(learning_rate=0.1, max_depth=7, n_estimators=300)
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
voting_clf = VotingClassifier(estimators=[('xgb', xgb), ('brf', brf)], voting='soft')

In [36]:
voting_clf.fit(x_train, y_train)
y_pred = voting_clf.predict(x_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.77      0.86     37000
           1       0.84      0.98      0.90     45332

    accuracy                           0.89     82332
   macro avg       0.90      0.87      0.88     82332
weighted avg       0.90      0.89      0.88     82332

[[28455  8545]
 [  894 44438]]
0.8746664428921995


In [38]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=7, n_estimators=300)
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
voting_clf2 = VotingClassifier(estimators=[('xgb', xgb), ('brf', brf)], voting='hard')

In [39]:
voting_clf2.fit(x_train, y_train)
y_pred = voting_clf2.predict(x_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.82      0.88     37000
           1       0.87      0.97      0.91     45332

    accuracy                           0.90     82332
   macro avg       0.91      0.89      0.90     82332
weighted avg       0.91      0.90      0.90     82332

[[30337  6663]
 [ 1496 43836]]


In [40]:
print(roc_auc_score(y_test, y_pred))

0.893458974151068
