In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files live in the read-only "../input/" directory.
# Location of the UNSW-NB15 training CSV inside that directory.
TRAINING_CSV = '../input/unsw-nb15/UNSW_NB15_training-set.csv'

# Raw frame; later cells take copies of it rather than modifying it.
df = pd.read_csv(TRAINING_CSV)


Data Cleaning

In [None]:
# Work on a copy so the frame loaded from disk stays untouched.
X = df.copy()
# Keep the `label` column aside as the prediction target.
Y = X["label"]

# Display the working frame.
X




In [None]:
# Build the feature matrix from the raw frame rather than from X itself, so the
# cell can be re-run safely (dropping from X a second time raises a KeyError
# because the columns are already gone).
# `label` is already held in Y; `attack_cat` and `id` are excluded from the
# features as well.
X = df.drop(columns=["attack_cat", "id", "label"])
X

In [None]:
# Names of every object-dtype (string) column still present in the features.
string_fields = X.select_dtypes(include='object').columns.to_numpy()

# Replace each of those columns with one indicator column per distinct value.
X = pd.get_dummies(X, columns=string_fields)


Machine Learning Analysis

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=1)

# Report both sizes with a separator between them (the two values were
# previously printed run together on one line).
print("Train size = %d, Test size = %d" % (len(X_train), len(X_test)))


The split yields a training set of 61749 rows and a test set of 20583 rows.

Train the model

In [None]:
from xgboost import XGBClassifier, plot_importance, plot_tree
from sklearn.metrics import roc_auc_score

# Gradient-boosted tree classifier with log-loss as the evaluation metric.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Track the per-round log-loss on both splits while fitting. The learning-curve
# cell near the end of the notebook reads
# xgb_model.evals_result()['validation_0'] (train) and ['validation_1'] (test);
# those entries only exist when an eval_set with two datasets, in this order,
# is passed to fit(). No early stopping is configured, so the eval_set is used
# for monitoring only. verbose=False suppresses the per-round log lines.
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False,
)


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_curve, auc, confusion_matrix, classification_report, recall_score, roc_auc_score

# Hard class labels: used for the threshold-dependent metrics
# (accuracy, confusion matrix). predict() already returns class labels, so no
# extra rounding is needed.
y_pred = xgb_model.predict(X_test)
# Predicted probability of the positive class (label 1): used for the
# ranking metrics. ROC AUC and the precision-recall curve sweep a decision
# threshold, so they need continuous scores; feeding them 0/1 predictions
# collapses each curve to a single operating point.
y_score = xgb_model.predict_proba(X_test)[:, 1]

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
area = auc(recall, precision)
print('------------ Results for XGBClassifier ---------------')
print('matrice de confusion:',confusion_matrix(y_test, y_pred))
print('roc_auc_score:',roc_auc_score(y_test,y_score))
print("Area Under P-R Curve: ",area)

AUPRC = 98.05%

In [None]:
from matplotlib import pyplot
from xgboost import plot_importance

# Draw the fitted model's feature-importance chart and render it.
importance_ax = plot_importance(xgb_model)
pyplot.show()

In [None]:
# Print the booster's raw per-feature scores once for each importance type.
booster = xgb_model.get_booster()
for importance_type in ('weight', 'gain', 'cover'):
    scores = booster.get_score(importance_type=importance_type)
    print('%s: ' % importance_type, scores)


In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_tree

# Draw one tree of the fitted ensemble (no tree index is given, so plot_tree
# uses its default) and render the figure.
tree_ax = plot_tree(xgb_model)
plt.show()



The XGBoost model predicts attacks very well, with an error rate below 2%.
As the feature-importance plot shows, the `sttl` and `ct_dst_sport_ltm` variables are the most important to the model. The plotted decision tree confirms this result: it splits on the same discriminating variables.

Statistics Analysis

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise every feature before PCA. StandardScaler returns a new array and
# leaves X untouched, so no defensive copy is needed.
x = StandardScaler().fit_transform(X)

# Project onto the first two principal components. The seed makes the result
# reproducible if scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=1)
pca_x = pca.fit_transform(x)

# Reuse X's index so the column-wise concat with Y aligns row by row even if
# the frame's index is not the default 0..n-1 range.
principalDf = pd.DataFrame(
    data=pca_x,
    columns=['principal component 1', 'principal component 2'],
    index=X.index,
)
finalDf = pd.concat([principalDf, Y], axis=1)

In [None]:
# Scatter plot of the two principal components, coloured by class label.
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
targets = [0,1]
colors = ['r', 'b']
for target, color in zip(targets, colors):
    # Boolean mask selecting the rows of the current class.
    indicesToKeep = finalDf['label'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
               finalDf.loc[indicesToKeep, 'principal component 2'],
               c = color,
               s = 50)
# Legend and grid are set once, after every class has been drawn. A bare
# ax.grid() toggles grid visibility, so calling it on each of the two loop
# iterations switched the grid on and then off again.
ax.legend(targets)
ax.grid(True)

In [None]:
# explained_variance_ holds the variance captured by each fitted principal
# component, so the label names it accordingly.
print("Explained variance per principal component = "+str(pca.explained_variance_))

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model of the label on the two PCA components.
reg = LinearRegression()
reg.fit(pca_x, Y)

# In-sample predictions, then the residuals (prediction minus true label).
Y_pca = reg.predict(pca_x)
Y_pca - Y

Evaluating and reporting

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_curve, auc, confusion_matrix, classification_report, recall_score, roc_auc_score

# Hard class labels: used for the threshold-dependent metrics (accuracy,
# confusion matrix, classification report, recall). predict() already returns
# class labels, so no extra rounding is needed.
y_pred = xgb_model.predict(X_test)
# Predicted probability of the positive class (label 1): used for the
# ranking metrics. ROC AUC and the precision-recall curve sweep a decision
# threshold, so they need continuous scores; feeding them 0/1 predictions
# collapses each curve to a single operating point.
y_score = xgb_model.predict_proba(X_test)[:, 1]

# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
area = auc(recall, precision)
print('------------ Results for XGBClassifier ---------------')
print('matrice de confusion:',confusion_matrix(y_test, y_pred))
print('cr:', classification_report(y_test,y_pred))
print('recall_score:', recall_score(y_test,y_pred))
print('roc_auc_score:',roc_auc_score(y_test,y_score))


In [None]:
from matplotlib import pyplot

# Per-round evaluation history recorded while the model was fitted.
# NOTE: evals_result() only contains 'validation_0' / 'validation_1' when
# fit() was given an eval_set with two datasets; XGBoost names them in the
# order they were passed (here expected to be train first, then test).
results = xgb_model.evals_result()

# One log-loss curve per evaluation set.
for eval_name, split_label in (('validation_0', 'train'), ('validation_1', 'test')):
    pyplot.plot(results[eval_name]['logloss'], label=split_label)
pyplot.legend()
pyplot.show()



The plot shows learning curves for the training and test datasets: the x-axis is the number of boosting iterations (i.e. the number of trees added to the ensemble) and the y-axis is the model's log-loss. Each line shows the log-loss per iteration for one dataset.

From the learning curves, we can see that the model performs better (i.e. has a lower loss) on the training dataset (blue line) than on the test dataset (orange line), as we would generally expect.