# Loading data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Read the UNSW-NB15 training CSV from the notebook's "../input/" directory into `df`.
TRAINING_CSV = '../input/unsw-nb15/UNSW_NB15_training-set.csv'
df = pd.read_csv(TRAINING_CSV)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data cleaning

In [None]:
# Work on a copy so the frame loaded into `df` is left untouched.
features_raw = df.copy()

# `label` is the prediction target; `id`, `attack_cat` and `label` itself are
# removed from the feature matrix.
Y = features_raw['label']
X = features_raw.drop(columns=['id', 'attack_cat', 'label'])
print(X.columns)

# One-hot encode every object-dtype column so the feature matrix is fully numeric.
string_fields = X.select_dtypes('object').columns.values
X = pd.get_dummies(X, columns=string_fields)
print(X.columns)

# Machine Learning Analysis

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=1,
)
print(f"Train size = {len(X_train)} Test_size = {len(X_test)}")

In [None]:
from xgboost import XGBClassifier, plot_importance, plot_tree
from sklearn.metrics import roc_auc_score, average_precision_score

# Build and fit the gradient-boosted classifier on the training split.
# The keyword is `use_label_encoder` (the original spelled it
# `user_label_encoder`, which XGBoost does not recognise, so the intended
# setting was never applied).
# NOTE(review): `use_label_encoder` is deprecated in newer XGBoost releases;
# drop the argument entirely if the installed version warns about it.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

In [None]:
# 6.3 Extract the AUPRC
# Reuse the classifier fitted in the previous cell instead of training it a
# second time on the same data just to obtain predictions.
probabilities = xgb_model.predict_proba(X_test)

# Column 1 holds the predicted probability of the second class
# (label 1 when the labels are 0/1).
print('AUPRC = {}'.format(average_precision_score(y_test, probabilities[:, 1])))

In [None]:
# 6.4.1 Import the matplotlib plotting interfaces used by the feature-importance
# plots (importance_type ‘gain’, ‘weight’, ‘cover’) in the next cell
import matplotlib.pylab as plt
from matplotlib import pyplot

In [None]:
# 6.4.2 Plot the relative importance of fields through plot_importance, using importance_type
#‘gain’, ‘weight’, ‘cover’
# One stacked panel per importance metric, each showing the ten top-ranked features.
importance_types = ['weight', 'cover', 'gain']
fig, ax = plt.subplots(3, 1, figsize=(10, 20))

for panel, metric in zip(ax, importance_types):
    plot_importance(
        xgb_model,
        ax=panel,
        max_num_features=10,
        importance_type=metric,
        xlabel=metric,
    )

In [None]:
# 6.6 Show the classifier visually
# Size the figure explicitly *before* drawing. Assigning
# plt.rcParams['figure.figsize'] after plot_tree() has no effect on the figure
# that was already created and silently changes the default size of every
# later figure in the notebook.
TREE_FIGSIZE = (30, 15)  # inches; enlarge if the node text is unreadable
tree_fig, tree_ax = plt.subplots(figsize=TREE_FIGSIZE)

# Draw the first boosted tree (index 0) onto the prepared axes.
plot_tree(xgb_model, num_trees=0, ax=tree_ax)
plt.show()

# Statistical Analysis

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize every feature to zero mean / unit variance so that columns with
# large raw magnitudes do not dominate the principal components.
x = StandardScaler().fit_transform(X)

# Project the *standardized* features onto two components. The original fitted
# PCA on the unscaled `X`, leaving the scaled array `x` unused.
pca = PCA(n_components=2)
pca_x = pca.fit_transform(x)

principalDf = pd.DataFrame(data=pca_x, columns=['principal component 1', 'principal component 2'])

# principalDf has a fresh 0..n-1 index; reset Y's index so the labels are
# attached row-by-row (by position) rather than by whatever index Y carries.
finalDf = pd.concat([principalDf, Y.reset_index(drop=True)], axis=1)

In [None]:
import matplotlib.pyplot as plt 

# Scatter the two principal components, one colour per label value.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = [0, 1]
colors = ['r', 'b']
for target, color in zip(targets, colors):
    # Boolean mask selecting the rows whose label equals this target.
    indicesToKeep = finalDf.label == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
               finalDf.loc[indicesToKeep, 'principal component 2'],
               c=color,
               s=50)

# Decorate once, after all series are drawn. A bare ax.grid() toggles the grid,
# so calling it on every loop iteration switched it on and then off again;
# request it explicitly instead.
ax.legend(targets)
ax.grid(True)

In [None]:
# Report the variance explained by each of the fitted principal components.
print(f"Independant parameters = {pca.explained_variance_}")

In [None]:
# 8. Use Linear Regression to evaluate the contribution of these independent parameters to the characterization as attack
from sklearn.linear_model import LinearRegression
import matplotlib as mpl


# Fit ordinary least squares of the label on the two principal components and
# report the fit quality and the fitted parameters.
model = LinearRegression().fit(pca_x, Y)
r_squared = model.score(pca_x, Y)
print('coefficient of determination:', r_squared)
print('intercept:', model.intercept_)
print('slope:', model.coef_)

# Raise Agg's path chunk size so very long paths can still be rendered.
mpl.rcParams['agg.path.chunksize'] = 10000

# Compute the predictions once and show them against each component separately.
# The samples are in arbitrary order, so they are drawn as markers: joining
# them with a line (as before) only produces an unreadable scribble.
predicted = model.predict(pca_x)
reg_fig, reg_axes = plt.subplots(1, pca_x.shape[1], figsize=(12, 5), sharey=True, squeeze=False)
for component_idx, reg_ax in enumerate(reg_axes[0]):
    reg_ax.plot(pca_x[:, component_idx], predicted,
                linestyle='none', marker='.', markersize=2, color='red')
    reg_ax.set_xlabel('principal component {}'.format(component_idx + 1))
    reg_ax.grid(True)
reg_axes[0, 0].set_ylabel('predicted label')
reg_fig.suptitle('Linear regression predictions vs. principal components')
plt.show()