In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _dirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Analysis goal

1. You will perform Principal Component Analysis, Linear Regression, as well as XGBoost learning.
For each of these analysis algorithms, list the information you expect to extract

- PCA will be used to project our dataset onto principal components (i.e. a new basis); we will use it to reduce the dimensionality of our data and to find the best way to represent it.
- Linear Regression is used to find the best fit for a linear combination of our data vectors (an n-dimensional line, easy to explain). This could be used in the PC space.
- XGBoost is a gradient-boosted tree algorithm that tries to infer rules to classify our data. We expect it to give us insight into why a data point should be considered an anomaly or not, through a decision tree.


# Loading data

2. Rebuild the analysis environment from lab session 1
2.1. In File > Add or Include Data, search for “UNSW_NB15” dataset and include it
2.2. In ‘Data > input > unsw-nb15‘, get the exact path of CSV file 'UNSW_NB15_training-set.csv'
and load it as training_set using Pandas.

In [None]:
# Load the UNSW-NB15 training split from the attached Kaggle dataset.
TRAINING_SET_PATH = "/kaggle/input/unsw-nb15/UNSW_NB15_training-set.csv"
df = pd.read_csv(TRAINING_SET_PATH)

# Data Cleaning

3. Create a new DataFrame X you will use for cleaning. Store the labels in dataframe Y

In [None]:
# Work on a copy so the raw frame `df` stays untouched, and keep the target aside.
X = df.copy()
Y = X["label"]

4. Remove labels bound to attacks as well as the ‘id’ field to avoid biased learning

In [None]:
# Columns removed before learning: the two attack labels and the row identifier.
col = ["attack_cat", "label", "id"]
X.drop(columns=col, inplace=True)

5. List string fields, and perform one-hot encoding.

In [None]:
# Names of the object-typed (string) columns, as a NumPy array.
string_fields = X.select_dtypes(include='object').columns.to_numpy()

In [None]:
# Show the names of the string-typed columns selected above.
print(string_fields)

In [None]:
# One-hot encode every string-typed column; the other columns pass through unchanged.
X = pd.get_dummies(data=X, columns=string_fields)

# Machine Learning Analysis

6. Through train_test_split on X and Y, and the XGBClassifier
6.1. Extract train and test dataframes for X and Y

In [None]:
from sklearn.model_selection import train_test_split
SEED = 1 # fixed seed, passed as random_state to train_test_split so the split is reproducible

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split stable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=SEED,
)
print(f"Train size = {len(X_train)}, Test_size = {len(X_test)}")

6.2. Train the model

In [None]:
from xgboost import XGBClassifier, plot_importance, plot_tree, to_graphviz
from sklearn.metrics import roc_auc_score

# Binary logistic objective, evaluated with log-loss, fitted on the training split.
xgb_params = dict(
    use_label_encoder=False,
    eval_metric='logloss',
    objective='binary:logistic',
)
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, Y_train)

6.3. Extract the AUPRC

In [None]:
# Ranking metrics need continuous scores: hard 0/1 predictions from predict()
# reduce the ROC curve to a single operating point and throw away the ranking.
from sklearn.metrics import average_precision_score

# Column 1 of predict_proba is the probability of the positive class (label == 1).
attack_scores = xgb_model.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(Y_test, attack_scores)
# Average precision summarises the precision-recall curve (the AUPRC asked for here).
auprc = average_precision_score(Y_test, attack_scores)

print(f"Roc Auc score = {roc_auc}")
print(f"AUPRC (average precision) = {auprc}")

6.4. Plot the relative importance of fields through plot_importance, using importance_type
‘gain’, ‘weight’, ‘cover’

In [None]:
# One figure per importance type. Each gets its own title and x-axis label,
# otherwise the three plots share the same default captions and cannot be told apart.
for importance in ['gain', 'weight', 'cover']:
    plot_importance(
        xgb_model,
        importance_type=importance,
        max_num_features=15,
        title=f"Feature importance ({importance})",
        xlabel=importance,
    )

6.5. What do you conclude wrt. the security status of the target system?

We find that the most distinctive feature is sttl (according to the gain), the most weighted feature is sbytes (it may be used to discern the types of attacks in each decision tree?), and the top 3 covered features are protocols (sun-nd, mobile and swipe), which may mean they are frequently used in attacks because they are vulnerable. Reminder: those protocols don't appear in the normal traffic.


6.6. Show the classifier visually

In [None]:
import matplotlib.pyplot as plt

# Very large canvas (100 x 50 inches at 100 dpi) so that node labels stay legible.
fig, ax = plt.subplots(1, dpi=100)
fig.set_size_inches(100, 50)
# NOTE(review): plot_tree draws a single tree of the ensemble (the first one by
# default), not the whole classifier — confirm this is the intended view.
plot_tree(xgb_model, ax=ax)
# Save through the figure handle; the former bare `plt.plot()` call drew nothing.
fig.savefig("tree.png")
# Available in the output

6.7. For XGBClassifier

6.7.1.List the observations you make

We can see the decision process used by the tree. For instance, sttl is used first to determine whether there is an attack or not. Then the left part of the tree checks protocols; as mentioned, this is easy, because there is no normal traffic with those protocols (I expect it to expand to all the other protocols in the dataset that appear in attacks and not in normal traffic). Afterwards, we can see how the tree splits the dataset using the features synack, smean, ct_dst_src_ltm ...



6.7.2.Which security-related information do you draw?

Any non-authorized protocol should be banned in the firewall, and if the traffic is too high (right part of the graph), it might be an attack (potential DDoS).


6.7.3.Which recommendations can you emit based on these observations?

Check the firewall rules for unwanted protocols, and implement an anti-DDoS system.

# Statistics analysis

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Standardise the features, then project them onto the first 3 principal
# components (the third one can be discarded for the 2-D views).
# fit_transform returns a new array, so no defensive copy of X is needed.
x = StandardScaler().fit_transform(X)

# Seed the decomposition: depending on the scikit-learn version, the default
# solver choice for a matrix of this size can be the randomized SVD, and the
# hard-coded PC1 threshold used further down only makes sense if the
# projection is the same on every run.
pca = PCA(n_components=3, random_state=SEED)
pca_x = pca.fit_transform(x)

# Reuse X's index so the labels line up row-for-row in the concat below
# instead of relying on both frames happening to share a default index.
principalDf = pd.DataFrame(data=pca_x, columns=['PC1', "PC2", "PC3"], index=X.index)
finalDf = pd.concat([principalDf, Y], axis=1)

In [None]:
# Visualize point first in 3D before a 2D projection
import plotly.express as px
# Interactive 3-D view of the three principal components, coloured by label.
scatter_kwargs = dict(x='PC1', y='PC2', z='PC3', color='label')
fig = px.scatter_3d(finalDf, **scatter_kwargs)
fig.show()

In [None]:
# 2-D projection on the first two principal components, one colour per label.
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot()
ax.set_xlabel('PC 1', fontsize=15)
ax.set_ylabel('PC 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = [0,1]
colors = ['r', 'b']

for target, color in zip(targets, colors):
    indicesToKeep = finalDf.label == target
    ax.scatter(finalDf.loc[indicesToKeep, 'PC1'],
               finalDf.loc[indicesToKeep, 'PC2'],
               c = color,
               s = 50,
               label = target)

# Legend and grid are set once, after every series is drawn. A bare ax.grid()
# toggles visibility, so calling it once per class (twice) switched it back off.
ax.legend()
ax.grid(True)

If we use only 2 PCs, we will lose a LOT of information, as PC3 makes it easy to identify whether it is an attack or not ...

In [None]:
# Variance carried by each of the retained principal components.
explained = pca.explained_variance_
print("Independant parameters = " + str(explained))

In [None]:
from sklearn.linear_model import LinearRegression
# Fit PC2 as a linear function of PC1 over all attack rows (label == 1),
# then report the fit and its R^2 on that same subset.
attacks = finalDf.loc[finalDf.label == 1]
attack_pc1 = attacks[['PC1']]
attack_pc2 = attacks['PC2']

lr = LinearRegression()
lr.fit(attack_pc1, attack_pc2)
score = lr.score(attack_pc1, attack_pc2)

print("Intercept for PC1-PC2 for attacks: " + str(lr.intercept_))
print("Coef for PC1-PC2 for attacks: " + str(lr.coef_))
print("Score: " + str(score))

Clearly, this is not convincing for a linear regression ! We can split manually the linear regression in two

In [None]:
from sklearn.linear_model import LinearRegression
# Same PC2 ~ PC1 regression, restricted to attack rows with PC1 < 2 ("part 1").
is_attack_low_pc1 = (finalDf.label == 1) & (finalDf['PC1']<2)
attacks_part1 = finalDf.loc[is_attack_low_pc1]

lr = LinearRegression()
lr.fit(attacks_part1[['PC1']], attacks_part1['PC2'])
score = lr.score(attacks_part1[['PC1']], attacks_part1['PC2'])

print("----- PC 1 < 2 ----")
print("Intercept for PC1-PC2 for attacks: " + str(lr.intercept_))
print("Coef for PC1-PC2 for attacks: " + str(lr.coef_))
print("Score: " + str(score))

# Slope of the part-1 attack regression, reused in later cells.
c1s = lr.coef_

In [None]:
from sklearn.linear_model import LinearRegression
# PC2 ~ PC1 regression restricted to attack rows with PC1 > 2 ("part 2").
# The subset is stored as attacks_part2: it used to be written to
# attacks_part1, silently overwriting the PC1 < 2 subset.
lr = LinearRegression()
attacks_part2 = finalDf.loc[(finalDf.label == 1) & (finalDf['PC1']>2)]
lr.fit(attacks_part2[['PC1']], attacks_part2['PC2'])
score= lr.score(attacks_part2[['PC1']], attacks_part2['PC2'])
print("----- PC 1 > 2 ----")
print("Intercept for PC1-PC2 for attacks: " + str(lr.intercept_))
print("Coef for PC1-PC2 for attacks: " + str(lr.coef_))
print("Score: " + str(score))
# Slope of the part-2 attack regression, reused in later cells.
c2s = lr.coef_

Ah, here we have two of them. Let's try first 3D for part 2.

In [None]:
from sklearn.linear_model import LinearRegression
# Regress PC3 on (PC1, PC2) for attack rows with PC1 > 2 ("part 2").
is_attack_high_pc1 = (finalDf.label == 1) & (finalDf['PC1']>2)
attacks_part2 = finalDf.loc[is_attack_high_pc1]

lr = LinearRegression()
lr.fit(attacks_part2[['PC1', 'PC2']], attacks_part2['PC3'])
score = lr.score(attacks_part2[['PC1', 'PC2']], attacks_part2['PC3'])

print("----- PC 1 > 2 ----")
print("Intercept for PC1-PC2 for attacks: " + str(lr.intercept_))
print("Coef for PC1-PC2 for attacks: " + str(lr.coef_))
print("Score: " + str(score))

# Coefficient pair of the attack fit, compared with the normal-traffic fit later on.
coefs_attacks_part2 = lr.coef_

Let's compare with normal traffic.

In [None]:
from sklearn.linear_model import LinearRegression
# Regress PC3 on (PC1, PC2) for normal rows (label == 0) with PC1 > 2.
lr = LinearRegression()
normal_part2 = finalDf.loc[(finalDf.label == 0) & (finalDf['PC1']>2)]
lr.fit(normal_part2[['PC1', 'PC2']], normal_part2['PC3'])
# Score the model on the data it was fitted on. It was previously scored on
# the attack subset, so the printed R^2 did not describe this fit.
score= lr.score(normal_part2[['PC1', 'PC2']], normal_part2['PC3'])
print("----- PC 1 > 2 ----")
print("Intercept for PC1-PC2 for normal: " + str(lr.intercept_))
print("Coef for PC1-PC2 for normal: " + str(lr.coef_))
print("Score: " + str(score))

# Coefficient pair of the normal-traffic fit, compared with the attack fit later on.
coefs_normal_part2 = lr.coef_

In [None]:
from sklearn.linear_model import LinearRegression
# Regress PC3 on (PC1, PC2) for normal rows (label == 0) with PC1 > 2.
# NOTE(review): despite the `normal_part1` name, this cell applies the same
# PC1 > 2 filter as the preceding normal-traffic cell and overwrites
# coefs_normal_part2 with the same kind of result — consider deleting it.
lr = LinearRegression()
normal_part1 = finalDf.loc[(finalDf.label == 0) & (finalDf['PC1']>2)]
lr.fit(normal_part1[['PC1', 'PC2']], normal_part1['PC3'])
# Score on the fitted subset (it was previously scored on the attack subset).
score= lr.score(normal_part1[['PC1', 'PC2']], normal_part1['PC3'])
print("----- PC 1 > 2 ----")
print("Intercept for PC1-PC2 for normal: " + str(lr.intercept_))
print("Coef for PC1-PC2 for normal: " + str(lr.coef_))
print("Score: " + str(score))

coefs_normal_part2 = lr.coef_

In [None]:
# Angle (in degrees) between the attack and normal regression coefficient
# vectors, used as a dissimilarity metric. For part 2, using 3 PC.
import math
vector_1 = coefs_attacks_part2
vector_2 = coefs_normal_part2
unit_vector_1 = vector_1 / np.linalg.norm(vector_1)
unit_vector_2 = vector_2 / np.linalg.norm(vector_2)
# Floating-point rounding can push the cosine marginally outside [-1, 1],
# which would make arccos return NaN for (anti-)parallel vectors; clip first.
dot_product = np.clip(np.dot(unit_vector_1, unit_vector_2), -1.0, 1.0)
angle = np.arccos(dot_product)

print(math.degrees(angle))

Ok, now keeping on 2D PCA, with part 1 and 2

In [None]:
from sklearn.linear_model import LinearRegression
# PC2 ~ PC1 regression for normal rows (label == 0) with PC1 > 2.
lr = LinearRegression()
normal_part2 = finalDf.loc[(finalDf.label == 0) & (finalDf['PC1']>2)]
lr.fit(normal_part2[['PC1']], normal_part2['PC2'])
# Score on the fitted subset (it was previously scored on the attack subset).
score= lr.score(normal_part2[['PC1']], normal_part2['PC2'])
print("----- PC 1 > 2 ----")
print("Intercept for PC1-PC2 for normal: " + str(lr.intercept_))
# This coefficient belongs to the normal-traffic fit; the label used to say "attacks".
print("Coef for PC1-PC2 for normal: " + str(lr.coef_))
print("Score: " + str(score))

# Slope of the part-2 normal regression, compared with c2s later on.
cn2 = lr.coef_

In [None]:
from sklearn.linear_model import LinearRegression
# PC2 ~ PC1 regression for normal rows (label == 0) with PC1 < 2.
lr = LinearRegression()
normal_part1 = finalDf.loc[(finalDf.label == 0) & (finalDf['PC1']<2)]
lr.fit(normal_part1[['PC1']], normal_part1['PC2'])
score= lr.score(normal_part1[['PC1']], normal_part1['PC2'])
print("----- PC 1 < 2 ----")
print("Intercept for PC1-PC2 for normal: " + str(lr.intercept_))
# This coefficient belongs to the normal-traffic fit; the label used to say "attacks".
print("Coef for PC1-PC2 for normal: " + str(lr.coef_))
print("Score: " + str(score))

# Slope of the part-1 normal regression, compared with c1s later on.
cn1 = lr.coef_

# The score is too low on this one !

In [None]:
# Difference between the slope angles of the normal and attack regression
# lines, in degrees. The slopes are 1-element coefficient arrays, so the scalar
# is extracted explicitly: math.degrees() on an array relies on an implicit
# array-to-scalar conversion that NumPy has deprecated.

# For part 1, using 2 PC
print("Diff angle between normal and attack for PC1 <2")
print(np.degrees(np.arctan(cn1[0]) - np.arctan(c1s[0])))


# For part 2, using 2 PC
print("Diff angle between normal and attack for PC1 >2")
print(np.degrees(np.arctan(cn2[0]) - np.arctan(c2s[0])))

The issue is that the data in part 1 are far too mixed: we cannot really use the regression to differentiate between the two. We will now work on part 2.

We now have PC2 = PC1 * 1.51, let's reverse to the normal space

In [None]:
# Rows of pca.components_ are the principal axes expressed in the
# (standardised) feature space.
vpc1 = pca.components_[0]
vpc2 = pca.components_[1]

# A regression line PC2 = c * PC1 has direction (1, c) in PC space, i.e.
# vpc1 + c * vpc2 in feature space. reg_p1 previously used vpc1 twice, which
# only rescaled the first axis and ignored the second one.
reg_p1 = vpc1 + c1s * vpc2
reg_p2 = vpc1 + c2s * vpc2

In [None]:
def find_max_index(i, array):
    """Locate the i-th largest value of a 1-D array.

    Parameters
    ----------
    i : int
        Zero-based rank, where 0 is the maximum, 1 the second largest, etc.
    array : array-like
        One-dimensional numeric values.

    Returns
    -------
    tuple of ndarray
        The result of ``np.where`` for the selected value, so callers can take
        ``[0][0]`` for the first matching position (ties return several).

    Notes
    -----
    Values are ranked by signed value, not magnitude. The previous version
    sorted in ascending order and therefore returned the i-th *smallest* value.
    """
    ranked = sorted(array, reverse=True)
    return np.where(array == ranked[i])

# reg_p1 is built from the first regression slope (c1s) and reg_p2 from the
# second one (c2s); the two headings were previously attached to the wrong vector.
print("Top 5 components for first regression")
for i in range(5):
    index = find_max_index(i, reg_p1)[0][0]
    print(X.columns[index])

print("****************")

print("Top 5 components for second regression")
for i in range(5):
    index = find_max_index(i, reg_p2)[0][0]
    print(X.columns[index])

9. For statistical analysis

9.1.1.List the observations you make
We can clearly separate attacks from normal traffic in one part of the PCA space, but half of the attacks seem to be mixed with normal traffic (this can be explored along PC3). We used linear regression in the PC space, with two separate regressions, and transformed the result back to the original space. While normal traffic isn't really linear, attack traffic is about half linear.

9.1.2.Which security-related information do you draw?

We found that a lot of attack traffic is similar to normal traffic, and some are not. We can identify the top features (above) for both linear regression that participate the most to being an attack or not.

9.1.3.Which recommendations can you emit based on these observations?

Check the dwin, proto, swin, state_FIN and dttl values in the traffic, they may reveal potential scans in the traffic (nmap, topology scanner).


10) Building graph and metrics of the model.

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve built from the predicted probability of class 1.
Y_predicted = xgb_model.predict_proba(X_test)
Y_predicted_2 = list(Y_predicted[:, 1])
pr_points = precision_recall_curve(Y_test, Y_predicted_2)
prec, recall = pr_points[0], pr_points[1]
pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot()

In [None]:
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_curve

# ROC curve from the class-1 probabilities in Y_predicted_2.
roc_points = roc_curve(Y_test, Y_predicted_2)
fpr, tpr = roc_points[0], roc_points[1]
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw three diagnostic panels for an estimator: score vs. training-set
    size, fit time vs. training-set size, and score vs. fit time.

    Parameters
    ----------
    estimator : estimator instance
        Passed to ``learning_curve``, which fits it repeatedly.

    title : str
        Title of the first panel.

    X : array-like of shape (n_samples, n_features)
        Training samples.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target associated with ``X``.

    axes : array-like of shape (3,), default=None
        Axes to draw on. When omitted, a 1x3 figure of size (20, 5) is created.

    ylim : tuple of shape (2,), default=None
        (ymin, ymax) applied to the first panel when given.

    cv : int, cross-validation generator or an iterable, default=None
        Cross-validation strategy, forwarded unchanged to ``learning_curve``.

    n_jobs : int or None, default=None
        Parallelism setting, forwarded unchanged to ``learning_curve``.

    train_sizes : array-like of shape (n_ticks,)
        Training-set sizes to evaluate, forwarded unchanged to
        ``learning_curve`` (default: np.linspace(0.1, 1.0, 5)).

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can keep working on
        the current figure.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))
    score_ax, time_ax, perf_ax = axes[0], axes[1], axes[2]

    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    curve = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                           train_sizes=train_sizes,
                           return_times=True)
    train_sizes, train_scores, test_scores, fit_times = curve[0], curve[1], curve[2], curve[3]

    def _mean_std(per_fold):
        # Aggregate across the cross-validation folds (axis 1).
        return np.mean(per_fold, axis=1), np.std(per_fold, axis=1)

    train_scores_mean, train_scores_std = _mean_std(train_scores)
    test_scores_mean, test_scores_std = _mean_std(test_scores)
    fit_times_mean, fit_times_std = _mean_std(fit_times)

    # Panel 1: learning curve, with +/- one standard deviation bands.
    score_ax.grid()
    for mean, std, colour in ((train_scores_mean, train_scores_std, "r"),
                              (test_scores_mean, test_scores_std, "g")):
        score_ax.fill_between(train_sizes, mean - std, mean + std,
                              alpha=0.1, color=colour)
    score_ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
                  label="Training score")
    score_ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
                  label="Cross-validation score")
    score_ax.legend(loc="best")

    # Panel 2: number of training samples vs. fit time.
    time_ax.grid()
    time_ax.plot(train_sizes, fit_times_mean, 'o-')
    time_ax.fill_between(train_sizes,
                         fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std,
                         alpha=0.1)
    time_ax.set_xlabel("Training examples")
    time_ax.set_ylabel("fit_times")
    time_ax.set_title("Scalability of the model")

    # Panel 3: fit time vs. cross-validation score.
    perf_ax.grid()
    perf_ax.plot(fit_times_mean, test_scores_mean, 'o-')
    perf_ax.fill_between(fit_times_mean,
                         test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std,
                         alpha=0.1)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")

    return plt

In [None]:
# Learning curve of the XGBoost classifier on the training split; cv, n_jobs and
# train_sizes are left at the defaults of plot_learning_curve.
plot_learning_curve(xgb_model, "Learning curve for XGBoost", X_train, Y_train)

11. Which information do you draw?
The model looks really good on the PRC and ROC curves, as both have an area of almost 1 on the validation dataset. This means it produces few false positives and detects almost all attacks.
Concerning the learning curve, while there is a little overfitting (the training and cross-validated scores differ by about 1.5%), the model generalizes quite well, as the margins stay stable during the learning process, and so does the performance.

12. Which recommendation do you emit
We can use our classifier only if we are sure the dataset is really representing the real life, which means we should try it live :).