# Normal Random Forest on CICIDS2017

In [1]:
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import time

from numpy import array

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import completeness_score, homogeneity_score, v_measure_score

from sklearn.model_selection import train_test_split

## Reading data

In [2]:
# Directory where the CICIDS2017 ML CSV files are stored
path = '../dataset/CICIDS2017'
# Sort the matches: glob returns files in arbitrary, filesystem-dependent
# order, and the seeded train/test split further down depends on row order.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Concatenate the CSV files (8 expected) into one frame. ignore_index gives
# every row a unique label instead of repeating each file's own 0..n-1 index.
dataset = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

Inspect the Dataset

In [3]:
# Peek at the first 5 records in the dataset
dataset.head(n=5)



In [4]:
# Summary statistics, laid out with one row per feature
dataset.describe().T





In [5]:
# Check that all the feature values are numerical;
# any non-numeric column would have to be encoded before modelling
dataset.dtypes



## Preprocessing data

Remove NaN/Null/Inf Values

In [6]:
# True if any cell in the frame is NaN/None
# (with pandas defaults, infinite values are not counted as missing)
dataset.isna().any().any()



In [7]:
# Treat +/-inf as missing, then drop every row that has a missing value
dataset = dataset.replace([np.inf, -np.inf], np.nan).dropna()
# Sanity check: this should now evaluate to False
dataset.isna().any().any()



In [8]:
# Rename the ' Label' column (note the leading space) to 'Label'
dataset = dataset.rename({' Label': 'Label'}, axis='columns')

In [9]:
# Remove the three attack classes listed below from the dataset
# (presumably because they have too few samples to model - TODO confirm).
# Filter on the Label column directly rather than replacing these strings
# with NaN across every column and relying on dropna() to remove the rows.
excluded_labels = ['Heartbleed', 'Web Attack � Sql Injection', 'Infiltration']
# .copy() so that later column assignments act on an independent frame
dataset = dataset.loc[~dataset['Label'].isin(excluded_labels)].copy()
dataset['Label'].value_counts()



In [10]:
# Shorten the two remaining web-attack labels
web_attack_renames = {
    'Web Attack � Brute Force': 'Brute Force',
    'Web Attack � XSS': 'XSS',
}
dataset['Label'] = dataset['Label'].replace(web_attack_renames)

In [11]:
# Binary target column: 0 where the label is 'BENIGN', 1 for everything else
is_attack = dataset['Label'] != 'BENIGN'
dataset['Attack'] = is_attack.astype(int)
dataset['Attack'].value_counts()



In [12]:
# Proposed groupings: fine-grained label -> attack category
attack_group = {'BENIGN': 'benign',
                'DoS Hulk': 'dos',
                'PortScan': 'probe',
                'DDoS': 'ddos',
                'DoS GoldenEye': 'dos',
                'FTP-Patator': 'brute_force',
                'SSH-Patator': 'brute_force',
                'DoS slowloris': 'dos',
                'DoS Slowhttptest': 'dos',
                'Bot': 'botnet',
                'Brute Force': 'web_attack',
                'XSS': 'web_attack'}

# Create the grouped label column. Mapping with the dict itself is vectorised
# (no Python call per row); labels missing from the dict come back as NaN, so
# check for them explicitly and fail with the full list of offending labels.
label_category = dataset['Label'].map(attack_group)
unmapped = dataset.loc[label_category.isna(), 'Label'].unique()
if len(unmapped) > 0:
    raise KeyError(f"Labels missing from attack_group: {list(unmapped)}")

dataset['Label_Category'] = label_category
dataset['Label_Category'].value_counts()



## Split Data

Split the data 60:20:20 into training, test and validation sets. The split is stratified on `Label`, so each class (and therefore the overall attack rate) keeps the same proportion across all three sets.

In [13]:
# The three labelling options kept alongside the features
attacks = ['Label', 'Label_Category', 'Attack']

# xs = feature vectors, ys = labels
ys = dataset[attacks]
xs = dataset.drop(columns=attacks)

# Stratified split: hold out 40% first, then cut the hold-out in half
x_train, x_temp, y_train, y_temp = train_test_split(
    xs, ys,
    test_size=0.4,
    random_state=0,
    stratify=ys['Label'],
)
x_test, x_validate, y_test, y_validate = train_test_split(
    x_temp, y_temp,
    test_size=0.5,
    random_state=0,
    stratify=y_temp['Label'],
)


In [14]:
column_names = np.array(list(x_train))

# Find columns that take only one value in the training split.
# Count distinct values directly: comparing group sizes instead would also
# flag any column whose values all occur equally often (for example a column
# in which every value is unique), and would drop it by mistake.
distinct_counts = x_train.nunique()
to_drop = [name for name in column_names if distinct_counts[name] == 1]
to_drop



Drop these columns: each one contains only a single value, so it carries no information for the classifier.

In [15]:
# Remove the single-valued columns from each of the three splits
x_train, x_validate, x_test = (
    split.drop(columns=to_drop) for split in (x_train, x_validate, x_test)
)
# Same removal on the full dataset; this copy still keeps its label columns
dataset_copy = dataset.drop(columns=to_drop)

## Apply Normalization

Using minmax normalization

In [16]:
# Learn the per-feature min/max from the training split only
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(x_train)

# Rescale all three splits with the training-set parameters
x_train, x_validate, x_test = (
    min_max_scaler.transform(split) for split in (x_train, x_validate, x_test)
)

# Training values should now all lie between 0 and 1
pd.Series(x_train.ravel()).describe()



## Feature Selection

Use chi-squared (`chi2`) scoring with `SelectKBest`. First, score all the features.

In [17]:
# Set k to the full column count so that no feature is filtered out yet
n_features = x_train.shape[1]
features = SelectKBest(chi2, k=n_features)

# Fit the scorer on the training data against the fine-grained labels
fit = features.fit(x_train, y_train['Label'])

In [20]:
# Names of the scored features, in the same order as the x_train columns.
# dataset_copy also holds the label columns listed in `attacks`, which were
# never scored; remove them, otherwise 'feature' and 'score' have different
# lengths and the DataFrame constructor raises a ValueError.
feature_names = dataset_copy.columns.drop(attacks, errors='ignore')
assert len(feature_names) == len(features.scores_), \
    'feature names do not line up with the chi2 scores'

# plot the score associated with each feature
features_df = pd.DataFrame({
    'feature': feature_names,
    'score': features.scores_
})
plt.figure(figsize=(12, 6))
plt.bar(range(len(features_df)), features_df['score'])
plt.xticks(range(len(features_df)), features_df['feature'], rotation=90, fontsize=5)
plt.xlabel('Feature Variable', fontsize=8)
plt.ylabel('chi2 score', fontsize=8)
plt.title('A Chart to Show Feature Scores', fontsize=8)
plt.tight_layout()
plt.savefig('features.png', dpi=300)
plt.close()



In [None]:
# Order the features from highest to lowest score
features_df = features_df.sort_values('score', ascending=False)
sorted_features = features_df['feature'].values
sorted_importances = features_df['score'].values

# Running total of the scores, and the level at which 99% is retained
cumulative_importances = np.cumsum(sorted_importances)
value99 = cumulative_importances[-1]*0.99
x_values = range(len(sorted_importances))

# Cumulative score curve, drawn with the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(x_values, cumulative_importances)

# Dashed red line marking 99% of the total score
ax.hlines(y=value99, xmin=0, xmax=len(sorted_importances), color='r', linestyles='dashed')

# One tick per feature on the x axis; the y axis carries no ticks
ax.set_xticks(x_values)
ax.set_xticklabels(sorted_features, rotation='vertical', fontsize=5)
ax.set_yticks([])
ax.set_yticklabels([])
ax.set_xlabel('Feature Variable', fontsize=8)
ax.set_title('A Chart to Show Cumulative Feature Scores', fontsize=8)

fig.tight_layout()
fig.savefig('cum_features.png', dpi=300)
plt.close(fig)