# Network Intrusion Detection using Artificial Neural Network(ANN) Discovery

# Library Setup

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras 
import seaborn as sns
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Flatten
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
# Importing the dataset
# The path is relative to the notebook's working directory. `df` is the raw frame
# that the following cells inspect, impute and encode.
df = pd.read_csv('Train_data.csv')


In [3]:
# Preview the first rows to sanity-check that the CSV was parsed as expected
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


# Missing Values

In [4]:
# Share of missing (NaN) entries per column, as a percentage of all rows.
null_counts = df.isnull().sum()
missing_count = null_counts / len(df) * 100
print(missing_count)
# The dataset was cleaned manually in Excel; normally missing values would
# require explicit data modifications.

duration                        0.000000
protocol_type                   0.000000
service                         0.000000
flag                            0.000000
src_bytes                       0.000000
dst_bytes                       0.000000
land                            0.000000
wrong_fragment                  0.000000
urgent                          0.000000
hot                             0.000000
num_failed_logins               2.302302
logged_in                       2.302302
num_compromised                 0.000000
root_shell                      0.000000
su_attempted                    0.000000
num_root                        0.000000
num_file_creations              0.000000
num_shells                      0.000000
num_access_files                0.000000
num_outbound_cmds               0.000000
is_host_login                   0.000000
is_guest_login                  0.000000
count                           0.000000
srv_count                       0.000000
serror_rate     

# Imputation

In [5]:
# Inspect each column's dtype so the imputation strategy can match the data type
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins              float64
logged_in                      float64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [6]:
#Impute missing values
# Fit ONE imputer on all affected columns at once so that each column is filled
# with its own mean. Re-fitting a single imputer column by column keeps only the
# last fit, which would fill every column with the srv_serror_rate mean.
impute_cols = ['num_failed_logins', 'logged_in', 'srv_serror_rate']
imputer = SimpleImputer(strategy='mean')
df[impute_cols] = imputer.fit_transform(df[impute_cols])
#check how many values are missing (NaN) - after we filled in the NaN
missing_count = df.isnull().sum()/len(df)*100 # the percentage of missing values for every column
missing_count

duration                       0.0
protocol_type                  0.0
service                        0.0
flag                           0.0
src_bytes                      0.0
dst_bytes                      0.0
land                           0.0
wrong_fragment                 0.0
urgent                         0.0
hot                            0.0
num_failed_logins              0.0
logged_in                      0.0
num_compromised                0.0
root_shell                     0.0
su_attempted                   0.0
num_root                       0.0
num_file_creations             0.0
num_shells                     0.0
num_access_files               0.0
num_outbound_cmds              0.0
is_host_login                  0.0
is_guest_login                 0.0
count                          0.0
srv_count                      0.0
serror_rate                    0.0
srv_serror_rate                0.0
rerror_rate                    0.0
srv_rerror_rate                0.0
same_srv_rate       

# One-Hot Encoding

In [7]:
# One-hot encode the frame; the result is a sparse matrix with one indicator
# column per distinct value of every input column.
# NOTE(review): this encodes ALL columns of df, including the numeric ones and the
# `class` label — confirm that is intended rather than encoding only the
# categorical columns.
enc = OneHotEncoder()
enc.fit(df)
one_hot_df = enc.transform(df)
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old name only on releases that
# predate it. The newer method labels columns with the DataFrame's own column
# names instead of the generic x0, x1, ... prefixes.
if hasattr(enc, 'get_feature_names_out'):
    columns = enc.get_feature_names_out()
else:
    columns = enc.get_feature_names()
pd.DataFrame(one_hot_df.toarray(), columns=columns)

Unnamed: 0,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_7,x0_8,x0_10,x0_18,...,x40_0.81,x40_0.84,x40_0.88,x40_0.91,x40_0.96,x40_0.97,x40_0.99,x40_1.0,x41_anomaly,x41_normal
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
996,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
997,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Splitting the dataset into the Training set and Test set
# NOTE(review): X and y are not assigned in any cell above this one; in the visible
# notebook they are first assigned much further down. On a fresh kernel this cell
# would fail, and it repeats the later "Train/Test Split, Scaling" cell verbatim.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Feature Scaling
# The scaler is fitted on the training split only and then applied to the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Scaling

In [8]:
# Standardise every one-hot column (zero mean, unit variance) and peek at five rows
from sklearn.preprocessing import StandardScaler
dense_features = one_hot_df.toarray()
scaled1 = StandardScaler().fit_transform(dense_features)
scaled1[:5]

array([[ 0.29903516, -0.12758019, -0.09534626, ..., -0.36245045,
        -0.96943583,  0.96943583],
       [ 0.29903516, -0.12758019, -0.09534626, ..., -0.36245045,
        -0.96943583,  0.96943583],
       [ 0.29903516, -0.12758019, -0.09534626, ..., -0.36245045,
         1.03152779, -1.03152779],
       [ 0.29903516, -0.12758019, -0.09534626, ..., -0.36245045,
        -0.96943583,  0.96943583],
       [ 0.29903516, -0.12758019, -0.09534626, ..., -0.36245045,
        -0.96943583,  0.96943583]])

In [9]:
# Summary statistics of the scaled matrix; show only the first five summary rows
scaled_summary = pd.DataFrame(scaled1).describe()
scaled_summary.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,1.209132e-16,-5.334405e-18,2.845016e-17,3.55627e-18,-2.489389e-17,0.0,-3.55627e-18,1.0668810000000001e-17,0.0,-1.6003210000000003e-17,...,-8.890675e-18,-7.11254e-18,-1.0668810000000001e-17,-1.0668810000000001e-17,-3.55627e-18,-2.489389e-17,-1.9559480000000002e-17,-1.2446940000000002e-17,0.0,-5.5122180000000004e-17
std,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501,...,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501,1.000501
min,-3.344088,-0.1275802,-0.09534626,-0.03165445,-0.06340426,-0.044789,-0.04478859,-0.05488213,-0.031654,-0.03165445,...,-0.03165445,-0.03165445,-0.03165445,-0.03165445,-0.04478859,-0.04478859,-0.03165445,-0.3624504,-0.969436,-1.031528
25%,0.2990352,-0.1275802,-0.09534626,-0.03165445,-0.06340426,-0.044789,-0.04478859,-0.05488213,-0.031654,-0.03165445,...,-0.03165445,-0.03165445,-0.03165445,-0.03165445,-0.04478859,-0.04478859,-0.03165445,-0.3624504,-0.969436,-1.031528


# Dimensionality Reduction

In [16]:
#Decomposition
# NOTE(review): the wildcard import pulls every public name of sklearn.decomposition
# into the notebook namespace, and no visible cell uses any of them yet. Consider
# importing only what is needed (e.g. PCA) once the PCA section is written.
from sklearn.decomposition import *

In [None]:
#PCA

In [None]:
# Correlation heatmap, with rows and columns labelled by column name.
# NOTE(review): df_int_selected is not created in any visible cell — define it, or
# point this at the intended frame, before running.
corr = df_int_selected.corr()
sns.heatmap(corr, 
        xticklabels=corr.columns,
        yticklabels=corr.columns)

In [None]:
plt.figure(figsize = (16,5))

# sns.heatmap needs a purely numeric matrix. The 1:42 positional slice also
# contains object-typed columns (protocol_type, service, flag and class in the
# dtype listing above), so keep only the numeric columns of that slice.
# NOTE(review): annot=True writes a label into every cell, which is slow and hard
# to read when the frame has many rows; consider plotting a sample or aggregate.
numeric_block = df.iloc[:, 1:42].select_dtypes(include='number')
ax = sns.heatmap(numeric_block, annot=True, linewidths=.5)

In [None]:
plt.figure(figsize = (16,5))

# Correlation is only defined for numeric data. Older pandas silently dropped
# non-numeric columns in corr(); pandas >= 2.0 raises instead. Selecting the
# numeric/bool columns explicitly gives the old result on every version.
corr = df.select_dtypes(include=['number', 'bool']).corr()

sns.heatmap(corr,
        xticklabels=corr.columns,
        yticklabels=corr.columns)

In [None]:
# Per-column variance.
# NOTE(review): `dataset` is not assigned in any visible cell (the CSV is loaded as
# `df`) — confirm which frame is intended.
dataset.var()

# Visualization - Data Preprocessing

In [None]:
#Variance
# NOTE(review): 'Item_Weight' and 'Outlet_Size' do not appear among the columns
# printed earlier in this notebook — presumably left over from a different
# dataset; confirm before running.
# Assign the filled Series back to the frame. Calling fillna(inplace=True) on a
# column selection operates on an intermediate object and is not guaranteed to
# write through to df (it does not under pandas Copy-on-Write).
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].median())
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0])

In [None]:
figure, ax = plt.subplots(4,2, figsize=(12,24))

# NOTE(review): `data` and `dataset` are not assigned in any visible cell, and the
# columns used here (charges, age, bmi, children) are not among the columns printed
# for `df` — presumably carried over from another notebook; confirm before running.
sns.distplot(data['charges'],ax= ax[0,0])
sns.distplot(data['age'],ax=ax[0,1])
sns.distplot(data['bmi'],ax= ax[1,0])
sns.distplot(data['children'],ax= ax[1,1])

corr = dataset.corr()

sns.heatmap(corr, cmap = 'Wistia', annot= True)
# plt.show() does not take the seaborn module as an argument; the stray positional
# argument is rejected by backends whose show() accepts keywords only.
plt.show()

In [None]:
figure, ax = plt.subplots(4,2, figsize=(12,24))

# NOTE(review): `data` is not assigned in any visible cell, and the columns used
# here (charges, age, bmi, sex, smoker, region, ...) are not among the columns
# printed for `df` — presumably carried over from another notebook; confirm.

#See the distribution of the data
sns.distplot(data['charges'],ax= ax[0,0])
sns.distplot(data['age'],ax=ax[0,1])
sns.distplot(data['bmi'],ax= ax[1,0])
sns.distplot(data['children'],ax= ax[1,1])


# Pass the Series through x=. The first positional parameter of countplot differs
# between seaborn releases (x in older ones, data in newer ones), so the keyword
# keeps the call unambiguous on all of them.
sns.countplot(x=data['sex'],ax=ax[2,0])
sns.countplot(x=data['smoker'],ax= ax[2,1])
sns.countplot(x=data['region'],ax= ax[3,0])



#visualizing skewness
sns.pairplot(data)

#Lets look at smokers vs non-smokers on age vs charges:

sns.lmplot(x="age", y="charges", hue="smoker", data=data, palette = 'muted', height = 7)
# plt.show() does not take the seaborn module as an argument.
plt.show()

#Lets look at correlation:

# sex / smoker / region are plotted as categories above, so they are presumably
# non-numeric. Restrict corr() to numeric/bool columns so newer pandas does not
# raise; older pandas dropped such columns implicitly.
corr = data.select_dtypes(include=['number', 'bool']).corr()

sns.heatmap(corr, cmap = 'Wistia', annot= True)
plt.show()

In [None]:
#ndarray setup for label encoding
# NOTE(review): `dataset` is not assigned in any visible cell (the CSV is loaded as
# `df`). If it is meant to be `df`, the output shown earlier has `duration` as
# column 0 and `class` as the last column (index 41), so y would not be the label
# and X would contain it. Confirm the column layout before training.
# X: positional columns 1..41 as a NumPy array.
X = dataset.iloc[:, 1:42].values
# y: positional column 0 as a NumPy array.
y = dataset.iloc[:, 0].values

In [None]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#Encode Categorical Variable 1
lcoder = LabelEncoder()

In [None]:
# Display the feature array before encoding.
# NOTE(review): this echoes the whole array; a slice such as X[:5] is usually enough.
X

In [None]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Integer-encode the first three feature columns in place, reusing one encoder.
# Each fit_transform re-fits it, so after the loop it holds the mapping for
# column 2 only.
lcoder = LabelEncoder()
for col_idx in range(3):
    X[:, col_idx] = lcoder.fit_transform(X[:, col_idx])

In [None]:
# NOTE(review): X1 is not assigned in any visible cell; the encoded array above is
# named X — confirm which name is intended.
X1

# Encoding

In [None]:

#Create Dummy Variables
# `categories` on OneHotEncoder lists the category VALUES per feature; it is not a
# column selector, so OneHotEncoder(categories=[0]) does not mean "encode column 0".
# Column selection is done with a ColumnTransformer: column 0 is one-hot encoded
# and every other column is passed through unchanged. The encoded columns are
# placed first in the output, followed by the passthrough columns.
from sklearn.compose import ColumnTransformer
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    transformers=[('onehot', onehotencoder, [0])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense ndarray (replaces .toarray())
)
# ColumnTransformer fits a clone; the fitted encoder is available afterwards as
# column_transformer.named_transformers_['onehot'].
X = column_transformer.fit_transform(X)
#Removing the first dummy variable as 3 were created and we need to reduce to 2
X = X[:, 1:]

# Train/Test Split, Scaling

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.3,
    random_state = 0,
)

# Standardise features: learn mean/variance from the training split only, then
# apply that same transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# ANN Generation

In [None]:
#Initializing the ANN
classifier = Sequential()

# Take the input width from the prepared training matrix instead of hard-coding 42,
# so the network still builds if the encoding step yields a different column count.
n_features = X_train.shape[1]

#Adding the input layer and the first hidden layer with Dropout
classifier.add(Dense(activation = 'relu', units = 22, kernel_initializer = 'uniform', input_dim = n_features))
classifier.add(Dropout(rate = 0.1))

#Adding a second hidden layer
classifier.add(Dense(activation = 'relu', units = 22, kernel_initializer = 'uniform'))
classifier.add(Dropout(rate = 0.1))

#Adding an output layer (binary outcome)
classifier.add(Dense(activation = 'sigmoid', units = 1, kernel_initializer = 'uniform'))

#Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])


#Fitting the ANN to the Training Set
classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)

# Predicting the Test set results
# The sigmoid output is a score in [0, 1]; threshold at 0.5 to get a boolean label.
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5) 

# Confusion Matrix Accuracy
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Model Evaluation - Baseline

In [None]:
#Evaluate the ANN using Kfold - Pre-Eval Accuracy: 99.4%, Post-Eval Accuracy: 99.63
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
def build_classifier():
    """Return a freshly compiled feed-forward binary classifier.

    The network expects 42 input features and has two ReLU hidden layers of
    22 units each followed by a single sigmoid output unit, all using the
    'uniform' kernel initialiser. It is compiled with the Adam optimiser,
    binary cross-entropy loss and accuracy as the reported metric. This
    variant contains no Dropout layers.
    """
    network = Sequential([
        Dense(units = 22, activation = 'relu', kernel_initializer = 'uniform', input_dim = 42),
        Dense(units = 22, activation = 'relu', kernel_initializer = 'uniform'),
        Dense(units = 1, activation = 'sigmoid', kernel_initializer = 'uniform'),
    ])
    network.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return network

# Wrap the builder so scikit-learn can construct and fit a fresh model for each fold
classifier = KerasClassifier(build_fn = build_classifier, batch_size = 10, epochs = 100)
# 10-fold cross-validation on the training split; returns one score per fold
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10, n_jobs = 1)
mean = accuracies.mean()
# NOTE: despite its name, this holds the standard deviation of the fold scores
variace = accuracies.std()

# Model Tuning, Optimization

In [None]:
#Tuning the ANN to discover the best parameters for Keras optimization
 
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
def build_classifier(optimizer):
    """Return a compiled binary classifier that trains with the given optimizer.

    Parameters
    ----------
    optimizer
        Passed straight through to ``compile``; the grid search below supplies
        the optimizer names to try.

    Returns
    -------
    A Sequential model expecting 42 input features, with two ReLU hidden
    layers of 22 units and one sigmoid output unit, compiled with binary
    cross-entropy loss and accuracy as the reported metric.
    """
    hidden_kwargs = dict(units = 22, activation = 'relu', kernel_initializer = 'uniform')
    model = Sequential()
    model.add(Dense(input_dim = 42, **hidden_kwargs))
    model.add(Dense(**hidden_kwargs))
    model.add(Dense(units = 1, activation = 'sigmoid', kernel_initializer = 'uniform'))
    model.compile(loss = 'binary_crossentropy', optimizer = optimizer, metrics = ['accuracy'])
    return model

# Wrap the builder; the search supplies batch_size / epochs / optimizer per candidate
classifier = KerasClassifier(build_fn = build_classifier)
#GridSearch Dictionary
# 2 x 2 x 2 = 8 candidate settings, each scored with 10-fold cross-validation.
parameters = dict(
    batch_size = [25, 32],
    epochs = [100, 500],
    optimizer = ['adam', 'rmsprop'],
)
grid_search = GridSearchCV(classifier, parameters, scoring = 'accuracy', cv = 10)
grid_search = grid_search.fit(X_train, y_train)
# Best-scoring parameter combination and its mean cross-validated accuracy
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

# Analysis Outcome

In [None]:
"""
Analysis Outcome: Overall, the ANN model performed extremely well with predictions coming in over 99% using the initial training
data without feature modification. Although this model resulted in high accuracy...it shows preliminary expectations of overfitting.
Recommended Actions: Significantly reduce feature scope and re-validate which I believe will result in <99% accuracy but a less 
biased model. 

"""