# Network Intrusion Detector
### CSc 180, Sect. 01 (Spring 2021)
### Lucas Saechao, Noah Venethongkham, Ashley Thor
### 03/12/2021

# Abstract
Software that protects computer networks from unauthorized intrusion or access provide real, universal value to individuals and enterprises across the world. This type of software is known as an Intrusion Detection System (IDS), and is intended to monitor a network for malicious activity. The advent and explosion of machine learning models, and software in the last decade provide opportunities to develop new techniques in order to determine good or bad actors in a system. 

This project aims to build a Network IDS (NIDS) using machine learning, by providing an accurate, predictive model that can distinguish bad connections, intrusions, or attacks, from genuine, well-intentioned network connections.

This model is trained using the KDD Cup 1999 Dataset.

# Import Libraries

In [2]:
# matplotlib
%matplotlib inline
from matplotlib.pyplot import figure, show
import matplotlib.pyplot as plt

# numpy and pandas
import numpy as np
import pandas as pd

# scikit learn
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import column_or_1d
import sklearn.feature_extraction.text as sk_text

# tensorflow and keras
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Activation, Flatten, Dropout, Conv2D, MaxPooling2D
import tensorflow as tf
import tensorflow.keras

# python libraries
import requests
import shutil
import json
import time
import csv
import io
import os

# if OS is windows, import chime
if os.name == 'nt':
    import winsound

read = './data/'
write = './weights/'

path = os.path.join(read, 'nid.csv')
weights = os.path.join(write, "weights.hdf5")

# Helper Functions

In [3]:
# Plots a confusion matrix for the model
def plot_confusion_matrix(conf_matrix, title='Confusion Matrix', cmap=plt.cm.Blues):
    plt.imshow(conf_matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arrange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Plot an ROC curve
def plot_roc(pred, y):
    fpr, tpr, thresholds = roc_curve(y, pred)
    roc_area_under_curve = auc(fpr, tpr)
    
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = $0.2f)' % roc_area_under_curve)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show
    
# Encodes text values into numerical variables
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for i in dummies.columns:
        dummy_name = "{}-{}".format(name, i)
        df[dummy_name] = dummies[i]
    df.drop(name, axis=1, inplace=True)
    
# Normalizes numerical values into a z-score
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()
    df[name] = (df[name] - mean) / sd
    
# For formatting time
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)
    
# Beep if on a windows machine
if os.name == 'nt':
    def ding():
        winsound.Beep(2000, 300)
        winsound.Beep(2000, 300)
        winsound.Beep(2000, 300)

# Preprocessing Data Functions
Because this is to be treated as a binary classification problem, we will only care about normal outcomes compared to attacks - the data currently has several discrete attack types, so the following label encoder will provide it in normal and not-normal terms

In [4]:
def make_binary(input_str):
    out = 0 if input_str == 'normal.' else 1
    return out

# Import Data

In [9]:
df_csv = pd.read_csv(path, encoding='utf-8')
df_csv.columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome'
]

In [10]:
# print dataframe tuples
df_csv

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,tcp,http,SF,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494016,0,tcp,http,SF,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494017,0,tcp,http,SF,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494018,0,tcp,http,SF,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


# Data Preprocessing
Before it can be used, the data first must be examined and cleaned. This means that redundant records and records with missing values will need to be removed

In [11]:
# remove redundant records
df_csv.drop_duplicates(subset=None, inplace=True)

# remove values with missing values
df_csv.dropna()

# determine and remove any useless columns that may incorrectly influence the output

# encode symbolic variables
symbolic = [
    'protocol_type',
    'service',
    'flag',
    'land',
    'logged_in',
    'is_host_login',
    'is_guest_login',
]

continuous = [
    'duration',
    'src_bytes',
    'dst_bytes'
]
for i in symbolic:
    encode_text_dummy(df_csv, i)
for j in continuous:
    encode_numeric_zscore(df_csv, j)
    
y = df_csv['outcome'].map(make_binary)
# 0 for normal
# 1 for attack
y.value_counts()

0    87831
1    57754
Name: outcome, dtype: int64

In [12]:
# verify data frame
df_csv

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag-S3,flag-SF,flag-SH,land-0,land-1,logged_in-0,logged_in-1,is_host_login-0,is_guest_login-0,is_guest_login-1
0,-0.10785,-0.004261,-0.039036,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
1,-0.10785,-0.004263,-0.025041,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
2,-0.10785,-0.004272,-0.025041,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
3,-0.10785,-0.004273,-0.013612,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
4,-0.10785,-0.004273,-0.013612,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,-0.10785,-0.004222,-0.016095,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
494016,-0.10785,-0.004237,-0.009435,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
494017,-0.10785,-0.004281,-0.027294,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0
494018,-0.10785,-0.004232,-0.027294,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,1,1,0


In [13]:
# drop outcomes
df_csv = df_csv.drop('outcome', axis=1)

In [14]:
# convert to numpy array
df = df_csv.to_numpy()
df_x = df.reshape(len(df_csv), 1, len(df_csv.columns), 1)

In [15]:
df_x

array([[[[-0.10785025],
         [-0.00426104],
         [-0.03903572],
         ...,
         [ 1.        ],
         [ 1.        ],
         [ 0.        ]]],


       [[[-0.10785025],
         [-0.00426324],
         [-0.02504131],
         ...,
         [ 1.        ],
         [ 1.        ],
         [ 0.        ]]],


       [[[-0.10785025],
         [-0.00427203],
         [-0.02504131],
         ...,
         [ 1.        ],
         [ 1.        ],
         [ 0.        ]]],


       ...,


       [[[-0.10785025],
         [-0.00428082],
         [-0.02729423],
         ...,
         [ 1.        ],
         [ 1.        ],
         [ 0.        ]]],


       [[[-0.10785025],
         [-0.00423248],
         [-0.02729423],
         ...,
         [ 1.        ],
         [ 1.        ],
         [ 0.        ]]],


       [[[-0.10785025],
         [-0.00427203],
         [-0.02673511],
         ...,
         [ 1.        ],
         [ 1.        ],
         [ 0.        ]]]])

# Split into train/test set

In [16]:
x_train, x_test, y_train, y_test = train_test_split(df_x, y, test_size=0.25, random_state=42)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
y_train = tf.keras.utils.to_categorical(y_train, 2)
y_test = tf.keras.utils.to_categorical(y_test, 2)

In [17]:
# print shapes
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(109188, 1, 121, 1)
(109188, 2)
(36397, 1, 121, 1)
(36397, 2)


# Prepare Convolutional Model

In [86]:
from tensorflow.keras.optimizers import Adam
convnet = Sequential()
# add convnet layers
input_shape = (1,121,1)

convnet.add(Conv2D(64, kernel_size=(1, 1), strides=(1, 1), padding='valid',
                 activation='relu',
                 input_shape=input_shape))
convnet.add(Conv2D(64, (1, 1), activation='relu'))
convnet.add(MaxPooling2D(pool_size=(1, 1), strides=None))
convnet.add(Dropout(0.25))

convnet.add(Flatten())

convnet.add(Dense(2, activation='relu'))

convnet.add(Dropout(0.5))

# compile convnet
convnet.compile(loss=tf.keras.losses.categorical_crossentropy, optimizer=Adam(lr=0.001, decay=1e-6), metrics=['accuracy'])

In [87]:
# print convnet summary
convnet.summary()

Model: "sequential_40"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_56 (Conv2D)           (None, 1, 121, 64)        128       
_________________________________________________________________
conv2d_57 (Conv2D)           (None, 1, 121, 64)        4160      
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 1, 121, 64)        0         
_________________________________________________________________
dropout_24 (Dropout)         (None, 1, 121, 64)        0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 7744)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 2)                 15490     
_________________________________________________________________
dropout_25 (Dropout)         (None, 2)               

# Fit and train model

In [49]:
print(len(x_test))

36397


In [90]:
start_time = time.time()
batch = 128

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-6, patience=20, verbose=2, mode='auto')
checkpoint = ModelCheckpoint(filepath=weights, verbose=0, save_best_only=True)

# fit convnet
convnet.fit(x_train, y_train, batch_size=batch, callbacks=[monitor, checkpoint], epochs=1000, verbose=2, validation_data=(x_test, y_test))

# load weights
convnet.load_weights(weights)
elapsed_time = time.time() - start_time
print("Elapsed Time: {}".format(hms_string(elapsed_time)))

Train on 109188 samples, validate on 36397 samples
Epoch 1/1000
109188/109188 - 5s - loss: 1.1921e-07 - acc: 0.6035 - val_loss: 1.1921e-07 - val_acc: 0.6027
Epoch 2/1000
109188/109188 - 5s - loss: 1.1921e-07 - acc: 0.6035 - val_loss: 1.1921e-07 - val_acc: 0.6027
Epoch 3/1000
109188/109188 - 5s - loss: 1.1921e-07 - acc: 0.6035 - val_loss: 1.1921e-07 - val_acc: 0.6027
Epoch 4/1000
109188/109188 - 5s - loss: 1.1921e-07 - acc: 0.6035 - val_loss: 1.1921e-07 - val_acc: 0.6027
Epoch 5/1000
109188/109188 - 5s - loss: 1.1921e-07 - acc: 0.6035 - val_loss: 1.1921e-07 - val_acc: 0.6027
Epoch 6/1000
109188/109188 - 6s - loss: 1.1921e-07 - acc: 0.6035 - val_loss: 1.1921e-07 - val_acc: 0.6027
Epoch 7/1000
109188/109188 - 4s - loss: 1.1921e-07 - acc: 0.6035 - val_loss: 1.1921e-07 - val_acc: 0.6027
Epoch 8/1000
109188/109188 - 4s - loss: 1.1921e-07 - acc: 0.6035 - val_loss: 1.1921e-07 - val_acc: 0.6027
Epoch 9/1000
109188/109188 - 4s - loss: 1.1921e-07 - acc: 0.6035 - val_loss: 1.1921e-07 - val_acc: 0.

In [91]:
# evaluate() computes the loss and accuracy
score = convnet.evaluate(x_test[0:100], y_test[0:100], verbose=0)
score
print('Test loss: {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))

Test loss: 1.1920930376163597e-07
Test accuracy: 0.5799999833106995


In [115]:
# Ding when complete
if os.name == 'nt':
    ding()
else:
    print("Completed")

Completed


# Perform predictions

# Model Evaluation
As a classification model, this model will be evaluated on the following metrics: 
 * Accuracy
 * Precision
 * Recall
 * F1 Score
 * Log Loss
 * Confusion Matrix
 * ROC Curve