# Network Intrusion Detection by Machine Learning

In a previous notebook I explored using an LSTM to identify anomalies in signals. In this notebook, I use machine learning to classify network activity. Given the log file of the data and bandwidth usage, we want to predict whether an activity is a normal data access or a hacking attempt. 

Data taken from 
https://www.kaggle.com/sampadab17/network-intrusion-detection

Written by Shing Chi Leung at 5 February 2021

In [58]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Colab-only step: mount Google Drive at the relative path "gdrive" so that
# files stored under MyDrive can be opened by path in later cells.
from google.colab import drive
drive.mount("gdrive")

Mounted at gdrive


In [75]:
# Location of the training CSV inside the mounted Google Drive folder.
train_file_path = "gdrive/MyDrive/Colab Notebooks/network_train_data.csv"
df = pd.read_csv(train_file_path)

# Report the table dimensions as (rows, columns).
size_message = "Size of training set is {}\n".format(df.shape)
print(size_message)

# Preview the top of the raw table; the last expression is rendered by the notebook.
print("First few rows of raw data")
df.head(n=5)

Size of training set is (25192, 42)

First few rows of raw data


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,30,32,0.0,0.0,0.0,0.0,1.0,0.0,0.09,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


Before building a model, I check what kind of data each column holds: the distinct values of the textual columns and the basic statistics (max, min, mean, std) of the numerical ones. 

In [67]:
print("Summary of train textual data:")

# Each pair is (message template, column name); the set of distinct values
# found in that column is substituted into the template.
text_summaries = [
    ("Protocol Type inclues {}\n", "protocol_type"),
    ("Service inclues {}\n", "service"),
    ("Flag inclues {}\n", "flag"),
]
for template, column_name in text_summaries:
    print(template.format(set(df[column_name])))
print(" ")

print("Statistics of numerical data:")

# Columns treated as numerical features by the later scaling cell.
# NOTE(review): src_bytes and dst_bytes are present in the CSV header but are
# not listed here — confirm that leaving them out is intentional.
numerical_columns = [
    # basic connection attributes
    "duration", "land", "wrong_fragment", "urgent",
    # content-related counters
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    # traffic counts and rates
    "count", "srv_count",
    "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    # destination-host counts and rates
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
]

# One row per statistic, one column per numerical feature.
summary_stats = ["max", "min", "mean", "std"]
df[numerical_columns].agg(summary_stats)



Summary of train textual data:
Protocol Type inclues {'udp', 'icmp', 'tcp'}

Service inclues {'uucp', 'exec', 'whois', 'other', 'time', 'private', 'iso_tsap', 'sunrpc', 'efs', 'hostnames', 'telnet', 'http_443', 'IRC', 'nntp', 'pop_3', 'finger', 'domain', 'systat', 'shell', 'sql_net', 'netbios_ns', 'discard', 'supdup', 'uucp_path', 'netbios_dgm', 'auth', 'eco_i', 'ntp_u', 'daytime', 'pop_2', 'ldap', 'rje', 'nnsp', 'gopher', 'http_8001', 'csnet_ns', 'ecr_i', 'kshell', 'smtp', 'red_i', 'Z39_50', 'domain_u', 'echo', 'ctf', 'netstat', 'link', 'netbios_ssn', 'remote_job', 'printer', 'bgp', 'vmnet', 'tim_i', 'X11', 'urh_i', 'name', 'http', 'courier', 'login', 'imap4', 'ftp_data', 'pm_dump', 'ftp', 'klogin', 'mtp', 'urp_i', 'ssh'}

Flag inclues {'OTH', 'S1', 'SF', 'REJ', 'SH', 'RSTOS0', 'S0', 'RSTR', 'RSTO', 'S2', 'S3'}

 
Statistics of numerical data:


Unnamed: 0,duration,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
max,42862.0,1.0,3.0,1.0,77.0,4.0,1.0,884.0,1.0,2.0,975.0,40.0,1.0,8.0,0.0,0.0,1.0,511.0,511.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,normal
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,anomaly
mean,305.054104,7.9e-05,0.023738,4e-05,0.198039,0.001191,0.394768,0.22785,0.001548,0.00135,0.249841,0.014727,0.000357,0.004327,0.0,0.0,0.00913,84.59118,27.698754,0.286338,0.283762,0.11863,0.12026,0.660559,0.062363,0.095931,182.532074,115.063036,0.519791,0.082539,0.147453,0.031844,0.2858,0.279846,0.1178,0.118769,
std,2686.55564,0.00891,0.260221,0.0063,2.154202,0.045418,0.488811,10.417352,0.039316,0.048785,11.500842,0.529602,0.018898,0.098524,0.0,0.0,0.095115,114.673451,72.468242,0.447312,0.447599,0.318745,0.322335,0.439637,0.17855,0.256583,98.993895,110.64685,0.448944,0.187191,0.308367,0.110575,0.445316,0.446075,0.305869,0.317333,


In [74]:
# Textual columns that are converted to numeric codes, one code per distinct
# value within each column.
# NOTE(review): the "service" column is also textual but is not encoded here —
# confirm that leaving it out of the features is intentional.
text_columns = ["protocol_type", "flag"]

# Fit the encoder on the selected columns and encode them in a single step.
oe = OrdinalEncoder()
np_text_enc = oe.fit_transform(df[text_columns])

shape_report = "Shape of textual data: {}\n".format(np_text_enc.shape)
preview_report = "First few rows of textual data: \n{}\n".format(np_text_enc[:5])
print(shape_report)
print(preview_report)

Shape of textual data: (25192, 2)

First few rows of textual data: 
[[1. 9.]
 [2. 9.]
 [1. 5.]
 [1. 9.]
 [1. 9.]]



In [23]:
# Rescale each numerical column with a min-max scaler, fitting and
# transforming in a single step.
# NOTE(review): the scaler is fitted on the whole table before the train/test
# split made later in the notebook, so test rows influence the scaling —
# confirm this is acceptable.
mms = MinMaxScaler()
np_num_scaled = mms.fit_transform(df[numerical_columns])

preview_report = "First few rows of numerical data: \n{}\n".format(np_num_scaled[:3])
shape_report = "Shape of numerical data: {}".format(np_num_scaled.shape)
print(preview_report)
print(shape_report)

First few rows of numerical data: 
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.00196078
  0.00196078 0.         0.         0.         0.         1.
  0.         0.         0.58823529 0.09803922 0.17       0.03
  0.17       0.         0.         0.         0.05       0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.02352941
  0.         0.         0.         0.         0.         0.08
  0.15       0.         1.         0.00392157 0.         0.6
  0.88       0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.23921569
  0.00980392 1.   

In [71]:
# Concatenate the encoded text columns and the scaled numerical columns
# side by side (along the column axis) to form the complete feature matrix.
feature_parts = (np_text_enc, np_num_scaled)
sources = np.concatenate(feature_parts, axis=1)

print("Source data shape: {}".format(sources.shape))

Source data shape: (25192, 38)


In [72]:
# The encoder expects 2D input, so turn the 1D "class" column into a single
# column matrix with one row per sample.
temp_array = df["class"].to_numpy().reshape(-1, 1)

# Fit on the class labels and encode them to numeric codes in one step.
oe2 = OrdinalEncoder()
targets = oe2.fit_transform(temp_array)

shape_report = "Target data shape: {}\n".format(targets.shape)
rows_report = "First few rows of class: \n{}\n".format(targets)
print(shape_report)
print(rows_report)

Target data shape: (25192, 1)

First few rows of class: 
[[1.]
 [1.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]



In [59]:
# Hold out 10% of the rows for the final evaluation. The seed is pinned so the
# same rows land in the train and test sets on every run; without it the split
# (and therefore every reported score) changes each time the cell is executed.
x_train, x_test, y_train, y_test = train_test_split(
    sources, targets, test_size=0.1, random_state=42
)

In [62]:
def build_model(input_shape):
  """Assemble the uncompiled feed-forward binary classifier.

  Two 80-unit ReLU Dense layers are followed by a single sigmoid output unit.

  Args:
    input_shape: shape of one input sample, handed to the first Dense layer.

  Returns:
    The Sequential model holding the three Dense layers.
  """
  layer_stack = [
      Dense(80, activation="relu", input_shape=input_shape),
      Dense(80, activation="relu"),
      Dense(1, activation="sigmoid"),
  ]
  return Sequential(layer_stack)

# The shape of a single sample is x_train's shape without its leading (row) axis.
sample_shape = x_train.shape[1:]
model = build_model(sample_shape)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 80)                3120      
_________________________________________________________________
dense_10 (Dense)             (None, 80)                6480      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 81        
Total params: 9,681
Trainable params: 9,681
Non-trainable params: 0
_________________________________________________________________


In [63]:
def compile_model(model):
  """Compile `model` in place for binary classification.

  Uses the "adam" optimizer, "binary_crossentropy" loss and tracks "accuracy".

  Args:
    model: object exposing a Keras-style `compile` method.

  Returns:
    None; the model is configured through its own `compile` call.
  """
  compile_options = {
      "optimizer": "adam",
      "loss": "binary_crossentropy",
      "metrics": ["accuracy"],
  }
  model.compile(**compile_options)

# Configure the freshly built network before training.
compile_model(model=model)

In [64]:
def train_model(model, x_train, y_train, epochs):
  """Fit `model` on the training data and return the fit result.

  A tenth of the supplied training data is passed to `fit` as
  `validation_split`, so validation figures are produced during training.

  Args:
    model: compiled model exposing a Keras-style `fit` method.
    x_train: training features.
    y_train: training targets.
    epochs: number of passes over the training data.

  Returns:
    Whatever `model.fit` returns. The original version dropped this value,
    so callers that stored the result always received None.
  """
  history = model.fit(x_train, y_train, epochs=epochs, validation_split=0.1)
  return history

# Train for 10 epochs and keep whatever train_model hands back.
history = train_model(model, x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [65]:
# Score the trained model on the held-out test split, 16 samples per batch,
# and keep the evaluation result.
results = model.evaluate(x=x_test, y=y_test, batch_size=16)





In [70]:
# Run the model over the held-out split, 16 samples per batch.
y_pred = model.predict(x_test, batch_size=16)

# Show the first five model outputs next to their true labels.
print("Output first few results:")
line_template = "Predictions: {} --- Actual {}"
for row in range(5):
  predicted, actual = y_pred[row], y_test[row]
  print(line_template.format(predicted, actual))

Output first few results:
Predictions: [0.98243797] --- Actual [1.]
Predictions: [0.9985635] --- Actual [1.]
Predictions: [0.9975023] --- Actual [1.]
Predictions: [0.9979465] --- Actual [1.]
Predictions: [0.99838567] --- Actual [1.]
