# Network intrusion detection and classification

### 1. Import dependencies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from google.colab import drive

Using TensorFlow backend.


In [3]:
drive.mount('/content/drive/') 
path = '/content/drive/My Drive/Colab Notebooks/datasets/network_data_train.txt'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/


### 2. Data Preprocessing

In [0]:
# data does not have column headers, so define them
headers = ["duration","protocol_type","service","flag","src_bytes",
           "dst_bytes","land","wrong_fragment","urgent","hot",
           "num_failed_logins","logged_in","num_compromised","root_shell",
           "su_attempted","num_root","num_file_creations","num_shells",
           "num_access_files","num_outbound_cmds","is_host_login",
           "is_guest_login","count","srv_count","serror_rate", 
           "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", 
           "diff_srv_rate", "srv_diff_host_rate","dst_host_count",
           "dst_host_srv_count","dst_host_same_srv_rate",
           "dst_host_diff_srv_rate","dst_host_same_src_port_rate",
           "dst_host_srv_diff_host_rate","dst_host_serror_rate",
           "dst_host_srv_serror_rate","dst_host_rerror_rate",
           "dst_host_srv_rerror_rate","attack", "last_flag"]

# read csv
data = pd.read_csv(path, names=headers)

# choose relevent features
features = ['duration','src_bytes','dst_bytes','num_file_creations', 
           'num_shells','num_failed_logins','wrong_fragment', 
           'urgent', 'is_guest_login', 'su_attempted', 'land', 'attack']
df = data[features]

In [5]:
pd.set_option('display.max_columns', None)
print(df.head())

   duration  src_bytes  dst_bytes  num_file_creations  num_shells  \
0         0        491          0                   0           0   
1         0        146          0                   0           0   
2         0          0          0                   0           0   
3         0        232       8153                   0           0   
4         0        199        420                   0           0   

   num_failed_logins  wrong_fragment  urgent  is_guest_login  su_attempted  \
0                  0               0       0               0             0   
1                  0               0       0               0             0   
2                  0               0       0               0             0   
3                  0               0       0               0             0   
4                  0               0       0               0             0   

   land   attack  
0     0   normal  
1     0   normal  
2     0  neptune  
3     0   normal  
4     0   normal  


In [0]:
class_labels = ['normal', 'dos', 'probing', 'u2r', 'r2l']

# data has too many classes
# to simplify problem, categorize them generic network attack types : Probing, U2R, R2L, and DOS

dos = ['neptune', 'apache2', 'processtable', 'smurf', 'back', 'snmpguess', 'mailbomb', 'snmpgetattack',
              'pod', 'multihop', 'teardrop', 'sqlattack', 'land']
probing = ['mscan', 'satan', 'saint', 'portsweep', 'ipsweep', 'nmap', 'spy']
u2r = ['buffer_overflow', 'ps', 'xterm', 'perl', 'loadmodule', 'imap']
r2l = ['guess_passwd', 'warezmaster', 'httptunnel', 'named', 'sendmail', 'xlock','xsnoop', 'rootkit', 
       'ftp_write', 'worm', 'phf', 'udpstorm', 'warezclient']

# replace all values in dataframe with corresponding int value

df['attack'].replace('normal', 0, inplace=True)
df['attack'].replace(dos, 1, inplace=True)
df['attack'].replace(probing, 2, inplace=True)
df['attack'].replace(u2r, 3, inplace=True)
df['attack'].replace(r2l, 4, inplace=True)

# shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

In [8]:
df['attack'].value_counts()

0    67343
1    45934
2    11658
4      985
3       53
Name: attack, dtype: int64

In [9]:
# split data into inputs/outputs
features = df[['duration','src_bytes','dst_bytes','num_file_creations', 
           'num_shells','num_failed_logins','wrong_fragment', 
           'urgent', 'is_guest_login', 'su_attempted', 'land']]
labels = df['attack']

X = np.array(features)
y = np.array(labels).reshape(-1, 1)

# make sure both arrays have correct dimensions
print(X.shape)
print(y.shape)

(125973, 11)
(125973, 1)


### 3. Buidling the model

In [0]:
model = Sequential()

model.add(Dense(units=32, activation='relu', input_dim=(11)))

model.add(Dense(units=24, activation='relu'))
model.add(Dropout(.2))

model.add(Dense(units=11, activation='relu'))
model.add(Dropout(.2))

model.add(Dense(units=8, activation='relu'))

model.add(Dense(units=5, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [0]:
model.fit(X, y, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f44326e0898>

In [0]:
test_path = '/content/drive/My Drive/Colab Notebooks/datasets/network_data_test.txt'

test = pd.read_csv(path, names=headers)

features = ['duration','src_bytes','dst_bytes','num_file_creations', 
           'num_shells','num_failed_logins','wrong_fragment', 
           'urgent', 'is_guest_login', 'su_attempted', 'land', 'attack']

test = test[features]

test['attack'].replace('normal', 0, inplace=True)
test['attack'].replace(dos, 1, inplace=True)
test['attack'].replace(probing, 2, inplace=True)
test['attack'].replace(u2r, 3, inplace=True)
test['attack'].replace(r2l, 4, inplace=True)

X_test, y_test = df.drop('attack', axis=1), df['attack']

val_loss, val_acc = model.evaluate(X_test, y_test) 



In [0]:
print(val_acc)
print(val_loss)

0.8859517515672312
0.4039089575777885


#### 88% Accuracy on validation data

In [0]:
# generate a random index to make a prediction on
import random
prediction_index = random.randint(0, len(X_test))

# make prediction
pred_input = np.array(list(X_test.iloc[prediction_index])).reshape(1, 11)
prediction = class_labels[model.predict( pred_input ).argmax()]
actual = class_labels[y_test.iloc[prediction_index]]

# compare prediction vs actual value
print(f'Predicted Value: {prediction}')
print(f'Actual Value: {actual}')

Predicted Value: dos
Actual Value: dos
