In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

%matplotlib inline

In [2]:
columns = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
        "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", 
        "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login",
        "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
        "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", 
        "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
        "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"]

df = pd.read_csv("data/kddcup.data.corrected", sep=",", names=columns, index_col=None)


In [3]:
# Filter to only 'http' attacks
df = df[df["service"] == "http"]
df = df.drop("service", axis=1)


In [4]:
df['label'] = df['label'].apply(lambda x: 0 if x=='normal.' else 1)
df['label'].value_counts()

label
0    619046
1      4045
Name: count, dtype: int64

In [5]:
datatypes = dict(zip(df.dtypes.index, df.dtypes))

encoder_map = {}
for col, datatype in datatypes.items():
    if datatype == 'object':
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        encoder_map[col] = encoder 


In [6]:
# Check the variables with highest correlation with 'label'
df2 = df.copy()
label_corr = df2.corr()['label']

In [24]:
# Filter out anything that has null entry or is not weakly correlated
train_cols = label_corr[(~label_corr.isna()) & (np.abs(label_corr) > 0.2)]
train_cols = list(train_cols[:-1].index)
train_cols

['src_bytes',
 'hot',
 'num_compromised',
 'count',
 'serror_rate',
 'srv_serror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate']

In [7]:
labels = df2['label']
# Conduct a train-test split    
x_train, x_test, y_train, y_test = train_test_split(df2.drop('label',axis=1).values, labels.values, test_size = 0.15, random_state = 42)


In [8]:
# Additional split of training dataset to create validation split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [9]:
from tensorflow.keras.utils import to_categorical

In [10]:
y_train =  to_categorical(y_train)
y_test =  to_categorical(y_test)
y_val =  to_categorical(y_val)

In [11]:
print("Shapes")
print(f"x_train:{x_train.shape}\ny_train:{y_train.shape}")
print(f"\nx_val:{x_val.shape}\ny_val:{y_val.shape}")
print(f"\nx_test:{x_test.shape}\ny_test:{y_test.shape}")

Shapes
x_train:(423701, 40)
y_train:(423701, 2)

x_val:(105926, 40)
y_val:(105926, 2)

x_test:(93464, 40)
y_test:(93464, 2)


In [30]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

In [35]:
input_layer = Input(shape=(13))
h1 = Dense(80, activation='relu')(input_layer)
h2 = Dense(80, activation='relu')(h1)
h3 = Dense(40, activation='relu')(h2)
h4 = Dense(20, activation='relu')(h3)
h5 = Dense(10, activation='relu')(h4)
out = Dense(2, activation='softmax')(h5)

model = Model(input_layer, out)

In [36]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics='accuracy')

In [37]:
epochs = 8
batch_size = 128

history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, batch_size=batch_size)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [38]:
model.evaluate(x_test, y_test)



[0.00018023540906142443, 0.9999786019325256]

In [39]:
preds = model.predict(x_test)

In [40]:
preds

array([[1.0000000e+00, 1.0556079e-12],
       [1.0000000e+00, 2.8465298e-15],
       [1.0000000e+00, 5.0529242e-10],
       ...,
       [1.0000000e+00, 4.9877577e-16],
       [1.0000000e+00, 5.4690689e-22],
       [1.0000000e+00, 2.3340392e-20]], dtype=float32)

In [41]:
y_true = y_test.argmax(axis=1)
y_true

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [42]:
y_preds = preds.argmax(axis=1)

In [43]:
precision_score(y_true, y_preds)

0.9964973730297724

In [44]:
recall_score(y_true, y_preds)

1.0

In [45]:
f1_score(y_true, y_preds)

0.9982456140350877

In [46]:
from sklearn.metrics import ConfusionMatrixDisplay

In [48]:
cm = confusion_matrix(y_true, y_preds)

In [54]:
ConfusionMatrixDisplay(cm)


<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1803e348910>

In [56]:
cm

array([[92893,     2],
       [    0,   569]], dtype=int64)