In [None]:
# Mount Google Drive at /content/drive so that the Drive paths used by the
# later cells (module directory, dataset file) are reachable.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys

# Directory on the mounted Drive that holds the project's model definitions.
MODELS_DIR = '/content/drive/My Drive/Colab Notebooks/models'

# Guard the append so re-running this cell does not stack duplicate entries.
if MODELS_DIR not in sys.path:
    sys.path.append(MODELS_DIR)


In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tensorflow.keras.utils import plot_model, to_categorical
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from models_ddos import model_conv1D


In [None]:
# Column names for the KDD-CUP-99 records: 41 input features, then the class label

features = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
    'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'label',
]

# Read the header-less data file and attach the column names

data_path = '/content/drive/My Drive/Colab Notebooks/kddcup.data_10_percent_corrected'
dataset = pd.read_csv(data_path, header=None).set_axis(features, axis=1)

# Show which labels occur and how large the dataset is

distinct_labels = dataset['label'].unique()

print(distinct_labels)
print(dataset.shape)


['normal.' 'buffer_overflow.' 'loadmodule.' 'perl.' 'neptune.' 'smurf.'
 'guess_passwd.' 'pod.' 'teardrop.' 'portsweep.' 'ipsweep.' 'land.'
 'ftp_write.' 'back.' 'imap.' 'satan.' 'phf.' 'nmap.' 'multihop.'
 'warezmaster.' 'warezclient.' 'spy.' 'rootkit.']
(494021, 42)


In [None]:
# Mapping of distinct attack types (label string -> class index)

attack_mapping = {
    'back.': 0, 'buffer_overflow.': 1, 'ftp_write.': 2, 'guess_passwd.': 3, 'imap.': 4, 'ipsweep.': 5, 'land.': 6, 'loadmodule.': 7, 'multihop.': 8, 'neptune.': 9, 'nmap.': 10, 'normal.': 11, 'perl.': 12, 'phf.': 13, 'pod.': 14, 'portsweep.': 15, 'rootkit.': 16, 'satan.': 17, 'smurf.': 18, 'spy.': 19, 'teardrop.': 20, 'warezclient.': 21, 'warezmaster.': 22
}

# Inverse lookup (class index -> label string), built once so that get_key
# does not have to scan attack_mapping on every call.
# setdefault keeps the first label if two labels ever shared an index,
# which is what a front-to-back scan of attack_mapping would return.

_attack_names = {}
for _name, _index in attack_mapping.items():
    _attack_names.setdefault(_index, _name)

# Function to get attack type

def get_key(val):
    """Return the label string whose class index is `val`.

    Args:
        val: class index (a Python int or a NumPy integer).

    Returns:
        The matching key of `attack_mapping`, or None when no label has
        that index.
    """
    return _attack_names.get(val)

# Replace attack types with indices
#
# Series.map yields the integer codes directly instead of relying on the implicit
# object -> int downcast of Series.replace, and makes unmapped labels detectable.
# The dtype check keeps the cell safe to re-run once the column is already encoded.

if not pd.api.types.is_integer_dtype(dataset['label']):
    encoded_labels = dataset['label'].map(attack_mapping)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        raise ValueError(
            f"Labels missing from attack_mapping: {sorted(dataset.loc[unmapped, 'label'].astype(str).unique())}"
        )
    dataset['label'] = encoded_labels.astype('int64')

nclass = dataset['label'].nunique()
print(f'The number of distinct labels is: {nclass}')
print(dataset['label'].value_counts())


The number of distinct labels is: 23
label
18    280790
9     107201
11     97278
0       2203
17      1589
5       1247
15      1040
21      1020
20       979
14       264
10       231
3         53
1         30
6         21
22        20
4         12
16        10
7          9
2          8
8          7
13         4
12         3
19         2
Name: count, dtype: int64


In [None]:
# Separate the target column from the inputs and group the inputs by kind

target_column = ['label']
categorical_features = ['protocol_type', 'service', 'flag']

# Every column except the target; Index.difference returns the names sorted
features = dataset.columns.difference(target_column)

# Whatever is left once the categorical columns are removed is treated as numeric
numeric_features = features.difference(categorical_features)


In [None]:
# Define one-hot encoding / standard scaling to categorical / numeric features resp.
#
# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# With the default threshold the result silently turns into a SciPy sparse matrix
# whenever the overall density of the stacked output falls below 0.3, and the
# pd.DataFrame(...) wrapping below expects a dense 2-D array.

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', StandardScaler(), numeric_features)
    ],
    sparse_threshold=0,
)

# Apply the transformations
#
# NOTE(review): the encoder and scaler are fitted on the full dataset, i.e. before
# the train/test split done in a later cell, so the scaling statistics also
# include the rows that end up in the test set.

X = pd.DataFrame(preprocessor.fit_transform(dataset[features]))
y = dataset[target_column]

# Display the shape of the transformed data

print(X.shape)
print(y.shape)

(494021, 118)
(494021, 1)


In [None]:
# Hold out 30% of the rows for testing (fixed seed so the split is repeatable)

X_train, X_test, y_train_ini, y_test_ini = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

# Turn the integer class indices into one-hot rows with nclass columns

y_train, y_test = (
    to_categorical(targets, nclass) for targets in (y_train_ini, y_test_ini)
)

# Report the shape of each training / test array

for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)


(345814, 118)
(148207, 118)
(345814, 23)
(148207, 23)


In [None]:
# Training configuration

lr = 1e-4                    # passed to model_conv1D as `lr` (presumably the learning rate)
N = 64                       # passed to model_conv1D as `N`; its meaning is defined in models_ddos
inshape = X_train.shape[1]   # number of columns of the preprocessed input

# Minimum predicted probability for a non-normal prediction to be counted as blocked

thresh_proba = 0.995

# Number of passes over the training set and number of rows given to each fit call

number_of_epochs = 1
batch_size = 10171

# Build the model from the settings above

model = model_conv1D(
    lr=lr,
    N=N,
    inshape=inshape,
    nclass=nclass,
)


In [None]:
# Process continuous training through batches ( number of epochs can vary )
#
# The training set is fed to the model in consecutive chunks of `batch_size` rows.
# After each chunk the whole test set is scored, and every prediction that is not
# 'normal.' and whose top probability exceeds `thresh_proba` counts as a blocked attack.

# Class index of benign traffic; predictions of this class are never counted as blocked
normal_class = attack_mapping['normal.']

for epoch in range(1, number_of_epochs + 1):

    print(f"Epoch {epoch} / {number_of_epochs}")

    for i in range(0, len(X_train), batch_size):

        # 1-based chunk counter (integer division avoids a float round trip)
        iteration = i // batch_size + 1

        X_batch = X_train[i : i + batch_size]
        y_batch = y_train[i : i + batch_size]

        model.fit(X_batch, y_batch)

        pred = model.predict(X_test)
        y_pred = pred.argmax(axis=-1)      # most probable class of each test row
        max_proba = pred.max(axis=-1)      # probability assigned to that class

        # Count the confident non-normal predictions per class in one vectorised pass
        # instead of looping over every test row in Python.
        blocked = y_pred[(y_pred != normal_class) & (max_proba > thresh_proba)]
        classes, first_seen, counts = np.unique(blocked, return_index=True, return_counts=True)

        # Sort by count in descending order; ties keep the order of first appearance
        # in the test set (lexsort uses its last key as the primary one).
        order = np.lexsort((first_seen, -counts))
        sorted_attack_types = [(get_key(classes[j]), int(counts[j])) for j in order]

        print(f"Epoch {epoch}, Iteration {iteration}, Sorted attack types blocked : {sorted_attack_types}")

    # Epoch results
    # y_pred holds the test-set predictions made after the last chunk of this epoch.

    accuracy = accuracy_score(y_test_ini.astype('int32'), y_pred)
    print(f"Summary results for epoch {epoch}:")
    print(f'Accuracy = {accuracy}')

Epoch 1 / 1
[1m318/318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 30ms/step - acc: 0.9389 - loss: 1.0520
[1m4632/4632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 7ms/step
Epoch 1, Iteration 1, Sorted attack types blocked : [('smurf.', 80538), ('neptune.', 25415), ('guess_passwd.', 1)]
[1m318/318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - acc: 0.9897 - loss: 0.0446
[1m4632/4632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 7ms/step
Epoch 1, Iteration 2, Sorted attack types blocked : [('smurf.', 84253), ('neptune.', 25728), ('buffer_overflow.', 12), ('guess_passwd.', 11), ('back.', 7)]
[1m318/318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - acc: 0.9942 - loss: 0.0226
[1m4632/4632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 7ms/step
Epoch 1, Iteration 3, Sorted attack types blocked : [('smurf.', 84266), ('neptune.', 31324), ('ipsweep.', 79)]
[1m318/318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 