# Anomaly detection of Network Packets - Unsupervised Approach

This is a task from http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html about network traffic packets. There are several kinds of preclassified packets: normal, trojans, viruses, etc.

The goal of this work to implement a detector of no normal = anomaly packets.

## Preparing data

Download & unpack data to ./data/kdd folder

In [69]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

SEED = 87

In [70]:
!wget http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz

--2022-03-28 08:39:44--  http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2144903 (2.0M) [application/x-gzip]
Saving to: ‘kddcup.data_10_percent.gz’

          kddcup.da   0%[                    ]       0  --.-KB/s               ^C


In [71]:
!mkdir ./data/kdd && mv ./kddcup.data_10_percent.gz ./data/kdd/kddcup.data_10_percent.gz

mkdir: ./data/kdd: File exists


In [72]:
!gzip -d ./data/kdd/kddcup.data_10_percent.gz
# as a result - a new (csv) file appears in the directory

gzip: can't stat: ./data/kdd/kddcup.data_10_percent.gz (./data/kdd/kddcup.data_10_percent.gz.gz): No such file or directory


In [73]:
# verify file exists..
!ls ./data/kdd

kddcup.data_10_percent     kddcup.data_10_percent.pkl


In [74]:
# specify column names - will be using that during import

col_names = ["duration","protocol_type","service","flag","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins","logged_in",
             "num_compromised","root_shell","su_attempted","num_root","num_file_creations","num_shells","num_access_files","num_outbound_cmds",
             "is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
             "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
             "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate",
             "dst_host_srv_rerror_rate","label"]

In [75]:
file_path = './data/kdd/kddcup.data_10_percent'
df = pd.read_csv(file_path, header=None, names=col_names, index_col=False)
df.head

<bound method NDFrame.head of         duration protocol_type service flag  src_bytes  dst_bytes  land  \
0              0           tcp    http   SF        181       5450     0   
1              0           tcp    http   SF        239        486     0   
2              0           tcp    http   SF        235       1337     0   
3              0           tcp    http   SF        219       1337     0   
4              0           tcp    http   SF        217       2032     0   
...          ...           ...     ...  ...        ...        ...   ...   
494016         0           tcp    http   SF        310       1881     0   
494017         0           tcp    http   SF        282       2286     0   
494018         0           tcp    http   SF        203       1200     0   
494019         0           tcp    http   SF        291       1200     0   
494020         0           tcp    http   SF        219       1234     0   

        wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0    

In [76]:
print(df['label'].value_counts())

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: label, dtype: int64


In [77]:
le = preprocessing.LabelEncoder()
le.fit(df.label)

print(le.classes_)

['back.' 'buffer_overflow.' 'ftp_write.' 'guess_passwd.' 'imap.'
 'ipsweep.' 'land.' 'loadmodule.' 'multihop.' 'neptune.' 'nmap.' 'normal.'
 'perl.' 'phf.' 'pod.' 'portsweep.' 'rootkit.' 'satan.' 'smurf.' 'spy.'
 'teardrop.' 'warezclient.' 'warezmaster.']


There are nominal & categorcial values in the dataset - do OneHot encoding for them

In [78]:
col_categories = ['protocol_type',
                  'service',
                  'flag',
                  'land',
                  'logged_in',
                  'is_host_login',
                  'is_guest_login']

In [79]:
# just checking how data looks like
print(df['service'].value_counts())

ecr_i      281400
private    110893
http        64293
smtp         9723
other        7237
            ...  
X11            11
tim_i           7
pm_dump         1
tftp_u          1
red_i           1
Name: service, Length: 66, dtype: int64


In [80]:
# sample: protocol ("http", "ftp", "smtp")

#abc.protocol = "http"
#edf.protocol = "smtp"

# sample | is_http | is_ftp | is_smtp
#-----------------------------------------
# abc.      1.         0.        0
# edf.      0.         0.        1


# do one-hot encoding for the symbolic data
cat_data = pd.get_dummies(df[col_categories])
cat_data.head(10)

Unnamed: 0,land,logged_in,is_host_login,is_guest_login,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [81]:
# exclude symbolic columns from the original dataset

col_numerics = list(set(df.columns.values) - set(col_categories))

In [82]:
print(col_numerics)
col_numerics.remove('label')

['dst_host_rerror_rate', 'rerror_rate', 'count', 'srv_diff_host_rate', 'dst_host_count', 'num_file_creations', 'dst_host_srv_rerror_rate', 'src_bytes', 'num_failed_logins', 'dst_host_srv_serror_rate', 'dst_bytes', 'label', 'num_access_files', 'dst_host_serror_rate', 'hot', 'num_compromised', 'serror_rate', 'num_outbound_cmds', 'same_srv_rate', 'wrong_fragment', 'srv_rerror_rate', 'num_shells', 'srv_count', 'duration', 'diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'urgent', 'dst_host_diff_srv_rate', 'su_attempted', 'dst_host_same_srv_rate', 'srv_serror_rate', 'dst_host_srv_count', 'num_root', 'root_shell']


In [83]:
# and add one-hot-encoded columns
numerics_data = df[col_numerics].copy()
numerics_cat_data = pd.concat([numerics_data, cat_data] ,axis=1)

In [84]:
labels = df['label'].copy()
transormed_labels = le.transform(labels)

KeyboardInterrupt: 

In [None]:
df
transormed_labels

In [None]:
# wrap verything together into a single function

def get_with_one_hot_encoded(df, col_categories, le):
    cat_data = pd.get_dummies(df[col_categories])
    col_numerics = list(set(df.columns.values) - set(col_categories))
    col_numerics.remove('label')

    numerics_data = df[col_numerics].copy()
    numerics_cat_data = pd.concat([numerics_data, cat_data] ,axis=1)

    labels = df['label'].copy()
    transormed_labels = le.transform(labels)

    return numerics_cat_data, transormed_labels

In [None]:
X, y = get_with_one_hot_encoded(df, col_categories, le)

In [None]:
# split dataset (with applied transforms) + labels
# into train/test with 75% ratio

X_train, X_test, y_train, y_test = train_test_split(
    X, # numerics+cat data,
    y, # transormed_labels,
    test_size = 0.25,
    random_state = SEED)

In [None]:
# validate dimensions
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

### Export data to .pickle

In [None]:
import pickle

In [None]:
preprocessed_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'le': le
}

pickle_path = './data/kdd/kddcup.data_10_percent.pkl'
with open(pickle_path, 'wb') as f:
    pickle.dump(preprocessed_data, f)

## GAN Approach

In [87]:
import pandas as pd
import collections

import numpy as np
from sklearn.preprocessing import MinMaxScaler

import mxnet as mx
from mxnet import gluon, init, npx #,np
from mxnet.gluon import nn
from statistics import mean

# https://github.com/tqdm/tqdm
from tqdm import tqdm

import matplotlib.pyplot as plt

#npx.set_np()

#seed data for reproducibility
mx.random.seed(SEED)

### Read preprocessed data

In [88]:
import pickle

# load data back from pickle file:
pickle_file = './data/kdd/kddcup.data_10_percent.pkl'
with open(pickle_file, 'rb') as f:
    preprocessed_data = pickle.load(f)

# check what's inside:
for key in preprocessed_data:
    print (key)

X_train
X_test
y_train
y_test
le


In [89]:
le = preprocessed_data["le"]
X_train = preprocessed_data["X_train"]
y_train = preprocessed_data["y_train"]
X_test = preprocessed_data["X_test"]
y_test = preprocessed_data["y_test"]

In [90]:
# lets identify what results do we actually have in labels..
np.unique(y_test)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17,
       18, 20, 21, 22])

In [91]:
# we need to distinguish 'Normal' vs rest of labels, hence lets preprocess them a bit:

# find Normal class label:
pd.DataFrame(le.classes_)
# normal. == 11

Unnamed: 0,0
0,back.
1,buffer_overflow.
2,ftp_write.
3,guess_passwd.
4,imap.
5,ipsweep.
6,land.
7,loadmodule.
8,multihop.
9,neptune.


In [92]:
NORMAL_LABEL = 11

# Semantics:
#  0 - real data, "normal"
#  1 - real, but not normal, = abnormal, =anomaly

# decode multi-class verctor into binary
y_train[y_train != NORMAL_LABEL] = 1
y_train[y_train == NORMAL_LABEL] = 0

y_test[y_test != NORMAL_LABEL] = 1
y_test[y_test == NORMAL_LABEL] = 0

In [93]:
# we want to keep Normal packets for GAN training

temp = X_train.copy()
temp['label'] = y_train
temp = temp.loc[temp['label'] == 0]
temp = temp.drop('label', axis = 1)
X_train = temp.copy()

# as a result:
# training dataset - will contain only normal packets
# test datatet - will contain both types of data

In [94]:
scaler01 = MinMaxScaler()

X_train = scaler01.fit_transform(X_train)
X_test = scaler01.transform(X_test)

In [121]:
dataset = {
    'X_train': X_train.astype(np.float32),
    'y_train': y_train.astype(np.float32),
    'X_test': X_test.astype(np.float32),
    'y_test': y_test.astype(np.float32)
}

In [96]:
print('Training set, normal packets = ', X_train.shape[0])
print('Testing set, normal packets = ', collections.Counter(y_test)[0])
print('Testing set, abnormal packets = ', collections.Counter(y_test)[1])

Training set, normal packets =  72970
Testing set, normal packets =  24308
Testing set, abnormal packets =  99198


In [97]:
### Lets define Generator network
def generator_model() :
    model = nn.Sequential()
    model.add(nn.Dense(128, in_units = 118, activation='relu'))
    model.add(nn.Dense(128, activation='relu'))
    model.add(nn.Dense(256, activation='relu'))
    model.add(nn.Dense(256, activation='relu'))
    model.add(nn.Dense(118, activation='tanh'))

    return model

In [124]:
def update_G(Z, net_D, net_G, loss, trainer_G):
    """Update generator."""
    batch_size = Z.shape[0]
    ones = mx.np.ones((batch_size,), ctx=Z.ctx, dtype=mx.np.float64)
    with mx.autograd.record():
        # We could reuse `fake_X` from `update_D` to save computation
        fake_X = net_G(Z)
        # Recomputing `fake_Y` is needed since `net_D` is changed
        fake_Y = net_D(fake_X)
        loss_G = loss(fake_Y, ones)
    loss_G.backward()
    trainer_G.step(batch_size)
    return float(loss_G.sum())

In [99]:
### Lets define Descriminator network
def discriminator_model() :
    model = nn.Sequential()
    model.add(nn.Dense(256, in_units = 118, activation='relu', ))
    #model.add(nn.Dropout(0.2))
    model.add(nn.Dense(256, activation='relu'))
    #model.add(nn.Dropout(0.2))
    model.add(nn.Dense(128, activation='relu'))
    #model.add(nn.Dropout(0.2))
    model.add(nn.Dense(128, activation = 'relu'))
    #model.add(nn.Dropout(0.2))
    model.add(nn.Dense(1, activation='sigmoid'))
    #model.add(Activation('sigmoid'))

    return model

In [125]:
def update_D(X, Z, net_D, net_G, loss, trainer_D):
    """Update discriminator."""
    batch_size = X.shape[0]
    ones = mx.np.ones((batch_size,), ctx=X.ctx, dtype=mx.np.float64)
    zeros = mx.np.zeros((batch_size,), ctx=X.ctx, dtype=mx.np.float64)
    with mx.autograd.record():
        real_Y = net_D(X)
        fake_X = net_G(Z)
        # Do not need to compute gradient for `net_G`, detach it from
        # computing gradients.
        fake_Y = net_D(fake_X.detach())
        loss_D = (loss(real_Y, ones) + loss(fake_Y, zeros)) / 2
    loss_D.backward()
    trainer_D.step(batch_size)
    return float(loss_D.sum())

In [101]:
ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu(0)

In [102]:
ctx

cpu(0)

In [126]:
def train_GAN (net_D, net_G, data_iter, num_epochs, lr, latent_dim,
          ctx=ctx):

    loss = gluon.loss.SigmoidBCELoss()

    trainer_hp = {'learning_rate': lr, 'beta1': 0.5}

    trainer_D = gluon.Trainer(net_D.collect_params(), 'adam', trainer_hp)
    trainer_G = gluon.Trainer(net_G.collect_params(), 'adam', trainer_hp)

    history_D = []
    history_G = []

    for epoch in range(1, num_epochs + 1):
        # Train one epoch and track loss per batches
        epoch_history_D = []
        epoch_history_G = []

        total_batches = len(data_iter)

        with tqdm(total=total_batches, position=0, leave=True) as pbar:
            for batch_num, (X) in enumerate(data_iter):
                batch_size = X.shape[0]
                Z = mx.np.random.normal(0, 1, size=(batch_size, latent_dim, 1, 1))
                X = X.as_in_ctx(ctx)
                Z = Z.as_in_ctx(ctx)

                loss_D = update_D(X, Z, net_D, net_G, loss, trainer_D)
                loss_D = loss_D/batch_size

                loss_G = update_G(Z, net_D, net_G, loss, trainer_G)
                loss_G = loss_G/batch_size

                epoch_history_D.append(loss_D)
                epoch_history_G.append(loss_G)

                pbar.set_description(f'Epoch: {epoch}, Batch: {batch_num}/{total_batches},'
                                     f'Loss(D): {loss_D:.3f}, Loss (G): {loss_G:.3f}')
                pbar.update()

            # Show generated examples
            # Z = np.random.normal(0, 1, size=(21, latent_dim, 1, 1), ctx=device)
            # Normalize the synthetic data to N(0, 1)
            # fake_x = net_G(Z).transpose(0, 2, 3, 1) / 2 + 0.5
            # imgs = np.concatenate(
            #     [np.concatenate([fake_x[i * 7 + j] for j in range(7)], axis=1)
            #      for i in range(len(fake_x)//7)], axis=0)

            # Show the losses
            loss_D = mean(epoch_history_D)
            loss_G = mean(epoch_history_G)

            history_D.append(loss_D)
            history_G.append(loss_G)

        pbar.close()

    return history_G, history_D

In [127]:
num_epochs = 4
batch_size = 512
learning_rate = 0.005

# Only the first `n_train` examples are used for training
ds_train = gluon.data.ArrayDataset(X_train.astype(mx.np.float64))
train_loader = gluon.data.DataLoader(ds_train, batch_size, shuffle=True)

net_G = generator_model()
net_G.initialize(init=mx.init.Xavier(), force_reinit=True, ctx=ctx)

net_D = discriminator_model()
net_D.initialize(init=mx.init.Xavier(), force_reinit=True, ctx=ctx)

In [129]:
hist_G, hist_D = train_GAN(net_D, net_G, train_loader, num_epochs, learning_rate, 118)

  0%|          | 0/143 [00:00<?, ?it/s]


MXNetError: Traceback (most recent call last):
  File "../src/io/../operator/elemwise_op_common.h", line 135
MXNetError: Check failed: assign(&dattr, vec.at(i)): Incompatible attr in node  at 1-th input: expected float64, got float32

In [None]:
plt.plot(hist_D, label = 'Discriminator')
plt.plot(hist_G, label = 'Generator')
plt.legend()