In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, auc
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv('kddcup99_csv.csv', sep=',')

In [4]:
df.shape

(494020, 42)

# KDD Cup 99

https://kdd.ics.uci.edu/databases/kddcup99/task.html

Dataset obsahuje zaznamenanou komunikaci v siti ktera mela simulovat realne provoz ve vojenske siti. Kazdy zaznam obsahuje informace rozparsovaneho packetu. Z jednotlivych atributu chceme zjistit, zda se jedna o dobrou, ci spatnou komunikaci (utok). Nasledne se tento utok deli na dalsi kategorie.

* DOS: denial-of-service, e.g. syn flood;
* R2L: unauthorized access from a remote machine, e.g. guessing password;
* U2R: unauthorized access to local superuser (root) privileges, e.g., various ``buffer overflow’’ attacks;
* probing: surveillance and other probing, e.g., port scanning.

My se temto kategoriim nebudeme venovat, jednoduse budeme rozdelovat na good a bad, jak je napsano v zadani.

Podivame se na par zakladnich atributu.

In [5]:
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
lnum_compromised                 int64
lroot_shell                      int64
lsu_attempted                    int64
lnum_root                        int64
lnum_file_creations              int64
lnum_shells                      int64
lnum_access_files                int64
lnum_outbound_cmds               int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [6]:
df.dst_host_rerror_rate.value_counts()

0.00    458791
1.00     26040
0.01      1596
0.02       932
0.04       801
         ...  
0.63        19
0.34        18
0.46        15
0.39        14
0.79        13
Name: dst_host_rerror_rate, Length: 101, dtype: int64

## Nas target atr. neboli vyhodnocene pripojeni
Abychom byli schopni provest klasifikaci, budeme muset objekt filtrovat, "normal" bude good connection a zbytek bad connection.

respektive 0-Good, 1-Bad 

In [7]:
df.label.unique()

array(['normal', 'buffer_overflow', 'loadmodule', 'perl', 'neptune',
       'smurf', 'guess_passwd', 'pod', 'teardrop', 'portsweep', 'ipsweep',
       'land', 'ftp_write', 'back', 'imap', 'satan', 'phf', 'nmap',
       'multihop', 'warezmaster', 'warezclient', 'spy', 'rootkit'],
      dtype=object)

In [8]:
df.label.value_counts()

smurf              280790
neptune            107201
normal              97277
back                 2203
satan                1589
ipsweep              1247
portsweep            1040
warezclient          1020
teardrop              979
pod                   264
nmap                  231
guess_passwd           53
buffer_overflow        30
land                   21
warezmaster            20
imap                   12
rootkit                10
loadmodule              9
ftp_write               8
multihop                7
phf                     4
perl                    3
spy                     2
Name: label, dtype: int64

## Protokoly pripojeni

In [9]:
df.protocol_type.unique()

array(['tcp', 'udp', 'icmp'], dtype=object)

## Servisy

In [10]:
df.service.unique()

array(['http', 'smtp', 'finger', 'domain_u', 'auth', 'telnet', 'ftp',
       'eco_i', 'ntp_u', 'ecr_i', 'other', 'private', 'pop_3', 'ftp_data',
       'rje', 'time', 'mtp', 'link', 'remote_job', 'gopher', 'ssh',
       'name', 'whois', 'domain', 'login', 'imap4', 'daytime', 'ctf',
       'nntp', 'shell', 'IRC', 'nnsp', 'http_443', 'exec', 'printer',
       'efs', 'courier', 'uucp', 'klogin', 'kshell', 'echo', 'discard',
       'systat', 'supdup', 'iso_tsap', 'hostnames', 'csnet_ns', 'pop_2',
       'sunrpc', 'uucp_path', 'netbios_ns', 'netbios_ssn', 'netbios_dgm',
       'sql_net', 'vmnet', 'bgp', 'Z39_50', 'ldap', 'netstat', 'urh_i',
       'X11', 'urp_i', 'pm_dump', 'tftp_u', 'tim_i', 'red_i'],
      dtype=object)

### Uz ted vime, ze budeme muset prevest objekty na dummies aby se s nimi nas model vyporadal

In [11]:
df.flag.value_counts()

SF        378439
S0         87007
REJ        26875
RSTR         903
RSTO         579
SH           107
S1            57
S2            24
RSTOS0        11
S3            10
OTH            8
Name: flag, dtype: int64

In [12]:
df.describe(exclude=np.number)

Unnamed: 0,protocol_type,service,flag,label
count,494020,494020,494020,494020
unique,3,66,11,23
top,icmp,ecr_i,SF,smurf
freq,283602,281400,378439,280790


## Chybejici atributy ?
Zadne nejsou 

In [13]:
df.apply(lambda x: x.isna().sum()).sort_values(ascending=False)

duration                       0
dst_host_count                 0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_host_rate             0
dst_host_srv_count             0
protocol_type                  0
dst_host_same_srv_rate         0
dst_host_diff_srv_rate         0
dst_host_same_src_port_rate    0
dst_host_srv_diff_host_rate    0
dst_host_serror_rate           0
dst_host_srv_serror_rate       0
dst_host_rerror_rate           0
dst_host_srv_rerror_rate       0
count                          0
is_guest_login                 0
is_host_login                  0
lnum_outbound_cmds             0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent    

# Data Preprocesing

* prevod "label" na 0-Good, 1-Bad
* ostatni object atr na dummies, protoze nam nejde o poradi a chceme stejne vzdalenosti mezi atr. 

In [14]:
df.label.value_counts()

smurf              280790
neptune            107201
normal              97277
back                 2203
satan                1589
ipsweep              1247
portsweep            1040
warezclient          1020
teardrop              979
pod                   264
nmap                  231
guess_passwd           53
buffer_overflow        30
land                   21
warezmaster            20
imap                   12
rootkit                10
loadmodule              9
ftp_write               8
multihop                7
phf                     4
perl                    3
spy                     2
Name: label, dtype: int64

In [15]:
df.loc[df.label != 'normal', 'label'] = 'Bad' 
enc_label = OrdinalEncoder(categories=[['normal', 'Bad']])
df['label'] = enc_label.fit_transform(df[['label']])[:, 0]
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0.0
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# df = df.drop(['protocol_type', 'service', 'flag'], axis=1)

In [17]:
df = df.join(pd.get_dummies(df.protocol_type, prefix='protocol_type')).drop(['protocol_type'], axis=1)

In [18]:
df = df.join(pd.get_dummies(df.service, prefix='service')).drop(['service'], axis=1)

In [19]:
df = df.join(pd.get_dummies(df.flag, prefix='flag')).drop(['flag'], axis=1)

In [20]:
df.dtypes

duration          int64
src_bytes         int64
dst_bytes         int64
land              int64
wrong_fragment    int64
                  ...  
flag_S1           uint8
flag_S2           uint8
flag_S3           uint8
flag_SF           uint8
flag_SH           uint8
Length: 119, dtype: object

# Klasifikace

In [25]:
X, y = df.loc[:, df.columns != 'label'], df.loc[:, 'label']
X.shape
y.loc[y==0].shape, y.loc[y==1].shape, y.shape

((97277,), (396743,), (494020,))

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((395216, 118), (98804, 118), (395216,), (98804,))

In [571]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9997921377685675

In [572]:
skf = StratifiedKFold(n_splits=5)
scores = list()
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(f1_score(y_test, y_pred))
    print(f'Bad packet packet ratio in train set: {y_train.value_counts(normalize=True)[1]:.2}; Bad packet ratio in test set: {y_test.value_counts(normalize=True)[1]:.2}')
    
scores

Bad packet packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8
Bad packet packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8
Bad packet packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8
Bad packet packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8
Bad packet packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8


[0.9902399918560558,
 0.9997480092732587,
 0.9999054897834456,
 0.9986527831988214,
 0.9994960312460628]

In [573]:
np.mean(scores), np.min(scores), np.max(scores)

(0.997608461071529, 0.9902399918560558, 0.9999054897834456)

In [574]:
skf = StratifiedKFold(n_splits=5, shuffle=True)
scores = list()
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(f1_score(y_test, y_pred))
    print(f'Bad packet ratio in train set: {y_train.value_counts(normalize=True)[1]:.2}; Bad packet ratio in test set: {y_test.value_counts(normalize=True)[1]:.2}')
    
scores

Bad packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8
Bad packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8
Bad packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8
Bad packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8
Bad packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8


[0.9998550697240653,
 0.9998928816720541,
 0.9998172549167881,
 0.9998550752033672,
 0.9997794594867077]

In [575]:
clf = DecisionTreeClassifier(splitter='random')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9997920513192139

In [576]:
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9998235531357129

In [577]:
clf = DecisionTreeClassifier(max_leaf_nodes=15)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9992562150168928

In [578]:
clf = MLPClassifier(hidden_layer_sizes=(5,3), activation='relu', solver='adam', max_iter=1000, random_state=13)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9984242836793606

In [579]:
skf = KFold(n_splits=5)
scores = list()
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf = MLPClassifier(hidden_layer_sizes=(10,5,3), activation='tanh', solver='adam', max_iter=500, random_state=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores.append(f1_score(y_test, y_pred))
    print(f'Bad packet packet ratio in train set: {y_train.value_counts(normalize=True)[1]:.2}; Bad packet ratio in test set: {y_test.value_counts(normalize=True)[1]:.2}')
scores

Bad packet packet ratio in train set: 0.9; Bad packet ratio in test set: 0.43
Bad packet packet ratio in train set: 0.79; Bad packet ratio in test set: 0.85
Bad packet packet ratio in train set: 0.75; Bad packet ratio in test set: 1.0
Bad packet packet ratio in train set: 0.77; Bad packet ratio in test set: 0.93
Bad packet packet ratio in train set: 0.8; Bad packet ratio in test set: 0.8


[0.9720382457069916,
 0.9911091380303322,
 1.0,
 0.9990048236145024,
 0.9990633937831976]

In [580]:
clf = MLPClassifier(hidden_layer_sizes=(10,5,3), activation='tanh', solver='adam', max_iter=500, random_state=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9990633937831976

In [581]:
clf = MLPClassifier(hidden_layer_sizes=(10,5,3), activation='tanh', solver='lbfgs', max_iter=100, random_state=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.9943432040142578

In [582]:
clf = MLPClassifier(hidden_layer_sizes=(10,5,3), activation='tanh', solver='sgd', max_iter=500, random_state=5, learning_rate='adaptive')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9977272870354631

In [583]:
clf = MLPClassifier(hidden_layer_sizes=(10,5,3), activation='relu', solver='adam', max_iter=500, random_state=5, early_stopping=True)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9985363861930338

In [584]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)


0.9999182724043632

In [585]:
clf = RandomForestClassifier(criterion='entropy')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9999182724043632

In [23]:
clf = RandomForestClassifier(max_depth=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9996975692125458

In [24]:
clf = RandomForestClassifier(max_features='log2', random_state=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_score(y_test,y_pred)

0.9998425067879575

# Zaver
V datasetu se nachazi vetsi pocet spatnych packetu nez tech dobrych, takze jsem pro par kontrol pouzil kfold. Ukazalo se ze f1 skore je verohodne protoze ve vsech pripadech bylo velice podobne

Nejakou vetsi optimalizaci pomoci hyper parametru se mi zkousela dost spatne, protoze i defaultni nastaveni vytvarelo model s velikou presnosti a rozdily byly vetsinou az hloubeji za desetinou carkou.

Jedinny pripad kdy skore kleslo na hodnotu 0.97 bylo pri velkem rozdilu v poctu spatnych packetu v train setu a v test setu
* Bad packet packet ratio in train set: 0.9; Bad packet ratio in test set: 0.43

Dlouho jsem premyslel jestli je to vubec dobre, ze to ve vetsine pripadech vychazi kolem 0.99, prosel jsem si nekolikrat preprocesing dat, ale nevidim nikde zadnou chybu.