In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.metrics import  classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

%matplotlib inline 

In [2]:
# Import the project helper module and immediately reload it, so that edits
# made to ids_common.py after the first import are picked up without
# restarting the kernel.
import importlib
import ids_common
importlib.reload(ids_common)

<module 'ids_common' from 'd:\\stuff\\univ\\ids\\ids_common.py'>

In [3]:
# Fixed seed used for the stochastic steps of this notebook so that a
# "Restart & Run All" reproduces the same numbers.
# (A randomly drawn value was previously assigned first and then overwritten
# on the next line; that dead assignment has been dropped.)
SEED = 42
print(f'SEED = {SEED}')

SEED = 42


In [4]:
# Load the cleaned dataset from disk. low_memory=False makes pandas parse the
# file in a single pass instead of in internal chunks.
df_orig = pd.read_csv(
    filepath_or_buffer='./sampled_data/sample_1/clean_dataset.csv',
    low_memory=False,
)

In [5]:
# Number of rows per attack type in the full dataset (shows the class imbalance).
df_orig['Attack_type'].value_counts()

Attack_type
Normal                   2157201
DDoS_UDP                  320144
Password                  288556
DDoS_TCP                  201994
DDoS_ICMP                  99663
DDoS_HTTP                  84789
Vulnerability_scanner      72128
SQL_injection              46888
Uploading                  30172
Backdoor                   23248
Port_Scanning              19974
XSS                        13962
Ransomware                  9141
OS_Fingerprinting            724
MITM                         358
Name: count, dtype: int64

In [27]:
# Build a class-balanced working set: keep at most MAX_ROWS_PER_CLASS rows of
# every attack type (classes with fewer rows are kept in full).
#
# - random_state=SEED makes the draw repeatable; without it every run picks a
#   different subset and all downstream scores change.
# - The per-class samples are collected in a list and concatenated once,
#   instead of growing a DataFrame inside the loop.
# - sort=False keeps the classes in order of first appearance in df_orig.
MAX_ROWS_PER_CLASS = 10000

class_samples = [
    class_rows.sample(n=min(MAX_ROWS_PER_CLASS, len(class_rows)), random_state=SEED)
    for _, class_rows in df_orig.groupby('Attack_type', sort=False)
]
df = pd.concat(class_samples, ignore_index=True)

# Rows per attack type after balancing.
df['Attack_type'].value_counts()

Attack_type
Normal                   10000
DDoS_UDP                 10000
Password                 10000
XSS                      10000
DDoS_ICMP                10000
DDoS_TCP                 10000
SQL_injection            10000
Uploading                10000
DDoS_HTTP                10000
Backdoor                 10000
Port_Scanning            10000
Vulnerability_scanner    10000
Ransomware                9141
OS_Fingerprinting          724
MITM                       358
Name: count, dtype: int64

In [28]:
# Split the balanced sample into train and test partitions using the
# notebook-wide SEED (instead of a separate hard-coded literal), then derive
# the feature matrix and target vector for each partition.
# NOTE(review): ds_split / get_X_y live in ids_common and are not visible
# here; target_label_2_class presumably selects a binary target — confirm.
df_train, df_test = ids_common.ds_split(df, seed=SEED)
print(f'df_train={df_train.shape}')
print(f'df_test={df_test.shape}')

X_train, y_train = ids_common.get_X_y(df_train, ids_common.target_label_2_class)
print(f'X_train={X_train.shape} y_train={y_train.shape}')

X_test, y_test = ids_common.get_X_y(df_test, ids_common.target_label_2_class)
print(f'X_test={X_test.shape} y_test={y_test.shape}')

df_train=(104178, 61)
df_test=(26045, 61)
X_train=(104178, 59) y_train=(104178,)
X_test=(26045, 59) y_test=(26045,)


In [29]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree, depth-limited to 16 levels.
cls = DecisionTreeClassifier(criterion='entropy', max_depth=16, random_state=SEED)
cls.fit(X_train, y_train)

# Accuracy on both partitions.
train_score = cls.score(X_train, y_train)
test_score = cls.score(X_test, y_test)
print(f'{type(cls).__name__}:')
print(f'Train score: {train_score}')
print(f'Test score:  {test_score}')

# Ten highest (importance, feature name) pairs. Importances are rounded to two
# decimals before sorting, so ties are ordered by feature name (descending).
# (The variable name's spelling is kept as-is: it is a notebook-level name.)
rounded_importances = cls.feature_importances_.round(2)
fetures_scores = sorted(
    zip(rounded_importances, cls.feature_names_in_),
    reverse=True,
)[:10]
print(fetures_scores)

# Keep only the names whose rounded importance is non-zero, i.e. anything
# that rounds to 0.00 is dropped.
features = [name for importance, name in fetures_scores if importance > 0]

print(features)

DecisionTreeClassifier:
Train score: 0.9873389775192459
Test score:  0.9882511038587061
[(0.5, 'tcp.ack'), (0.24, 'mqtt.msgtype'), (0.08, 'tcp.seq'), (0.07, 'tcp.checksum'), (0.06, 'http.tls_port'), (0.04, 'tcp.flags'), (0.0, 'udp.time_delta'), (0.0, 'udp.stream'), (0.0, 'tcp.len'), (0.0, 'tcp.flags.ack')]
['tcp.ack', 'mqtt.msgtype', 'tcp.seq', 'tcp.checksum', 'http.tls_port', 'tcp.flags']


In [30]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree entropy forest; each split considers log2(n_features) candidates.
cls = RandomForestClassifier(
    criterion='entropy',
    max_depth=128,
    max_features='log2',
    n_estimators=100,
    random_state=SEED,
)
cls.fit(X_train, y_train)

# Accuracy on both partitions.
train_score = cls.score(X_train, y_train)
test_score = cls.score(X_test, y_test)
print(f'{type(cls).__name__}:')
print(f'Train score: {train_score}')
print(f'Test score:  {test_score}')

# All (importance, feature name) pairs, highest first; importances are rounded
# to two decimals before sorting.
rounded_importances = cls.feature_importances_.round(2)
fetures_scores = sorted(
    zip(rounded_importances, cls.feature_names_in_),
    reverse=True,
)

# The list is sorted in descending order, so stop at the first pair whose
# rounded importance is not positive.
for importance, name in fetures_scores:
    if importance <= 0:
        break
    print((importance, name))

# Names with a non-zero rounded importance. This rebinds `features`, which a
# later cell uses to select columns.
features = [name for importance, name in fetures_scores if importance > 0]

print(features)

RandomForestClassifier:
Train score: 0.9999616041774655
Test score:  0.9858322134766749
(0.27, 'tcp.ack')
(0.17, 'tcp.seq')
(0.1, 'tcp.checksum')
(0.08, 'tcp.ack_raw')
(0.06, 'tcp.flags')
(0.06, 'mqtt.msgtype')
(0.05, 'tcp.len')
(0.05, 'mqtt.hdrflags')
(0.03, 'mqtt.len')
(0.02, 'tcp.connection.rst')
(0.01, 'udp.stream')
(0.01, 'tcp.flags.ack')
(0.01, 'tcp.connection.synack')
(0.01, 'tcp.connection.fin')
(0.01, 'mqtt.ver')
(0.01, 'mqtt.topic_len')
(0.01, 'mqtt.conflags')
(0.01, 'icmp.seq_le')
(0.01, 'icmp.checksum')
(0.01, 'http.tls_port')
['tcp.ack', 'tcp.seq', 'tcp.checksum', 'tcp.ack_raw', 'tcp.flags', 'mqtt.msgtype', 'tcp.len', 'mqtt.hdrflags', 'mqtt.len', 'tcp.connection.rst', 'udp.stream', 'tcp.flags.ack', 'tcp.connection.synack', 'tcp.connection.fin', 'mqtt.ver', 'mqtt.topic_len', 'mqtt.conflags', 'icmp.seq_le', 'icmp.checksum', 'http.tls_port']


In [31]:

from sklearn.linear_model import LogisticRegression

# Hyper-parameters below come from an earlier grid search. Its recorded output
# is kept for reference; the scores were produced by a previous run and can
# differ from what this cell prints now.
#
# [GS] Tuned:
# [GS] Score: 0.9352859928757888
# [GS] Parms:  {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
# [GS] Estimator: LogisticRegression(C=1, max_iter=1000, random_state=42)
# LogisticRegression:
# Train score: 0.9317354638260098
# Test score:  0.9307774227902024

cls = LogisticRegression(
    C=1,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    random_state=SEED,
)

# Grid search used to obtain the parameters above (disabled):
#
# grid_search = GridSearchCV(cls, {
#                                 'solver': ['lbfgs','newton-cholesky'],
#                                 'penalty': ['l2', None],
#                                 'C': [0.1, 1, 10, 100, 1000]
#                                 }, cv=2, verbose=0)
#
# grid_search.fit(X_train, y_train)
#
# print('[GS] Tuned:')
# print('[GS] Score:', grid_search.best_score_)
# print('[GS] Parms: ', grid_search.best_params_)
# print('[GS] Estimator:', grid_search.best_estimator_)
#
# cls = grid_search.best_estimator_

cls.fit(X_train, y_train)

# Accuracy on both partitions.
train_score = cls.score(X_train, y_train)
test_score = cls.score(X_test, y_test)
print(f'{type(cls).__name__}:')
print(f'Train score: {train_score}')
print(f'Test score:  {test_score}')

LogisticRegression:
Train score: 0.9270383382288007
Test score:  0.9269725475139182


In [32]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training samples.
cls = KNeighborsClassifier(n_neighbors=5)
cls.fit(X_train, y_train)

# Accuracy on both partitions.
train_score = cls.score(X_train, y_train)
test_score = cls.score(X_test, y_test)
print(f'{type(cls).__name__}:')
print(f'Train score: {train_score}')
print(f'Test score:  {test_score}')

KNeighborsClassifier:
Train score: 0.9321065868033558
Test score:  0.9209061240161259


In [33]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
cls = GaussianNB()
cls.fit(X_train, y_train)

# Accuracy on both partitions.
train_score = cls.score(X_train, y_train)
test_score = cls.score(X_test, y_test)
print(f'{type(cls).__name__}:')
print(f'Train score: {train_score}')
print(f'Test score:  {test_score}')

GaussianNB:
Train score: 0.8892088540766765
Test score:  0.8887694375119984


In [34]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic discriminant analysis with default settings.
cls = QuadraticDiscriminantAnalysis()
cls.fit(X_train, y_train)

# Accuracy on both partitions.
train_score = cls.score(X_train, y_train)
test_score = cls.score(X_test, y_test)
print(f'{type(cls).__name__}:')
print(f'Train score: {train_score}')
print(f'Test score:  {test_score}')



QuadraticDiscriminantAnalysis:
Train score: 0.9487895716945997
Test score:  0.9503551545402189


In [35]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with L2 penalty alpha=1 and up to 1000 training
# iterations. Seeded from the notebook-wide SEED (rather than a separate
# literal) so that changing SEED affects every model consistently.
cls = MLPClassifier(alpha=1, max_iter=1000, random_state=SEED)
cls.fit(X_train, y_train)

# Accuracy on both partitions.
print(f'{cls.__class__.__name__}:')
print(f'Train score: {cls.score(X_train, y_train)}')
print(f'Test score:  {cls.score(X_test, y_test)}')


MLPClassifier:
Train score: 0.9259920520647353
Test score:  0.9268957573430601


In [37]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost using the discrete SAMME boosting algorithm. Seeded from the
# notebook-wide SEED (rather than a separate literal) so that changing SEED
# affects every model consistently.
cls = AdaBoostClassifier(algorithm="SAMME", random_state=SEED)
cls.fit(X_train, y_train)

# Accuracy on both partitions.
print(f'{cls.__class__.__name__}:')
print(f'Train score: {cls.score(X_train, y_train)}')
print(f'Test score:  {cls.score(X_test, y_test)}')


AdaBoostClassifier:
Train score: 0.9803221409510645
Test score:  0.9816855442503359


In [38]:
import xgboost as xgb

# Gradient-boosted trees with a binary logistic objective. Seeded from the
# notebook-wide SEED (rather than a separate literal) so that changing SEED
# affects every model consistently.
cls = xgb.XGBClassifier(objective="binary:logistic", random_state=SEED)
# Multi-class alternative (disabled):
# cls = xgb.XGBClassifier(objective="multi:softprob", random_state=SEED)
cls.fit(X_train, y_train)

# Accuracy on both partitions.
print(f'{cls.__class__.__name__}:')
print(f'Train score: {cls.score(X_train, y_train)}')
print(f'Test score:  {cls.score(X_test, y_test)}')

XGBClassifier:
Train score: 0.987540555587552
Test score:  0.9882511038587061


In [11]:

from sklearn.svm import SVC

# Train the SVM on the reduced column set held in `features` (built by an
# earlier feature-importance cell). The test partition must be reduced to the
# same columns: scoring a model fitted on X_train[features] against the
# full-width X_test is a feature mismatch.
x_train = X_train[features]
x_test = X_test[features]

# Linear-kernel SVM, seeded from the notebook-wide SEED.
cls = SVC(kernel="linear", C=0.25, random_state=SEED)

cls.fit(x_train, y_train)

# Accuracy on both partitions, each restricted to the selected columns.
print(f'{cls.__class__.__name__}:')
print(f'Train score: {cls.score(x_train, y_train)}')
print(f'Test score:  {cls.score(x_test, y_test)}')

In [None]:
# Split the full (unbalanced) dataset into train and test partitions using the
# notebook-wide SEED instead of a separate hard-coded literal.
# This cell must be run before any cell that reads df_train_global.
df_train_global, df_test_global = ids_common.ds_split(df_orig, seed=SEED)

print(f'df_train_global={df_train_global.shape}')
print(f'df_test_global={df_test_global.shape}')

In [6]:
# Derive features/target from the full-dataset training partition.
# NOTE: this rebinds X_train / y_train, which earlier cells bound to the
# class-balanced sample; cells run after this one see the full-dataset
# versions.
X_train, y_train = ids_common.get_X_y(df_train_global, ids_common.target_label_2_class)

In [16]:
# Univariate feature selection: keep the 20 best-scoring features under two
# different scoring functions so the selections can be compared.
# chi2 requires non-negative feature values.
# NOTE(review): the recorded run emitted a warning from the F-statistic
# division in f_classif — presumably caused by constant columns; verify.
kbest_chi2 = SelectKBest(score_func=chi2, k=20)
kbest_chi2.fit(X_train, y_train)

kbest_anova = SelectKBest(score_func=f_classif, k=20)
kbest_anova.fit(X_train, y_train)

  f = msb / msw


In [17]:
# Names of the 20 features selected by the ANOVA F-test (f_classif) selector.
kbest_anova.get_feature_names_out()

array(['icmp.checksum', 'icmp.seq_le', 'http.response', 'tcp.ack_raw',
       'tcp.checksum', 'tcp.connection.syn', 'tcp.flags', 'udp.stream',
       'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags',
       'mqtt.len', 'mqtt.msgtype', 'mqtt.proto_len',
       'http.request.method-0.0', 'http.request.method-GET',
       'http.request.version-0.0', 'mqtt.protoname-0.0',
       'mqtt.protoname-MQTT', 'mqtt.topic-0.0'], dtype=object)

In [22]:
# Names of the 20 features selected by the chi-squared selector.
kbest_chi2.get_feature_names_out()

array(['icmp.checksum', 'icmp.seq_le', 'http.content_length',
       'http.tls_port', 'tcp.ack', 'tcp.ack_raw', 'tcp.checksum',
       'tcp.connection.synack', 'tcp.flags', 'tcp.flags.ack', 'tcp.len',
       'tcp.seq', 'udp.stream', 'dns.retransmit_request_in',
       'mqtt.hdrflags', 'mqtt.len', 'mqtt.msgtype', 'mqtt.topic_len',
       'mqtt.ver', 'mbtcp.len'], dtype=object)

In [21]:
# Features picked by exactly one of the two selectors (ANOVA vs. chi-squared):
# fs1 holds the ANOVA selection, fs2 the chi-squared selection.
fs1, fs2 = (
    set(selector.get_feature_names_out())
    for selector in (kbest_anova, kbest_chi2)
)

fs1 ^ fs2

{'dns.retransmit_request_in',
 'http.content_length',
 'http.request.method-0.0',
 'http.request.method-GET',
 'http.request.version-0.0',
 'http.response',
 'http.tls_port',
 'mbtcp.len',
 'mqtt.conflag.cleansess',
 'mqtt.conflags',
 'mqtt.proto_len',
 'mqtt.protoname-0.0',
 'mqtt.protoname-MQTT',
 'mqtt.topic-0.0',
 'mqtt.topic_len',
 'mqtt.ver',
 'tcp.ack',
 'tcp.connection.syn',
 'tcp.connection.synack',
 'tcp.flags.ack',
 'tcp.len',
 'tcp.seq'}