In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.metrics import  classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline 

In [6]:
import importlib
import ids_common
importlib.reload(ids_common)

<module 'ids_common' from 'd:\\stuff\\univ\\ids\\ids_common.py'>

In [3]:
SEED = random.randint(0, 42424242)
SEED = 42
print(f'SEED = {SEED}')

#TODO: rename to detection_class and classification_class
target_label_2_class = 'Attack_label' # 0 indicates normal and 1 indicates attacks
target_label_15_class = 'Attack_type'

def ds_detection_split(dataset):
    """"Normal vs Attack"""
    y = dataset[target_label_2_class]
    X = dataset.drop([target_label_2_class, target_label_15_class], axis=1, inplace=False)
    return train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

def ds_split(dataset):
    """Simple split, stratify against Attack_type"""
    return train_test_split(dataset, test_size=0.2, random_state=SEED, stratify=dataset[target_label_15_class])

def ds_classification_split(dataset):
    """Split on Attack_type"""
    loc_df = dataset.drop(dataset[dataset[target_label_2_class] == 0].index, inplace=False)
    y = loc_df[target_label_15_class]
    X = loc_df.drop([target_label_2_class, target_label_15_class], axis=1, inplace=False)
    return train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

SEED = 42


In [4]:
df_combined = pd.read_csv('./sampled_data/sample_1/combined_dataset.csv', low_memory=False)
ids_common.ferrag_preparation(df_combined, verbose=True)
df_combined.info()

Before: shape=(4066850, 63)
Before: dropna: NA: 0, DUPS: 603202
After: dropna: NA: 0, DUPS: 0
After: shape=(3463648, 48)
<class 'pandas.core.frame.DataFrame'>
Index: 3463648 entries, 0 to 4066849
Data columns (total 48 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   arp.opcode                 object 
 1   arp.hw.size                object 
 2   icmp.checksum              object 
 3   icmp.seq_le                object 
 4   icmp.unused                float64
 5   http.content_length        float64
 6   http.request.method        object 
 7   http.referer               object 
 8   http.request.version       object 
 9   http.response              float64
 10  http.tls_port              float64
 11  tcp.ack                    float64
 12  tcp.ack_raw                object 
 13  tcp.checksum               object 
 14  tcp.connection.fin         object 
 15  tcp.connection.rst         float64
 16  tcp.connection.syn         float64
 17  tcp.co

In [5]:
df_train_global, df_test_global = ds_split(df_combined)
print(f'df_train_global={df_train_global.shape}')
print(f'df_test_global={df_test_global.shape}')

X_train_detection, X_test_detection, y_train_detection, y_test_detection = ds_detection_split(df_train_global)
print(f'X_train_detection={X_train_detection.shape} y_train_detection={y_train_detection.shape}')
print(f'X_test_detection={X_test_detection.shape}   y_test_detection={y_test_detection.shape}')

X_train_classification, X_test_classification, y_train_classification, y_test_classification = ds_classification_split(df_train_global)
print(f'X_train_classification={X_train_classification.shape} y_train_classification={y_train_classification.shape}')
print(f'X_test_classification={X_test_classification.shape}   y_test_classification={y_test_classification.shape}')

df_train_global=(2770918, 48)
df_test_global=(692730, 48)
X_train_detection=(2216734, 46) y_train_detection=(2216734,)
X_test_detection=(554184, 46)   y_test_detection=(554184,)
X_train_classification=(812691, 46) y_train_classification=(812691,)
X_test_classification=(203173, 46)   y_test_classification=(203173,)


In [7]:
detector = DecisionTreeClassifier(random_state=SEED)

detector.fit(X_train_detection, y_train_detection)
y_train_detection_predict = detector.predict(X_train_detection)
y_test_detection_predict = detector.predict(X_test_detection)

ids_common.report(y_train_detection, 
       y_train_detection_predict, 
       y_test_detection,
       y_test_detection_predict)

ids_common.plot_cm(y_test_detection, y_test_detection_predict)

ValueError: could not convert string to float: '192.168.0.128'

In [None]:
classifier = DecisionTreeClassifier(random_state=SEED)

classifier.fit(X_train_classification, y_train_classification)
y_train_classification_predict = classifier.predict(X_train_classification)
y_test_classification_predict = classifier.predict(X_test_classification)

ids_common.report(y_train_classification, 
       y_train_classification_predict, 
       y_test_classification,
       y_test_classification_predict)

ids_common.plot_cm(y_test_classification, y_test_classification_predict)

In [None]:
y_global_label = df_test_global[target_label_2_class]
y_global_type = df_test_global[target_label_15_class]
X_global = df_test_global.drop([target_label_2_class, target_label_15_class], axis=1, inplace=False)


y_global_detection_predict = detector.predict(X_global)

ds_classification = df_test_global[y_global_detection_predict==1]
X_global_classification = ds_classification.drop([target_label_2_class, target_label_15_class], axis=1, inplace=False)

y_global_classification_predict = classifier.predict(X_global_classification)


global_result = pd.Series(y_global_detection_predict, index=X_global.index.copy())
global_result = global_result.map(lambda x: 'Normal' if x==0 else 'Attack')
global_result.loc[X_global_classification.index] = y_global_classification_predict

print('DETECTION:')
print(classification_report(y_global_label, y_global_detection_predict))
ids_common.plot_cm(y_global_label, y_global_detection_predict)

print('CLASSIFICATION:')
print(classification_report(y_global_type, global_result))
ids_common.plot_cm(y_global_type, global_result)