In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.tree import export_graphviz
from sklearn.preprocessing import LabelEncoder

from typing import Union, List

import os
import random
import warnings

In [3]:
folder: str = "/data/experience/CSV"

sub_folders: list = [
    '/1.Deauth',
    '/2.Disas',
    '/3.(Re)Assoc',
    '/4.Rogue_AP',
    '/5.Krack',
    '/6.Kr00k',
    '/7.SSH',
    '/8.Botnet',
    '/9.Malware',
    '/10.SQL_Injection',
    '/11.SSDP',
    '/12.Evil_Twin',
    '/13.Website_spoofing'
]

In [4]:
# Load dataset
def load_fusion_dataset(idx: Union[int, List[int]], all: bool = False) -> pd.DataFrame:
    res = []
    
    for sub_folder in sub_folders:
        path: str = folder + sub_folder
        files: list = os.listdir(path)
        
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=pd.errors.DtypeWarning)
        
            if all: df = pd.concat([pd.read_csv(path + '/' + file) for file in files])
            elif type(idx) == list: df = pd.concat([pd.read_csv(path + '/' + files[i % len(files)]) for i in idx])
            else: df = pd.read_csv(path + '/' + files[idx % len(files)])
        
        res.append(df)
    
    return pd.concat(res, axis=0)
        

def load_dataset(num: int = 1, all: bool = False) -> pd.DataFrame:
    if num < 1 or num > 13: return None
    
    path: str = folder + sub_folders[num-1]
    files: list = os.listdir(path)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=pd.errors.DtypeWarning)
        df = pd.concat([pd.read_csv(path + '/' + file) for file in files], ignore_index=True) if all else pd.read_csv(path + '/' + files[0])
    
    return df

In [65]:
# Preprocessing
def wireless_preprocessing(df_copy: pd.DataFrame) -> pd.DataFrame:
    # Filter protocols related with wireless attacks
    df_copy = df_copy[[col for col in df_copy.columns if col.startswith(('frame', 'radiotap', 'wlan', 'eapol', 'Label'))]]

    df_copy = df_copy.dropna(subset=['Label'])

    # fill NaN data
    for col in ['frame.encap_type', 'frame.len', 'frame.number', 'frame.time_delta', 'frame.time_epoch',
                'radiotap.channel.freq', 'radiotap.length', 'radiotap.timestamp.ts', 'wlan.fc.frag',
                'wlan.fc.order', 'wlan.fc.moredata', 'wlan.fc.protected', 'wlan.fc.pwrmgt',
                'wlan.fc.type', 'wlan.fc.retry', 'wlan.fc.subtype', 'wlan_radio.duration',
                'wlan_radio.channel', 'wlan_radio.data_rate', 'wlan_radio.frequency',
                'wlan_radio.signal_dbm', 'radiotap.datarate', 'radiotap.mactime', 'eapol.type']:
        df_copy[col] = df_copy[col].fillna(-10000.0)
    
    df_copy['eapol.type'] = df_copy['eapol.type'].fillna(-1.0)
    df_copy['eapol.len'] = df_copy['eapol.len'].fillna(0.0)
    df_copy['eapol.keydes.replay_counter'] = df_copy['eapol.keydes.replay_counter'].fillna(-1.0)
    df_copy['eapol.keydes.key_len'] = df_copy['eapol.keydes.key_len'].fillna(0.0)
    df_copy['wlan.duration'] = df_copy['wlan.duration'].fillna(0.0) 
    df_copy['wlan_radio.timestamp'] = df_copy['wlan_radio.timestamp'].fillna(0.0) 
    df_copy['radiotap.dbm_antsignal'] = df_copy['radiotap.dbm_antsignal'].fillna(-1000)
    df_copy['radiotap.rxflags'] = df_copy['radiotap.rxflags'].fillna('0xffffffff')
    df_copy['wlan.fc.ds'] = df_copy['wlan.fc.ds'].fillna('0xffffffff')
    df_copy['wlan_radio.phy'] = df_copy['wlan_radio.phy'].fillna(-1000.0)
    
    df_copy.loc[df_copy['wlan.country_info.fnm'] == 1.0, 'wlan.country_info.fnm'] = '1.0'
    df_copy.loc[df_copy['wlan_radio.phy'] == 'Normal', 'wlan_radio.phy'] = -1000.0

    # Change type of several flags columns
    df_copy.loc[:, 'radiotap.rxflags'] = df_copy['radiotap.rxflags'].apply(lambda x: int(x, 16) if type(x)==str else x)
    df_copy.loc[:, 'wlan_radio.phy'] = df_copy['wlan_radio.phy'].apply(lambda x: float(x) if type(x)==str else x)
    df_copy.loc[:, 'Label'] = df_copy['Label'].apply(lambda x: x != 'Normal')
    df_copy.loc[:, 'radiotap.present.tsft'] = df_copy['radiotap.present.tsft'].apply(lambda x: x == '1-0-0')
    df_copy.loc[:, 'radiotap.dbm_antsignal'] = df_copy['radiotap.dbm_antsignal'].apply(lambda x: int(x) if isinstance(x, (int, float)) else -int(x.split('-')[1]))
    df_copy.loc[:, 'wlan.fc.ds'] = df_copy['wlan.fc.ds'].apply(lambda x: int(x, 16))
    df_copy.loc[:, 'wlan.analysis.has_key'] = df_copy['wlan.analysis.kck'].isna()
    df_copy.loc[:, 'wlan.fixed.used'] = df_copy['wlan.fixed.timestamp'].notna()
    df_copy.loc[:, 'wlan.rsn.used'] = df_copy['wlan.rsn.ie.gtk.key'].notna()
    df_copy.loc[:, 'eapol.used'] = df_copy['eapol.type'].notna()
    
    df_copy = df_copy.astype({
        'radiotap.channel.flags.cck': 'bool', 
        'radiotap.channel.flags.ofdm': 'bool', 
        'Label': 'bool',
        'radiotap.present.tsft': 'bool',
        'radiotap.dbm_antsignal': 'int64',
        'radiotap.rxflags': 'int64',
        'wlan.fc.ds': 'int64'
    })

    # delete unnecessary columns
    drop_columns = [
        'frame.time', 'frame.time_delta_displayed', 'frame.time_relative',
        'wlan.analysis.kck', 'wlan.analysis.kek', 'wlan_radio.end_tsf', 'wlan_radio.start_tsf'
    ]
    df_copy = df_copy.drop(columns=drop_columns, axis=1)
    df_copy = df_copy[[col for col in df_copy.columns if not col.startswith(('wlan.fixed', 'wlan.rsn', 'wlan_rsn'))]]

    # Fill category data
    df_copy['wlan.bssid'] = df_copy['wlan.bssid'].fillna('-')
    df_copy['wlan.country_info.fnm'] = df_copy['wlan.country_info.fnm'].fillna('-')
    df_copy['wlan.country_info.code'] = df_copy['wlan.country_info.code'].fillna('-')
    df_copy['wlan.da'] = df_copy['wlan.da'].fillna('-')
    df_copy['wlan.sa'] = df_copy['wlan.sa'].fillna('-')
    df_copy['wlan.ta'] = df_copy['wlan.ta'].fillna('-')
    df_copy['wlan.tag'] = df_copy['wlan.tag'].fillna('-')
    df_copy['wlan.tag.length'] = df_copy['wlan.tag.length'].fillna('-')
    df_copy['wlan.seq'] = df_copy['wlan.seq'].fillna(-1.0)
    df_copy['wlan.ssid'] = df_copy['wlan.ssid'].fillna('Unknown')
    
    df_copy = df_copy.dropna(how='all', axis=1)

    for col in df_copy.select_dtypes('object').columns:
        le = LabelEncoder()
        df_copy[col] = le.fit_transform(df_copy[col])
    
    return df_copy

In [6]:
def load_and_preprocessing_dataset(num: int) -> tuple:
    if num: df = load_dataset(num, True)
    else: df = load_fusion_dataset(random.randint(), True)
    
    df = wireless_preprocessing(df)
    
    features = df.drop(columns=['Label'])
    labels = df['Label']
    
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, stratify=labels)
    
    return X_train, X_test, y_train, y_test

In [7]:
def get_clf_eval(y_test, pred, pred_proba):
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, pred_proba)

    return f'정확도: {accuracy:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}'
    
def get_model_train_eval(model, X_train = None, X_test = None, y_train = None, y_test = None):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    pred_proba = model.predict_proba(X_test)[:, 1]
    
    res_msg = f"[Model 명: {model.__class__.__name__}]\n"
    res_msg += get_clf_eval(y_test, pred, pred_proba)
    res_msg += "\n\n"
    
    return res_msg

In [None]:
from joblib import Parallel, delayed

def get_model_train_eval_parallel(num: int):
    X_train, X_test, y_train, y_test = load_and_preprocessing_dataset(num)
    
    dt_clf = DecisionTreeClassifier()
    lr_clf = LogisticRegression(verbose=0)
    lgbm_clf = LGBMClassifier(verbose=-1)
    
    res_msg = f"Dataset: {sub_folders[num-1][1:]} ========================\n"
    
    res_msg += get_model_train_eval(dt_clf, X_train, X_test, y_train, y_test)
    res_msg += get_model_train_eval(lr_clf, X_train, X_test, y_train, y_test)
    res_msg += get_model_train_eval(lgbm_clf, X_train, X_test, y_train, y_test)
    
    return res_msg

warnings.filterwarnings('ignore', category=UndefinedMetricWarning)
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

for num in [1, 2, 3, 4, 5, 6, 12]:
    X_train, X_test, y_train, y_test = load_and_preprocessing_dataset(num)
    
    dt_clf = DecisionTreeClassifier()
    lr_clf = LogisticRegression(verbose=0)
    lgbm_clf = LGBMClassifier(verbose=-1)
    
    res_msg = f"Dataset: {sub_folders[num-1][1:]} ========================\n"
    
    res_msg += get_model_train_eval(dt_clf, X_train, X_test, y_train, y_test)
    res_msg += get_model_train_eval(lr_clf, X_train, X_test, y_train, y_test)
    res_msg += get_model_train_eval(lgbm_clf, X_train, X_test, y_train, y_test)
    
    print(res_msg)

(* 가독성을 위해 셀 출력에서 warning 메세지를 지웠습니다 *)

Dataset: 1.Deauth ========================
[Model 명: DecisionTreeClassifier]
정확도: 1.0000, 정밀도: 1.0000, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000

[Model 명: LogisticRegression]
정확도: 0.9761, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC: 0.9111

[Model 명: LGBMClassifier]
정확도: 1.0000, 정밀도: 1.0000, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000


Dataset: 2.Disas ========================
[Model 명: DecisionTreeClassifier]
정확도: 1.0000, 정밀도: 0.9999, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000

[Model 명: LogisticRegression]
정확도: 0.9627, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC: 0.9176

[Model 명: LGBMClassifier]
정확도: 1.0000, 정밀도: 0.9999, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000


Dataset: 3.(Re)Assoc ========================
[Model 명: DecisionTreeClassifier]
정확도: 1.0000, 정밀도: 1.0000, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000

[Model 명: LogisticRegression]
정확도: 0.9970, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC: 0.8683

[Model 명: LGBMClassifier]
정확도: 0.9982, 정밀도: 0.7416, 재현율: 0.5845, F1: 0.6538, AUC: 0.7920


Dataset: 4.Rogue_AP ========================
[Model 명: DecisionTreeClassifier]
정확도: 1.0000, 정밀도: 1.0000, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000

[Model 명: LogisticRegression]
정확도: 0.9993, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC: 0.8875

[Model 명: LGBMClassifier]
정확도: 1.0000, 정밀도: 1.0000, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000


Dataset: 5.Krack ========================
[Model 명: DecisionTreeClassifier]
정확도: 1.0000, 정밀도: 1.0000, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000

[Model 명: LogisticRegression]
정확도: 0.9764, 정밀도: 0.7459, 재현율: 0.4888, F1: 0.5906, AUC: 0.9800

[Model 명: LGBMClassifier]
정확도: 1.0000, 정밀도: 1.0000, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000


Dataset: 6.Kr00k ========================
[Model 명: DecisionTreeClassifier]
정확도: 1.0000, 정밀도: 1.0000, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000

[Model 명: LogisticRegression]
정확도: 0.9339, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC: 0.8696

[Model 명: LGBMClassifier]
정확도: 1.0000, 정밀도: 1.0000, 재현율: 1.0000, F1: 1.0000, AUC: 1.0000


Dataset: 12.Evil_Twin ========================
[Model 명: DecisionTreeClassifier]
정확도: 1.0000, 정밀도: 0.9996, 재현율: 0.9998, F1: 0.9997, AUC: 0.9999

[Model 명: LogisticRegression]
정확도: 0.9717, 정밀도: 0.0000, 재현율: 0.0000, F1: 0.0000, AUC: 0.8919

[Model 명: LGBMClassifier]
정확도: 1.0000, 정밀도: 0.9999, 재현율: 0.9998, F1: 0.9998, AUC: 1.0000