In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.metrics import  classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline 

In [2]:
import importlib
import ids_common
importlib.reload(ids_common)

<module 'ids_common' from 'd:\\stuff\\univ\\ids\\ids_common.py'>

In [3]:
# Per-file sampling fractions, keyed by the source CSV file name.
# Each value is looked up by file name in the sampling cell and passed as
# `frac` to DataFrame.sample, and it also becomes part of the output file name
# ('sampled_<fraction>_<file name>').
sample_koefs = {
# ATTACK
    'OS_Fingerprinting_attack.csv':1,
    'MITM_attack.csv':1,
    'Ransomware_attack.csv':1,
    'XSS_attack.csv':1,
    'Port_Scanning_attack.csv':1,
    'Backdoor_attack.csv':1,
    'Uploading_attack.csv':1,
    'SQL_injection_attack.csv':1,
    'Vulnerability_scanner_attack.csv':0.5,
    'DDoS_HTTP_Flood_attack.csv':0.5,
    'Password_attack.csv':0.3,
    'DDoS_TCP_SYN_Flood_attack.csv':0.1,
    'DDoS_ICMP_Flood_attack.csv':0.1,
    'DDoS_UDP_Flood_attack.csv':0.1,
# Normal
    'Modbus.csv':0.5,
    'Heart_Rate.csv':0.5,
    'phValue.csv':0.3,
    'Flame_Sensor.csv':0.25,
    'Distance.csv':0.25,
    'Soil_Moisture.csv':0.25,
    'IR_Receiver.csv':0.2,
    'Sound_Sensor.csv':0.2,
    'Temperature_and_Humidity.csv':0.2,
    'Water_Level.csv':0.2
}

In [3]:
# Output directory for this sampling run. Other cells build file paths with
# plain string concatenation (sample_dir + name), so the trailing slash is
# required.
sample_dir = './sampled_data/sample_1/'

In [43]:
# Create the output directory and record the sampling fractions used for this
# run. os.makedirs also creates missing parents (e.g. './sampled_data'), which
# os.mkdir would fail on, and exist_ok=True makes the cell safe to re-run.
os.makedirs(sample_dir, exist_ok=True)

with open(sample_dir + 'koefs.txt', 'w') as f:
    for name, koef in sample_koefs.items():
        f.write(f"'{name}' : {koef},\n")


In [39]:
# Collect the path of every CSV file below DATASET, skipping any path that
# contains 'Selected dataset for ML and DL'.
DATASET = 'dataset/Edge-IIoTset-dataset'
datasets_locations = []
for path, dirs, files in os.walk(DATASET):
    # os.walk yields entries in arbitrary (filesystem) order; sorting in place
    # makes the traversal, and therefore the resulting list, deterministic.
    dirs.sort()
    for file in sorted(files):
        # Match the real extension; endswith('csv') would also accept names
        # such as 'notescsv'.
        if not file.endswith('.csv'):
            continue
        loc = os.path.join(path, file)
        if 'Selected dataset for ML and DL' not in loc:
            datasets_locations.append(loc)

In [42]:
# Down-sample every raw CSV by its configured fraction and write the result to
# sample_dir as 'sampled_<fraction>_<original file name>'.

# Fixed seed so that re-running the cell draws the same rows from each file.
SAMPLE_SEED = 42

total_size = 0

for ds_loc in datasets_locations:
    _, f = os.path.split(ds_loc)

    # Resolve the fraction before reading: the raw files are large, so a
    # missing entry should fail immediately rather than after the load.
    if f not in sample_koefs:
        raise KeyError(f'No sampling fraction configured for {f}')
    koef = sample_koefs[f]

    df = pd.read_csv(ds_loc, low_memory=False)
    df_sample = df.sample(frac=koef, random_state=SAMPLE_SEED)

    print(f'{f} {koef}: {df.shape} -> {df_sample.shape}')

    total_size += df_sample.shape[0]

    df_sample.to_csv(sample_dir + f'sampled_{koef}_' + f, index=False)

print(f'Total sample size: {total_size}')

Backdoor_attack.csv 1: (24862, 63) -> (24862, 63)
DDoS_HTTP_Flood_attack.csv 0.5: (229022, 63) -> (114511, 63)
DDoS_ICMP_Flood_attack.csv 0.1: (2914354, 63) -> (291435, 63)
DDoS_TCP_SYN_Flood_attack.csv 0.1: (2020120, 63) -> (202012, 63)
DDoS_UDP_Flood_attack.csv 0.1: (3201626, 63) -> (320163, 63)
MITM_attack.csv 1: (1229, 63) -> (1229, 63)
OS_Fingerprinting_attack.csv 1: (1001, 63) -> (1001, 63)
Password_attack.csv 0.3: (1053385, 63) -> (316016, 63)
Port_Scanning_attack.csv 1: (22564, 63) -> (22564, 63)
Ransomware_attack.csv 1: (10925, 63) -> (10925, 63)
SQL_injection_attack.csv 1: (51203, 63) -> (51203, 63)
Uploading_attack.csv 1: (37634, 63) -> (37634, 63)
Vulnerability_scanner_attack.csv 0.5: (145869, 63) -> (72934, 63)
XSS_attack.csv 1: (15915, 63) -> (15915, 63)
Distance.csv 0.25: (1143540, 63) -> (285885, 63)
Flame_Sensor.csv 0.25: (1070196, 63) -> (267549, 63)
Heart_Rate.csv 0.5: (165319, 63) -> (82660, 63)
IR_Receiver.csv 0.2: (1307778, 63) -> (261556, 63)
Modbus.csv 0.5: (159

In [44]:
# combine dataframes

# Read every sampled CSV first and concatenate once at the end: calling
# pd.concat inside the loop re-copies all previously added rows on every
# iteration. Sorting the listing makes the row order of the combined file
# independent of the filesystem's listing order.
frames = []
for ds_file in sorted(os.listdir(sample_dir)):
    # Only the files written by the sampling step; this skips koefs.txt and
    # the combined/clean outputs stored in the same directory.
    if ds_file.startswith('sampled_') and ds_file.endswith('.csv'):
        df = pd.read_csv(sample_dir + ds_file, low_memory=False)
        frames.append(df)

        print(f'Added: {ds_file} {df.shape}')

# pd.concat raises on an empty list, so fall back to an empty frame when no
# sampled file was found.
df_combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(f'Combined df size: {df_combined.shape}')
df_combined.to_csv(sample_dir + 'combined_dataset.csv', index=False)


Added: sampled_0.1_DDoS_ICMP_Flood_attack.csv (291435, 63)
Added: sampled_0.1_DDoS_TCP_SYN_Flood_attack.csv (202012, 63)
Added: sampled_0.1_DDoS_UDP_Flood_attack.csv (320163, 63)
Added: sampled_0.25_Distance.csv (285885, 63)
Added: sampled_0.25_Flame_Sensor.csv (267549, 63)
Added: sampled_0.25_Soil_Moisture.csv (298194, 63)
Added: sampled_0.2_IR_Receiver.csv (261556, 63)
Added: sampled_0.2_Sound_Sensor.csv (302577, 63)
Added: sampled_0.2_Temperature_and_Humidity.csv (323144, 63)
Added: sampled_0.2_Water_Level.csv (459058, 63)
Added: sampled_0.3_Password_attack.csv (316016, 63)
Added: sampled_0.3_phValue.csv (224072, 63)
Added: sampled_0.5_DDoS_HTTP_Flood_attack.csv (114511, 63)
Added: sampled_0.5_Heart_Rate.csv (82660, 63)
Added: sampled_0.5_Modbus.csv (79751, 63)
Added: sampled_0.5_Vulnerability_scanner_attack.csv (72934, 63)
Added: sampled_1_Backdoor_attack.csv (24862, 63)
Added: sampled_1_MITM_attack.csv (1229, 63)
Added: sampled_1_OS_Fingerprinting_attack.csv (1001, 63)
Added: samp

In [4]:
# Load the combined sample from disk. The path is derived from sample_dir (the
# same expression the combine step writes to) so that switching to another
# sample directory cannot silently load a stale file.
df_combined = pd.read_csv(sample_dir + 'combined_dataset.csv', low_memory=False)

In [5]:
# Class balance of the combined sample: number of rows per Attack_type value.
df_combined['Attack_type'].value_counts()

Attack_type
Normal                   2584446
DDoS_UDP                  320163
Password                  316016
DDoS_ICMP                 291435
DDoS_TCP                  202012
DDoS_HTTP                 114511
Vulnerability_scanner      72934
SQL_injection              51203
Uploading                  37634
Backdoor                   24862
Port_Scanning              22564
XSS                        15915
Ransomware                 10925
MITM                        1229
OS_Fingerprinting           1001
Name: count, dtype: int64

In [6]:
# Action tags that can be attached to a column in `feature_actions`.
# The cleaning loop in this notebook checks four of them:
#   DROP             - the column is removed from the frame
#   HEX_TO_NUM       - every value is passed through hex_to_num
#   GET_HTTP_VERSION - every value is passed through get_http_version
#   TO_NUM           - pd.to_numeric(errors='coerce'), later cast to float64
# NOTE(review): TARGET, UNKNOWN, SINGLE_VAL, FIX_0, TO_CAT and TO_BOOL are not
# checked by that loop (the TO_BOOL handling there is commented out) - they
# appear to be annotations only; confirm before relying on them.
DROP = 'DROP'
TARGET = 'TARGET'
UNKNOWN = 'UNKNOWN'
SINGLE_VAL = 'SINGLE_VAL'
FIX_0 = 'FIX_0'
TO_CAT = 'TO_CAT'
TO_NUM = 'TO_NUM'
TO_BOOL = 'TO_BOOL'
HEX_TO_NUM = 'HEX_TO_NUM'
GET_HTTP_VERSION = 'GET_HTTP_VERSION'

# Maps a column name to the list of action tags for that column.
# An empty list means no tag is attached to the column.
feature_actions = {
            'frame.time' : [DROP],
            'ip.src_host' : [DROP],
            'ip.dst_host' : [DROP],
            'arp.dst.proto_ipv4' : [DROP],
            'arp.opcode' : [TO_NUM],
            'arp.hw.size' : [TO_NUM],
            'arp.src.proto_ipv4' : [DROP],
            'icmp.checksum' : [HEX_TO_NUM, TO_NUM],
            'icmp.seq_le' : [HEX_TO_NUM, TO_NUM],
            'icmp.transmit_timestamp' : [DROP],
            'icmp.unused' : [],
            'http.file_data' : [DROP], #
            'http.content_length' : [TO_NUM],
            'http.request.uri.query' : [DROP],
            'http.request.method': [FIX_0, TO_CAT],
            'http.referer': [DROP],
            'http.request.full_uri': [DROP],
            'http.request.version': [GET_HTTP_VERSION],
            'http.response': [TO_BOOL, TO_CAT],
            'http.tls_port': [],
            'tcp.ack': [TO_NUM],
            'tcp.ack_raw': [HEX_TO_NUM, TO_NUM],
            'tcp.checksum': [HEX_TO_NUM, TO_NUM],
            'tcp.connection.fin': [TO_NUM],
            'tcp.connection.rst': [TO_CAT],
            'tcp.connection.syn': [TO_CAT],
            'tcp.connection.synack': [TO_CAT],
            'tcp.dstport': [DROP], #
            'tcp.flags': [HEX_TO_NUM, TO_NUM],
            'tcp.flags.ack': [TO_BOOL, TO_CAT],
            'tcp.len': [TO_NUM],
            'tcp.options': [DROP], #
            'tcp.payload': [DROP], #
            'tcp.seq': [TO_NUM],
            'tcp.srcport': [DROP],
            'udp.port': [DROP],
            'udp.stream': [TO_NUM],
            'udp.time_delta': [UNKNOWN],
            'dns.qry.name': [DROP],
            'dns.qry.name.len': [TO_NUM], 
            'dns.qry.qu': [DROP],
            'dns.qry.type': [],
            'dns.retransmission': [TO_BOOL, TO_CAT],
            'dns.retransmit_request': [TO_CAT],
            'dns.retransmit_request_in': [],
            'mqtt.conack.flags': [DROP],
            'mqtt.conflag.cleansess': [TO_BOOL, TO_CAT],
            'mqtt.conflags': [HEX_TO_NUM, TO_NUM],
            'mqtt.hdrflags': [HEX_TO_NUM, TO_NUM],
            'mqtt.len': [TO_NUM],
            'mqtt.msg_decoded_as': [],
            'mqtt.msg': [DROP], #
            'mqtt.msgtype': [TO_NUM],
            'mqtt.proto_len': [TO_NUM],
            'mqtt.protoname': [TO_CAT],
            'mqtt.topic': [TO_CAT],
            'mqtt.topic_len': [TO_NUM],
            'mqtt.ver': [TO_NUM],
            'mbtcp.len': [],
            'mbtcp.trans_id': [],
            'mbtcp.unit_id': [],
            'Attack_label': [TARGET],
            'Attack_type': [TARGET]
        }

In [7]:
# Work on a copy so that df_combined itself is not modified by the steps below.
df = df_combined.copy()

def hex_to_num(x):
    """Convert a cell holding a decimal or '0x'-style hex value to a number.

    Parameters
    ----------
    x : object
        Value to convert: anything ``float()`` accepts, or a string
        containing ``'0x'`` that ``int(x, 16)`` can parse.

    Returns
    -------
    float or int
        ``float(x)`` when that succeeds, otherwise ``int(x, 16)``.

    Raises
    ------
    ValueError
        If ``x`` is neither float-parsable nor a parsable hex string.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # Not a plain number - fall through to the hex branch. Only these two
        # are caught so KeyboardInterrupt and unrelated errors stay visible.
        pass

    # Guard the membership test: on a non-string it would raise an unrelated
    # TypeError instead of the ValueError below.
    if isinstance(x, str) and '0x' in x:
        return int(x, 16)

    raise ValueError(f'Unexpected value: {x}')

def get_http_version(x):
    """Reduce an ``http.request.version`` cell to a canonical label.

    Parameters
    ----------
    x : object
        Cell value; normally a string.

    Returns
    -------
    str or pd.NA
        ``'0.0'`` when ``x`` is exactly ``'0.0'``; ``'HTTP/1.0'`` or
        ``'HTTP/1.1'`` when the string contains that token (``'HTTP/1.0'`` is
        checked first); ``pd.NA`` for any other string and for missing values.

    Raises
    ------
    TypeError
        If ``x`` is a non-string value that is not missing.
    """
    if not isinstance(x, str):
        # A missing cell (NaN / None / pd.NA) used to crash the substring test
        # with "argument of type 'float' is not iterable"; treat it as missing.
        if pd.isna(x):
            return pd.NA
        raise TypeError(f'Expected a string, got {type(x).__name__}: {x!r}')

    if x == '0.0':
        return x

    if 'HTTP/1.0' in x:
        return 'HTTP/1.0'

    if 'HTTP/1.1' in x:
        return 'HTTP/1.1'

    return pd.NA

# Clean the combined sample: normalise zero placeholders, apply the per-column
# actions from feature_actions, drop unwanted columns, remove NA/duplicate rows
# and dummy-encode the remaining text columns.
print(f'Before: {df.shape}')
print(df['Attack_type'].value_counts())
print()

convert_to_numeric = []
drop_columns = []

# Text columns spell the zero value several ways; map '0' and '0x00000000'
# onto the single spelling '0.0'.
for col in df.columns:
    if df[col].dtype == 'object':
        df.loc[df[col].isin(['0', '0x00000000']), col] = '0.0'

for col, actions in feature_actions.items():

    if DROP in actions:
        drop_columns.append(col)
        continue

    if HEX_TO_NUM in actions:
        df[col] = df[col].apply(hex_to_num)

    if GET_HTTP_VERSION in actions:
        df[col] = df[col].apply(get_http_version)

    if TO_NUM in actions:
        # Assign the converted Series directly. Writing it back through
        # df.loc[:, [col]] can keep the old object dtype depending on the
        # pandas version; unparsable values become NaN either way.
        df[col] = pd.to_numeric(df[col], errors='coerce')
        convert_to_numeric.append(col)

    # NOTE: the TO_BOOL, TO_CAT and FIX_0 tags are not acted on in this loop.


df.drop(drop_columns, axis=1, inplace=True)

print(df.isna().sum())

# The return value is not used, so this relies on the helper modifying df in
# place. NOTE(review): confirm against ids_common.drop_na_dups.
ids_common.drop_na_dups(df, verbose=True)

# Cast the label to category before the object-dtype loop below so that it is
# not picked up for dummy encoding.
df['Attack_type'] = df['Attack_type'].astype('category')

for col in convert_to_numeric:
    df[col] = df[col].astype('float64')

# Every column still stored as object is handed to the dummy encoder.
for col in df.columns:
    if df[col].dtype == 'object':
        ids_common.encode_text_dummy(df, col)

print(f'After: {df.shape}')
print(df['Attack_type'].value_counts())


Before: (4066850, 63)
Attack_type
Normal                   2584446
DDoS_UDP                  320163
Password                  316016
DDoS_ICMP                 291435
DDoS_TCP                  202012
DDoS_HTTP                 114511
Vulnerability_scanner      72934
SQL_injection              51203
Uploading                  37634
Backdoor                   24862
Port_Scanning              22564
XSS                        15915
Ransomware                 10925
MITM                        1229
OS_Fingerprinting           1001
Name: count, dtype: int64

arp.opcode                     3
arp.hw.size                  257
icmp.checksum                  0
icmp.seq_le                    0
icmp.unused                    0
http.content_length            0
http.request.method            0
http.request.version           6
http.response                  0
http.tls_port                  0
tcp.ack                        0
tcp.ack_raw                    0
tcp.checksum                   0
tcp.connection.

In [8]:
# Summarise the cleaned frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3427020 entries, 0 to 4066849
Data columns (total 63 columns):
 #   Column                               Dtype   
---  ------                               -----   
 0   arp.opcode                           float64 
 1   arp.hw.size                          float64 
 2   icmp.checksum                        float64 
 3   icmp.seq_le                          float64 
 4   icmp.unused                          float64 
 5   http.content_length                  float64 
 6   http.response                        float64 
 7   http.tls_port                        float64 
 8   tcp.ack                              float64 
 9   tcp.ack_raw                          float64 
 10  tcp.checksum                         float64 
 11  tcp.connection.fin                   float64 
 12  tcp.connection.rst                   float64 
 13  tcp.connection.syn                   float64 
 14  tcp.connection.synack                float64 
 15  tcp.flags           

In [9]:
# Persist the cleaned frame in the sample directory.
clean_dataset_path = sample_dir + 'clean_dataset.csv'
df.to_csv(clean_dataset_path, index=False)

In [203]:

# Per-column summary: (dtype, number of distinct values, preview of the
# distinct values, value counts grouped by Attack_label or '[too many]').
ds_stats = dict()
num_rows = df.shape[0]  # loop-invariant, so computed once
for col in df.columns:
    if col in ids_common.dropped_by_ferrag:
        continue

    uniques = df[col].unique()
    dtype = df[col].dtype
    num_uniq = len(uniques)
    vc = '[too many]'
    if num_uniq < 16:
        vc = df.groupby('Attack_label')[col].value_counts()
    else:
        # Keep the first 8 and last 8 distinct values as a preview. They have
        # to be joined with np.concatenate: `+` on two arrays combines them
        # element-wise (numeric sums / pairwise string joins) instead.
        uniques = np.concatenate([uniques[:8], uniques[-8:]])

    ds_stats[col] = (dtype, num_uniq, uniques, vc)

In [204]:
# Print the collected statistics for the columns whose dtype is still object.
for col, (dtype, num, unqs, vc) in ds_stats.items():
    if dtype == 'object':
        # Columns with more than one distinct value are flagged with '*'.
        pr = '' if num == 1 else '*'

        header = f'{pr}\tcol = {col}[{dtype}], num_uniq = {num}, uniques = {unqs}'
        print(header)
        print(f'\t{vc}')
        print('-'*80)
        print()


    # Normal                   2584446

*	col = http.request.method[object], num_uniq = 8, uniques = ['0.0' 'GET' 'POST' 'TRACE' 'OPTIONS' 'SEARCH' 'PUT' 'PROPFIND']
	Attack_label  http.request.method
0             0.0                    2157201
1             0.0                    1210079
              GET                      52018
              POST                      7305
              TRACE                      392
              OPTIONS                     11
              PROPFIND                     5
              PUT                          5
              SEARCH                       4
Name: count, dtype: int64
--------------------------------------------------------------------------------

*	col = http.request.version[object], num_uniq = 3, uniques = ['0.0' 'HTTP/1.0' 'HTTP/1.1']
	Attack_label  http.request.version
0             0.0                     2157201
1             0.0                     1210079
              HTTP/1.1                  38282
              HTTP/1.0                  21458
Name: count, d