In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.metrics import  classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline 

In [2]:
# Re-import the project-local helper module so that edits made to
# ids_common.py while the kernel is running are picked up without a restart.
import importlib
import ids_common
importlib.reload(ids_common)

<module 'ids_common' from 'd:\\stuff\\univ\\ids\\ids_common.py'>

In [3]:
# Sampling fraction (0..1] applied to every raw capture file, keyed by file name.
# The fractions are grouped by traffic class; the merged mapping keeps the
# attack entries first and the normal entries second.
attack_koefs = {
    'OS_Fingerprinting_attack.csv': 1,
    'MITM_attack.csv': 1,
    'Ransomware_attack.csv': 1,
    'XSS_attack.csv': 1,
    'Port_Scanning_attack.csv': 1,
    'Backdoor_attack.csv': 1,
    'Uploading_attack.csv': 1,
    'SQL_injection_attack.csv': 1,
    'Vulnerability_scanner_attack.csv': 0.5,
    'DDoS_HTTP_Flood_attack.csv': 0.5,
    'Password_attack.csv': 0.3,
    'DDoS_TCP_SYN_Flood_attack.csv': 0.1,
    'DDoS_ICMP_Flood_attack.csv': 0.1,
    'DDoS_UDP_Flood_attack.csv': 0.1,
}

normal_koefs = {
    'Modbus.csv': 0.5,
    'Heart_Rate.csv': 0.5,
    'phValue.csv': 0.3,
    'Flame_Sensor.csv': 0.25,
    'Distance.csv': 0.25,
    'Soil_Moisture.csv': 0.25,
    'IR_Receiver.csv': 0.2,
    'Sound_Sensor.csv': 0.2,
    'Temperature_and_Humidity.csv': 0.2,
    'Water_Level.csv': 0.2,
}

sample_koefs = {**attack_koefs, **normal_koefs}

In [43]:
# Directory that receives the sampled CSV files; the trailing slash matters
# because later cells build file paths by plain string concatenation.
sample_dir = './sampled_data/sample_1/'

# makedirs creates the missing parent ('./sampled_data') as well, and
# exist_ok makes the cell safe to re-run.
os.makedirs(sample_dir, exist_ok=True)

# Record the sampling fractions next to the data so the sample can be traced
# back to the configuration that produced it.
with open(sample_dir + 'koefs.txt', 'w') as koefs_file:
    for name, koef in sample_koefs.items():
        koefs_file.write(f"'{name}' : {koef},\n")


In [39]:
DATASET = 'dataset/Edge-IIoTset-dataset'


def find_dataset_csvs(root, exclude='Selected dataset for ML and DL'):
    """Return the sorted paths of all '.csv' files found below ``root``.

    Parameters
    ----------
    root : str
        Directory that is walked recursively.
    exclude : str
        Paths containing this substring are skipped.

    Returns
    -------
    list of str
        Paths of the form ``'<dir>/<file>'``, sorted so that the processing
        order does not depend on the order in which the OS lists entries.
    """
    found = []
    for path, _dirs, files in os.walk(root):
        for file in files:
            # Match the real extension; a bare 'csv' suffix would also accept
            # names such as 'notcsv'.
            if not file.endswith('.csv'):
                continue
            loc = f'{path}/{file}'
            if exclude not in loc:
                found.append(loc)
    return sorted(found)


datasets_locations = find_dataset_csvs(DATASET)

In [42]:
# Sample every raw capture file with its configured fraction and store the
# result as 'sampled_<fraction>_<original name>' inside sample_dir.

# Fixed seed so that re-running the cell yields the same rows.
RANDOM_STATE = 42

total_size = 0

for ds_loc in datasets_locations:
    file_name = os.path.basename(ds_loc)

    # Look the fraction up before reading the file so that an unconfigured
    # file fails (KeyError) immediately instead of after a long CSV parse.
    koef = sample_koefs[file_name]

    df_full = pd.read_csv(ds_loc, low_memory=False)
    df_sample = df_full.sample(frac=koef, random_state=RANDOM_STATE)

    print(f'{file_name} {koef}: {df_full.shape} -> {df_sample.shape}')

    total_size += df_sample.shape[0]

    df_sample.to_csv(sample_dir + f'sampled_{koef}_' + file_name, index=False)

print(f'Total sample size: {total_size}')

Backdoor_attack.csv 1: (24862, 63) -> (24862, 63)
DDoS_HTTP_Flood_attack.csv 0.5: (229022, 63) -> (114511, 63)
DDoS_ICMP_Flood_attack.csv 0.1: (2914354, 63) -> (291435, 63)
DDoS_TCP_SYN_Flood_attack.csv 0.1: (2020120, 63) -> (202012, 63)
DDoS_UDP_Flood_attack.csv 0.1: (3201626, 63) -> (320163, 63)
MITM_attack.csv 1: (1229, 63) -> (1229, 63)
OS_Fingerprinting_attack.csv 1: (1001, 63) -> (1001, 63)
Password_attack.csv 0.3: (1053385, 63) -> (316016, 63)
Port_Scanning_attack.csv 1: (22564, 63) -> (22564, 63)
Ransomware_attack.csv 1: (10925, 63) -> (10925, 63)
SQL_injection_attack.csv 1: (51203, 63) -> (51203, 63)
Uploading_attack.csv 1: (37634, 63) -> (37634, 63)
Vulnerability_scanner_attack.csv 0.5: (145869, 63) -> (72934, 63)
XSS_attack.csv 1: (15915, 63) -> (15915, 63)
Distance.csv 0.25: (1143540, 63) -> (285885, 63)
Flame_Sensor.csv 0.25: (1070196, 63) -> (267549, 63)
Heart_Rate.csv 0.5: (165319, 63) -> (82660, 63)
IR_Receiver.csv 0.2: (1307778, 63) -> (261556, 63)
Modbus.csv 0.5: (159

In [44]:
# Combine all per-file samples from sample_dir into a single dataframe and
# persist it as 'combined_dataset.csv'.

# Collect the parts first and concatenate once; concatenating inside the loop
# re-copies the whole accumulated frame on every iteration.
sampled_frames = []
for ds_file in sorted(os.listdir(sample_dir)):
    # Only pick up the files written by the sampling step
    # ('sampled_<fraction>_<name>.csv'), not koefs.txt or the combined CSV.
    if ds_file.startswith('sampled_') and ds_file.endswith('.csv'):
        part = pd.read_csv(sample_dir + ds_file, low_memory=False)
        sampled_frames.append(part)

        print(f'Added: {ds_file} {part.shape}')

# pd.concat raises on an empty list, so fall back to an empty frame.
if sampled_frames:
    df_combined = pd.concat(sampled_frames, ignore_index=True)
else:
    df_combined = pd.DataFrame()

print(f'Combined df size: {df_combined.shape}')
df_combined.to_csv(sample_dir + 'combined_dataset.csv', index=False)


Added: sampled_0.1_DDoS_ICMP_Flood_attack.csv (291435, 63)
Added: sampled_0.1_DDoS_TCP_SYN_Flood_attack.csv (202012, 63)
Added: sampled_0.1_DDoS_UDP_Flood_attack.csv (320163, 63)
Added: sampled_0.25_Distance.csv (285885, 63)
Added: sampled_0.25_Flame_Sensor.csv (267549, 63)
Added: sampled_0.25_Soil_Moisture.csv (298194, 63)
Added: sampled_0.2_IR_Receiver.csv (261556, 63)
Added: sampled_0.2_Sound_Sensor.csv (302577, 63)
Added: sampled_0.2_Temperature_and_Humidity.csv (323144, 63)
Added: sampled_0.2_Water_Level.csv (459058, 63)
Added: sampled_0.3_Password_attack.csv (316016, 63)
Added: sampled_0.3_phValue.csv (224072, 63)
Added: sampled_0.5_DDoS_HTTP_Flood_attack.csv (114511, 63)
Added: sampled_0.5_Heart_Rate.csv (82660, 63)
Added: sampled_0.5_Modbus.csv (79751, 63)
Added: sampled_0.5_Vulnerability_scanner_attack.csv (72934, 63)
Added: sampled_1_Backdoor_attack.csv (24862, 63)
Added: sampled_1_MITM_attack.csv (1229, 63)
Added: sampled_1_OS_Fingerprinting_attack.csv (1001, 63)
Added: samp

In [4]:
# Load the previously written combined sample from disk. The path is spelled
# out literally, so this cell does not need `sample_dir` to be defined.
df_combined = pd.read_csv('./sampled_data/sample_1/combined_dataset.csv', low_memory=False)

In [5]:
# Number of rows per value of the 'Attack_type' column in the combined sample.
df_combined['Attack_type'].value_counts()

Attack_type
Normal                   2584446
DDoS_UDP                  320163
Password                  316016
DDoS_ICMP                 291435
DDoS_TCP                  202012
DDoS_HTTP                 114511
Vulnerability_scanner      72934
SQL_injection              51203
Uploading                  37634
Backdoor                   24862
Port_Scanning              22564
XSS                        15915
Ransomware                 10925
MITM                        1229
OS_Fingerprinting           1001
Name: count, dtype: int64

In [7]:
# Run the project-local preparation helper on the combined sample. Its return
# value is not used, and the summary printed afterwards reflects the change,
# so it evidently modifies df_combined in place.
# NOTE(review): the printed log suggests it removes duplicate rows and drops
# columns (63 -> 48) — confirm against ids_common.ferrag_preparation.
ids_common.ferrag_preparation(df_combined, verbose=True)
# Print the remaining columns and their dtypes.
df_combined.info()

Before: shape=(4066850, 63)
Before: dropna: NA: 0, DUPS: 603202
After: dropna: NA: 0, DUPS: 0
After: shape=(3463648, 48)
<class 'pandas.core.frame.DataFrame'>
Index: 3463648 entries, 0 to 4066849
Data columns (total 48 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   arp.opcode                 object 
 1   arp.hw.size                object 
 2   icmp.checksum              object 
 3   icmp.seq_le                object 
 4   icmp.unused                float64
 5   http.content_length        float64
 6   http.request.method        object 
 7   http.referer               object 
 8   http.request.version       object 
 9   http.response              float64
 10  http.tls_port              float64
 11  tcp.ack                    float64
 12  tcp.ack_raw                object 
 13  tcp.checksum               object 
 14  tcp.connection.fin         object 
 15  tcp.connection.rst         float64
 16  tcp.connection.syn         float64
 17  tcp.co

In [9]:
# Optional: persist the cleaned frame (disabled; needs `sample_dir` to be defined).
# df_combined.to_csv(sample_dir + f'ferrag_cleanup.csv', index=False)
# Work on a copy so that the cells below leave df_combined untouched.
df = df_combined.copy()

In [32]:
# Distinct values of the HTTP version column. This is not the last expression
# of the cell, so the notebook does not display the result.
df['http.request.version'].unique()

# Rows whose HTTP version is none of the listed values are kept in `ddf`
# for inspection.
listed_versions = ['0.0', '0', 'HTTP/1.1', 'HTTP/1.0']
has_listed_version = df['http.request.version'].isin(listed_versions)
ddf = df.loc[~has_listed_version]




In [45]:
# Frequency of each HTTP version string among the rows collected in `ddf`.
ddf['http.request.version'].value_counts()

http.request.version
script>alert(1)/script><\" HTTP/1.1                                                6
-a HTTP/1.1                                                                        5
name=a><input name=i value=XSS>&lt;script>alert('Vulnerable')</script> HTTP/1.1    5
-al&_PHPLIB[libdir]=http://cirt.net/rfiinc.txt?? HTTP/1.1                          5
-al&ABSOLUTE_PATH_STUDIP=http://cirt.net/rfiinc.txt?? HTTP/1.1                     5
/etc/passwd|?data=Download HTTP/1.1                                                4
Src=javascript:alert('Vulnerable')><Img Src=\" HTTP/1.1                            4
By Dr HTTP/1.1                                                                     3
> HTTP/1.1                                                                         2
0x0000511a                                                                         1
Name: count, dtype: int64

In [24]:

# Per-column summary: column name -> (dtype, number of unique values,
# unique values (or a preview of them), value counts per Attack_label).
ds_stats = dict()
for col in df.columns:
    if col in ids_common.dropped_by_ferrag:
        continue

    column = df[col]
    uniques = column.unique()
    num_uniq = len(uniques)

    if num_uniq < 16:
        # Few distinct values: show how they are distributed per label.
        vc = df.groupby('Attack_label')[col].value_counts()
    else:
        vc = '[too many]'
        # Keep only the first and the last 8 unique values as a preview.
        # The slices are NumPy arrays, so they must be joined with
        # np.concatenate; `+` would add them element by element.
        uniques = np.concatenate([uniques[:8], uniques[-8:]])

    ds_stats[col] = (column.dtype, num_uniq, uniques, vc)

In [26]:
# Print the collected statistics for every column stored with dtype 'object'.
# Columns with more than one distinct value are flagged with a leading '*'.
for col, (dtype, num, unqs, vc) in ds_stats.items():
    if dtype != 'object':
        continue

    marker = '' if num == 1 else '*'
    separator = '-' * 80

    print(f'{marker}\tcol = {col}[{dtype}], num_uniq = {num}, uniques = {unqs}')
    # Value counts, a separator line and an empty line, one per output line.
    print(f'\t{vc}', separator, '', sep='\n')


    # Normal                   2584446

*	col = arp.opcode[object], num_uniq = 8, uniques = ['0.0' '1.0' '2.0' '6.0' '0' '2' '192.168.0.128' '1']
	Attack_label  arp.opcode   
0             0.0              1887106
              0                 306699
              6.0                    6
              1.0                    2
              2.0                    2
              1                      1
              192.168.0.128          1
              2                      1
1             0.0              1269803
              1.0                   12
              2.0                   12
              6.0                    3
Name: count, dtype: int64
--------------------------------------------------------------------------------

*	col = arp.hw.size[object], num_uniq = 10, uniques = ['0.0' '6.0' '0' '192.168.0.128' '192.168.0.170' '192.168.0.101'
 '192.168.7.55' '192.168.7.1' '192.168.0.1' '192.168.7.62']
	Attack_label  arp.hw.size  
0             0.0              2114606
              0                  79200
   