# 01 - Explore the CICIoT2023 dataset

Imports

In [3]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [4]:
# Relative path of the directory that holds the CICIoT2023 .csv files
DATASET_DIRECTORY = '../datasets/CICIoT2023/'

Locate the dataset files and split them into training and test sets

In [7]:
# Find all CSV files in the dataset directory and sort them
df_sets = sorted(k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv'))

# Split the dataset into training and test sets. 80% training, 20% test
# The ratio and the cut position are each named once so that both slices
# are guaranteed to use the same boundary.
TRAIN_FRACTION = 0.8
split_index = int(len(df_sets) * TRAIN_FRACTION)
training_sets = df_sets[:split_index]
test_sets = df_sets[split_index:]

Examine file counts and sizes

In [8]:
# Report how many files ended up in the training and test splits
print(f'Training sets: {len(training_sets)}')
print(f'Test sets: {len(test_sets)}')

Training sets: 135
Test sets: 34


In [11]:
# Show the min, max and average file size. Output size in MB
# Query the size of every file once and reuse the list for all three
# statistics instead of hitting the filesystem three times per file.
file_sizes = [os.path.getsize(os.path.join(DATASET_DIRECTORY, k)) for k in df_sets]
print('Min file size: {} MB'.format(round(min(file_sizes) / 1000000, 2)))
print('Max file size: {} MB'.format(round(max(file_sizes) / 1000000, 2)))
print('Avg file size: {} MB'.format(round(np.mean(file_sizes) / 1000000, 2)))

Min file size: 62.41 MB
Max file size: 132.99 MB
Avg file size: 81.39 MB


In [12]:
# Load the first training-set CSV file into a dataframe
df = pd.read_csv(f'{DATASET_DIRECTORY}{training_sets[0]}')

# Report the dimensions of the dataframe
print(f'Rows: {len(df)}')
print(f'Columns: {len(df.columns)}')

# Preview the first 5 rows
df.head()

Rows: 238687
Columns: 47


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.0,54.0,6.0,64.0,0.329807,0.329807,0.0,1.0,0.0,1.0,...,0.0,54.0,83343830.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-RSTFINFlood
1,0.0,57.04,6.33,64.0,4.290556,4.290556,0.0,0.0,0.0,0.0,...,2.822973,57.04,82926070.0,9.5,10.464666,4.010353,160.987842,0.05,141.55,DoS-TCP_Flood
2,0.0,0.0,1.0,64.0,33.396799,33.396799,0.0,0.0,0.0,0.0,...,0.0,42.0,83127990.0,9.5,9.165151,0.0,0.0,0.0,141.55,DDoS-ICMP_Flood
3,0.328175,76175.0,17.0,64.0,4642.13301,4642.13301,0.0,0.0,0.0,0.0,...,0.0,50.0,83015700.0,9.5,10.0,0.0,0.0,0.0,141.55,DoS-UDP_Flood
4,0.11732,101.73,6.11,65.91,6.202211,6.202211,0.0,0.0,1.0,0.0,...,23.113111,57.88,82973000.0,9.5,11.346876,32.716243,3016.808286,0.19,141.55,DoS-SYN_Flood


Columns used in the paper

In [13]:
# Feature columns, listed in the same order as before.
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate',
    # *_flag_number columns
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
    'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    # *_count columns
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    # protocol-named columns
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    # remaining columns
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]

# Name of the column that holds the target value
y_column = 'label'

In [14]:
# Report how many feature columns are listed and which column is the target
print(f'X_columns: {len(X_columns)}')
print(f'y_column: {y_column}')

X_columns: 46
y_column: label


In [21]:
# Collect the distinct values of the y_column once and keep them as attack_labels
attack_labels = df[y_column].unique()

# Report how many distinct values there are
print(f'Unique values in y_column: {len(attack_labels)}')

# List them
print(attack_labels)


Unique values in y_column: 34
['DDoS-RSTFINFlood' 'DoS-TCP_Flood' 'DDoS-ICMP_Flood' 'DoS-UDP_Flood'
 'DoS-SYN_Flood' 'Mirai-greeth_flood' 'DDoS-SynonymousIP_Flood'
 'Mirai-udpplain' 'DDoS-SYN_Flood' 'DDoS-PSHACK_Flood' 'DDoS-TCP_Flood'
 'DDoS-UDP_Flood' 'BenignTraffic' 'MITM-ArpSpoofing'
 'DDoS-ACK_Fragmentation' 'Mirai-greip_flood' 'DoS-HTTP_Flood'
 'DDoS-ICMP_Fragmentation' 'Recon-PortScan' 'DNS_Spoofing'
 'DDoS-UDP_Fragmentation' 'Recon-OSScan' 'XSS' 'DDoS-HTTP_Flood'
 'Recon-HostDiscovery' 'CommandInjection' 'VulnerabilityScan'
 'DDoS-SlowLoris' 'Backdoor_Malware' 'BrowserHijacking'
 'DictionaryBruteForce' 'SqlInjection' 'Recon-PingSweep'
 'Uploading_Attack']


In [22]:
# Count the number of rows for each attack label
# A single value_counts() pass replaces one full-column boolean scan per label.
label_counts = df[y_column].value_counts()
for label in attack_labels:
    # .get(..., 0) reports 0 for a label with no matching rows (e.g. NaN,
    # which value_counts() omits), matching what the row filter returned.
    print('{}: {}'.format(label, label_counts.get(label, 0)))
    

DDoS-RSTFINFlood: 20669
DoS-TCP_Flood: 13630
DDoS-ICMP_Flood: 36554
DoS-UDP_Flood: 16957
DoS-SYN_Flood: 10275
Mirai-greeth_flood: 5016
DDoS-SynonymousIP_Flood: 18189
Mirai-udpplain: 4661
DDoS-SYN_Flood: 20739
DDoS-PSHACK_Flood: 21210
DDoS-TCP_Flood: 23149
DDoS-UDP_Flood: 27626
BenignTraffic: 5600
MITM-ArpSpoofing: 1614
DDoS-ACK_Fragmentation: 1505
Mirai-greip_flood: 3758
DoS-HTTP_Flood: 414
DDoS-ICMP_Fragmentation: 2377
Recon-PortScan: 430
DNS_Spoofing: 925
DDoS-UDP_Fragmentation: 1484
Recon-OSScan: 517
XSS: 18
DDoS-HTTP_Flood: 169
Recon-HostDiscovery: 697
CommandInjection: 28
VulnerabilityScan: 210
DDoS-SlowLoris: 106
Backdoor_Malware: 22
BrowserHijacking: 30
DictionaryBruteForce: 63
SqlInjection: 31
Recon-PingSweep: 6
Uploading_Attack: 8


In [24]:
# Label encodings for the traffic labels at three granularities.
# All three dictionaries use the same 34 keys (33 attack classes + 1 for benign traffic).

# 34-class encoding: one distinct id (0..33) per label
traffic_34_class = {
    'BenignTraffic': 0,
    # DDoS and DoS
    'DDoS-RSTFINFlood': 1, 'DDoS-PSHACK_Flood': 2, 'DDoS-SYN_Flood': 3, 'DDoS-UDP_Flood': 4,
    'DDoS-TCP_Flood': 5, 'DDoS-ICMP_Flood': 6, 'DDoS-SynonymousIP_Flood': 7,
    'DDoS-ACK_Fragmentation': 8, 'DDoS-UDP_Fragmentation': 9, 'DDoS-ICMP_Fragmentation': 10,
    'DDoS-SlowLoris': 11, 'DDoS-HTTP_Flood': 12,
    'DoS-UDP_Flood': 13, 'DoS-SYN_Flood': 14, 'DoS-TCP_Flood': 15, 'DoS-HTTP_Flood': 16,
    # Mirai
    'Mirai-greeth_flood': 17, 'Mirai-greip_flood': 18, 'Mirai-udpplain': 19,
    # Reconnaissance
    'Recon-PingSweep': 20, 'Recon-OSScan': 21, 'Recon-PortScan': 22,
    'VulnerabilityScan': 23, 'Recon-HostDiscovery': 24,
    # Spoofing
    'DNS_Spoofing': 25, 'MITM-ArpSpoofing': 26,
    # Web
    'BrowserHijacking': 27, 'Backdoor_Malware': 28, 'XSS': 29,
    'Uploading_Attack': 30, 'SqlInjection': 31, 'CommandInjection': 32,
    # Brute Force
    'DictionaryBruteForce': 33,
}

# 7-class encoding: 0 = benign, 1 = DDoS and DoS, 2 = Mirai, 3 = Reconnaissance,
# 4 = Spoofing, 5 = Web, 6 = Brute Force
traffic_7_class = {
    'BenignTraffic': 0,
    # DDoS and DoS
    'DDoS-RSTFINFlood': 1, 'DDoS-PSHACK_Flood': 1, 'DDoS-SYN_Flood': 1, 'DDoS-UDP_Flood': 1,
    'DDoS-TCP_Flood': 1, 'DDoS-ICMP_Flood': 1, 'DDoS-SynonymousIP_Flood': 1,
    'DDoS-ACK_Fragmentation': 1, 'DDoS-UDP_Fragmentation': 1, 'DDoS-ICMP_Fragmentation': 1,
    'DDoS-SlowLoris': 1, 'DDoS-HTTP_Flood': 1,
    'DoS-UDP_Flood': 1, 'DoS-SYN_Flood': 1, 'DoS-TCP_Flood': 1, 'DoS-HTTP_Flood': 1,
    # Mirai
    'Mirai-greeth_flood': 2, 'Mirai-greip_flood': 2, 'Mirai-udpplain': 2,
    # Reconnaissance
    'Recon-PingSweep': 3, 'Recon-OSScan': 3, 'Recon-PortScan': 3,
    'VulnerabilityScan': 3, 'Recon-HostDiscovery': 3,
    # Spoofing
    'DNS_Spoofing': 4, 'MITM-ArpSpoofing': 4,
    # Web
    'BrowserHijacking': 5, 'Backdoor_Malware': 5, 'XSS': 5,
    'Uploading_Attack': 5, 'SqlInjection': 5, 'CommandInjection': 5,
    # Brute Force (was 33, a leftover of the 34-class id; the seventh class id is 6)
    'DictionaryBruteForce': 6,
}

# 2-class encoding: 0 = benign, 1 = any attack
traffic_2_class = {
    'BenignTraffic': 0,
    # DDoS and DoS
    'DDoS-RSTFINFlood': 1, 'DDoS-PSHACK_Flood': 1, 'DDoS-SYN_Flood': 1, 'DDoS-UDP_Flood': 1,
    'DDoS-TCP_Flood': 1, 'DDoS-ICMP_Flood': 1, 'DDoS-SynonymousIP_Flood': 1,
    'DDoS-ACK_Fragmentation': 1, 'DDoS-UDP_Fragmentation': 1, 'DDoS-ICMP_Fragmentation': 1,
    'DDoS-SlowLoris': 1, 'DDoS-HTTP_Flood': 1,
    'DoS-UDP_Flood': 1, 'DoS-SYN_Flood': 1, 'DoS-TCP_Flood': 1, 'DoS-HTTP_Flood': 1,
    # Mirai
    'Mirai-greeth_flood': 1, 'Mirai-greip_flood': 1, 'Mirai-udpplain': 1,
    # Reconnaissance
    'Recon-PingSweep': 1, 'Recon-OSScan': 1, 'Recon-PortScan': 1,
    'VulnerabilityScan': 1, 'Recon-HostDiscovery': 1,
    # Spoofing
    'DNS_Spoofing': 1, 'MITM-ArpSpoofing': 1,
    # Web
    'BrowserHijacking': 1, 'Backdoor_Malware': 1, 'XSS': 1,
    'Uploading_Attack': 1, 'SqlInjection': 1, 'CommandInjection': 1,
    # Brute Force
    'DictionaryBruteForce': 1,
}

# Sanity checks: identical label sets, and contiguous class ids at every granularity
assert traffic_34_class.keys() == traffic_7_class.keys() == traffic_2_class.keys()
assert sorted(traffic_34_class.values()) == list(range(34))
assert set(traffic_7_class.values()) == set(range(7))
assert set(traffic_2_class.values()) == {0, 1}


In [25]:
# Preview the first 10 entries of the y column
df[y_column].iloc[:10]


0           DDoS-RSTFINFlood
1              DoS-TCP_Flood
2            DDoS-ICMP_Flood
3              DoS-UDP_Flood
4              DoS-SYN_Flood
5         Mirai-greeth_flood
6    DDoS-SynonymousIP_Flood
7             Mirai-udpplain
8              DoS-UDP_Flood
9             DDoS-SYN_Flood
Name: label, dtype: object

In [26]:
# map the y column to the attack_labels for 34 class
# Guard against re-execution: once the column is numeric the labels have
# already been encoded, and mapping the integer codes through the dictionary
# a second time would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df[y_column]):
    df[y_column] = df[y_column].map(traffic_34_class)

In [27]:
# Preview the first 10 entries of the y column again
df[y_column].iloc[:10]

0     1
1    15
2     6
3    13
4    14
5    17
6     7
7    19
8    13
9     3
Name: label, dtype: int64