# 01 - Explore the CICIoT2023 dataset

Imports

In [132]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [133]:
# Relative path to the folder that holds the CICIoT2023 CSV files.
# Keep the trailing slash: cells in this notebook build file paths by
# appending the file name directly to this string.
DATASET_DIRECTORY = '../datasets/CICIoT2023/'

Import the Dataset

In [134]:
# Collect the names of the CSV files located directly inside the dataset
# directory, in sorted order
df_sets = sorted(
    file_name
    for file_name in os.listdir(DATASET_DIRECTORY)
    if file_name.endswith('.csv')
)

# Split file-wise (not row-wise): the first 80% of the sorted files become the
# training set, the remaining 20% the test set
split_index = int(len(df_sets) * .8)
training_sets = df_sets[:split_index]
test_sets = df_sets[split_index:]

Examine file counts and size

In [135]:
# Report how many CSV files ended up on each side of the split
for split_name, split_files in (('Training', training_sets), ('Test', test_sets)):
    print(f'{split_name} sets: {len(split_files)}')

Training sets: 135
Test sets: 34


In [136]:
# Show the min, max and average file size. Output size in MB
# (decimal megabytes: 1 MB = 1,000,000 bytes).
BYTES_PER_MB = 1000000

# Stat every file exactly once and reuse the sizes for all three statistics.
# os.path.join keeps the path valid whether or not DATASET_DIRECTORY ends
# with a path separator.
file_sizes = [os.path.getsize(os.path.join(DATASET_DIRECTORY, k)) for k in df_sets]

if file_sizes:
    print('Min file size: {} MB'.format(round(min(file_sizes) / BYTES_PER_MB, 2)))
    print('Max file size: {} MB'.format(round(max(file_sizes) / BYTES_PER_MB, 2)))
    print('Avg file size: {} MB'.format(round(np.mean(file_sizes) / BYTES_PER_MB, 2)))
else:
    # min()/max() raise on an empty sequence; report the real problem instead
    print('No CSV files found in {}'.format(DATASET_DIRECTORY))

Min file size: 62.41 MB
Max file size: 132.99 MB
Avg file size: 81.39 MB


In [137]:
# Load the last CSV of the training split into a dataframe
df = pd.read_csv(f'{DATASET_DIRECTORY}{training_sets[-1]}')

# Report the dataframe dimensions
n_rows, n_columns = df.shape
print(f'Rows: {n_rows}')
print(f'Columns: {n_columns}')

# Display the first 5 rows (must stay the last expression of the cell)
df.head()

Rows: 243649
Columns: 47


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000838,54.62,6.05,64.0,11.961779,11.961779,0.0,0.0,0.0,0.0,...,0.111473,54.45,83075980.0,9.5,10.392912,0.037895,0.0359,0.02,141.55,DDoS-TCP_Flood
1,0.005486,75.88,6.0,64.0,29.502125,29.502125,0.0,0.0,1.0,0.0,...,0.100314,54.24,83093250.0,9.5,10.395361,0.143036,0.346802,0.03,141.55,DDoS-SYN_Flood
2,0.0,0.0,45.61,65.81,151.517376,151.517376,0.0,0.0,0.0,0.0,...,57.165223,576.8,83693790.0,9.5,33.783684,80.958879,8638.780727,0.4,141.55,Mirai-greeth_flood
3,0.0,54.0,6.0,64.0,1.500542,1.500542,0.0,0.0,1.0,0.0,...,0.0,54.0,83094080.0,9.5,10.392305,0.0,0.0,0.0,141.55,DDoS-SYN_Flood
4,0.004568,745.42,5.95,65.13,8.0821,8.0821,0.0,0.0,0.0,0.0,...,549.190629,927.04,83335610.0,9.5,41.550978,776.661367,318084.344439,0.95,141.55,DDoS-ACK_Fragmentation


Columns used in the paper

In [138]:
# Feature columns, listed in the same order as they appear in the CSV header.
# Spellings such as 'Magnitue' and 'IPv' match the header and must not be
# "corrected" here.
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate',
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number',
    'ack_flag_number', 'ece_flag_number', 'cwr_flag_number',
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC',
    'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
    'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]

# Name of the target column
y_column = 'label'

In [139]:
# Report how many feature columns are used and which column is the target
n_features = len(X_columns)
print(f'X_columns: {n_features}')
print(f'y_column: {y_column}')

X_columns: 46
y_column: label


In [140]:
# Collect the distinct values of the y column once and keep them as
# attack_labels for the cells below
attack_labels = df[y_column].unique()

# Report how many distinct labels there are, then list them
print(f'Unique values in y_column: {len(attack_labels)}')
print(attack_labels)


Unique values in y_column: 34
['DDoS-TCP_Flood' 'DDoS-SYN_Flood' 'Mirai-greeth_flood'
 'DDoS-ACK_Fragmentation' 'Mirai-greip_flood' 'DDoS-ICMP_Flood'
 'DDoS-RSTFINFlood' 'DDoS-PSHACK_Flood' 'DoS-TCP_Flood'
 'DDoS-SynonymousIP_Flood' 'DDoS-ICMP_Fragmentation' 'DDoS-UDP_Flood'
 'DoS-UDP_Flood' 'DoS-SYN_Flood' 'BenignTraffic' 'DDoS-SlowLoris'
 'Mirai-udpplain' 'DDoS-UDP_Fragmentation' 'DictionaryBruteForce'
 'DNS_Spoofing' 'Recon-HostDiscovery' 'MITM-ArpSpoofing' 'Recon-OSScan'
 'Recon-PortScan' 'VulnerabilityScan' 'DoS-HTTP_Flood' 'CommandInjection'
 'DDoS-HTTP_Flood' 'SqlInjection' 'BrowserHijacking' 'Backdoor_Malware'
 'Recon-PingSweep' 'Uploading_Attack' 'XSS']


In [141]:
# Count the number of rows for each attack label.
# value_counts() tallies every label in a single pass instead of re-filtering
# the whole dataframe once per label; iterating attack_labels keeps the
# original print order.
label_counts = df[y_column].value_counts()
for label in attack_labels:
    # .get(..., 0) mirrors the previous equality filter, which reported 0 rows
    # for a label that never compares equal to itself (e.g. NaN)
    print('{}: {}'.format(label, label_counts.get(label, 0)))
    

DDoS-TCP_Flood: 23858
DDoS-SYN_Flood: 21241
Mirai-greeth_flood: 5180
DDoS-ACK_Fragmentation: 1455
Mirai-greip_flood: 3789
DDoS-ICMP_Flood: 37502
DDoS-RSTFINFlood: 21149
DDoS-PSHACK_Flood: 21507
DoS-TCP_Flood: 13898
DDoS-SynonymousIP_Flood: 18730
DDoS-ICMP_Fragmentation: 2382
DDoS-UDP_Flood: 27891
DoS-UDP_Flood: 17507
DoS-SYN_Flood: 10574
BenignTraffic: 5738
DDoS-SlowLoris: 131
Mirai-udpplain: 4626
DDoS-UDP_Fragmentation: 1442
DictionaryBruteForce: 69
DNS_Spoofing: 954
Recon-HostDiscovery: 711
MITM-ArpSpoofing: 1578
Recon-OSScan: 469
Recon-PortScan: 432
VulnerabilityScan: 165
DoS-HTTP_Flood: 384
CommandInjection: 33
DDoS-HTTP_Flood: 138
SqlInjection: 21
BrowserHijacking: 49
Backdoor_Malware: 10
Recon-PingSweep: 13
Uploading_Attack: 5
XSS: 18


In [142]:
# Attack labels grouped by category (33 attack classes + 1 for benign traffic).
# The position of a group is its id in the 7-class scheme; the position of a
# label in the flattened sequence is its id in the 34-class scheme.
_label_groups = (
    # 0: Benign
    ['BenignTraffic'],
    # 1: DDoS and DoS
    ['DDoS-RSTFINFlood', 'DDoS-PSHACK_Flood', 'DDoS-SYN_Flood', 'DDoS-UDP_Flood',
     'DDoS-TCP_Flood', 'DDoS-ICMP_Flood', 'DDoS-SynonymousIP_Flood',
     'DDoS-ACK_Fragmentation', 'DDoS-UDP_Fragmentation', 'DDoS-ICMP_Fragmentation',
     'DDoS-SlowLoris', 'DDoS-HTTP_Flood',
     'DoS-UDP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood', 'DoS-HTTP_Flood'],
    # 2: Mirai
    ['Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain'],
    # 3: Reconnaissance
    ['Recon-PingSweep', 'Recon-OSScan', 'Recon-PortScan', 'VulnerabilityScan',
     'Recon-HostDiscovery'],
    # 4: Spoofing
    ['DNS_Spoofing', 'MITM-ArpSpoofing'],
    # 5: Web
    ['BrowserHijacking', 'Backdoor_Malware', 'XSS', 'Uploading_Attack',
     'SqlInjection', 'CommandInjection'],
    # 6: Brute Force
    ['DictionaryBruteForce'],
)

# Flatten to (label, category id) pairs, preserving the order above
_label_categories = [
    (label_name, category_id)
    for category_id, group in enumerate(_label_groups)
    for label_name in group
]

# Label string -> 34-class id (0 = benign, 1..33 = individual attacks)
dict_34_classes = {
    label_name: class_id
    for class_id, (label_name, _) in enumerate(_label_categories)
}

# 34-class id -> 7-class id (the category the attack belongs to)
dict_7_classes = {
    class_id: category_id
    for class_id, (_, category_id) in enumerate(_label_categories)
}

# 34-class id -> 2-class id (0 = benign, 1 = any attack)
dict_2_classes = {
    class_id: (0 if category_id == 0 else 1)
    for class_id, category_id in dict_7_classes.items()
}

# Map y column to the dict_34_classes values.
# The column is overwritten in place, so only encode while it still holds the
# raw string labels: pushing already-encoded integers through the string-keyed
# dict_34_classes a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df[y_column]):
    encoded_labels = df[y_column].map(dict_34_classes)

    # Series.map yields NaN for values that are not keys of the dict; fail
    # loudly instead of carrying unlabeled rows forward.
    unmapped_labels = df.loc[encoded_labels.isna(), y_column].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(
            'Labels missing from dict_34_classes: {}'.format(list(unmapped_labels))
        )

    df[y_column] = encoded_labels


In [143]:
# Preview the first 10 entries of the y column
first_labels = df[y_column].head(10)
first_labels


0     5
1     3
2    17
3     3
4     8
5    18
6     6
7     1
8     2
9    15
Name: label, dtype: int64

In [145]:
# Frequency of every value in the y column
label_counts = df[y_column].value_counts()
print(label_counts)

# Number of distinct values present in the y column
n_unique_labels = len(df[y_column].unique())
print(f'Unique values in y_column: {n_unique_labels}')

6     37502
4     27891
5     23858
2     21507
3     21241
1     21149
7     18730
13    17507
15    13898
14    10574
0      5738
17     5180
19     4626
18     3789
10     2382
26     1578
8      1455
9      1442
25      954
24      711
21      469
22      432
16      384
23      165
12      138
11      131
33       69
27       49
32       33
31       21
29       18
20       13
28       10
30        5
Name: label, dtype: int64
Unique values in y_column: 34


In [146]:
# Further map the y_column to the dict_7_classes values.
# NOTE: this overwrites the 34-class ids in place, so the cell must not be
# re-run on its own. A second pass would push the 7-class ids through
# dict_7_classes again (e.g. 2 -> 1) and silently merge categories. To redo
# it, reload df and repeat the 34-class mapping first.
df[y_column] = df[y_column].map(dict_7_classes)

# Show the first 10 rows of the y column.
# Printed explicitly: a bare expression is only displayed when it is the last
# statement of a cell, so the previous mid-cell `head(10)` showed nothing.
print(df[y_column].head(10))

# Show the unique values in the y column and their counts
print(df[y_column].value_counts())


1    219789
2     13595
0      5738
4      2532
3      1790
5       136
6        69
Name: label, dtype: int64
