In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [3]:
dataset_path = "../datasets/Edge-IIoT/"

df = pd.read_pickle(dataset_path + "Edge-IIoTset dataset/Selected dataset for ML and DL/DNN-EdgeIIoT-dataset.pkl")   

## Explore the dataset

In [4]:
print("Total number of samples: ", df.shape[0])

Total number of samples:  2219201


### Attack types and counts

In [5]:
print(df['Attack_type'].value_counts()) # print the number of each type of attack

# Sum the counts of each type of attack
print(df['Attack_type'].value_counts().sum())


Normal                   1615643
DDoS_UDP                  121568
DDoS_ICMP                 116436
SQL_injection              51203
Password                   50153
Vulnerability_scanner      50110
DDoS_TCP                   50062
DDoS_HTTP                  49911
Uploading                  37634
Backdoor                   24862
Port_Scanning              22564
XSS                        15915
Ransomware                 10925
MITM                        1214
Fingerprinting              1001
Name: Attack_type, dtype: int64
2219201


In [26]:
# Sum the number of rows where Attack_type != 'Normal'
print(f"Total number of attack rows = {df[df['Attack_type'] != 'Normal'].shape[0]}")

print(f'Veryify: Total rows 2219201 - Attack rows 1615643 = {2219201 - 1615643}')

Total number of attack rows = 603558
Veryify: Total rows 2219201 - Attack rows 1615643 = 603558


### IP Addresses

Show the different IP addresses in the dataset for Attack Type 0 (Benign) and Attack Type 1 (Malicious)

In [27]:
# Filter the DataFrame to only include rows where attack_label is 0
attack_label_0 = df[df["Attack_label"] == 0]

# Get the unique IP addresses in the ip.src_host and ip.dst_host columns
unique_ips = pd.concat([attack_label_0["ip.src_host"], attack_label_0["ip.dst_host"]]).unique()

# Get the number of unique IP addresses
num_unique_ips = len(unique_ips)

# Print the number of unique IP addresses
print("Number of unique IP addresses for attack_label 0:", num_unique_ips)

# print out the unique IPs
print("Attack type 0 unique IPs:")
print(unique_ips)

Number of unique IP addresses for attack_label 0: 28
Attack type 0 unique IPs:
['192.168.0.128' '192.168.0.101' '0' '0.0.0.0' '192.168.0.1'
 '146.101.60.86' '168.167.71.131' '94.228.220.14' '143.107.229.210'
 '85.254.217.5' '197.82.150.123' '13.244.55.14' '197.84.150.123'
 '160.119.193.252' '196.200.160.123' '224.0.0.1' '224.0.0.251'
 '31.131.0.123' '255.255.255.255' '192.168.0.255' '45.159.204.28'
 '62.149.0.30' '5.189.141.35' '196.10.52.58' '45.85.15.7' '196.10.55.57'
 '45.222.43.250' '224.0.0.252']


In [28]:
# Filter the DataFrame to only include rows where attack_label is 0
attack_label_0 = df[df["Attack_label"] == 1]

# Get the unique IP addresses in the ip.src_host and ip.dst_host columns
unique_ips = pd.concat([attack_label_0["ip.src_host"], attack_label_0["ip.dst_host"]]).unique()

# Get the number of unique IP addresses
num_unique_ips = len(unique_ips)

# Print the number of unique IP addresses
print("Number of unique IP addresses for attack_label 0:", num_unique_ips)

# print out the unique IPs
print("Attack type 1 unique IPs:")
print(unique_ips)

Number of unique IP addresses for attack_label 0: 176675
Attack type 1 unique IPs:
['192.168.0.152' '192.168.0.101' '0' ... '158.218.153.182'
 '191.173.55.173' '242.206.172.124']


Explore the src/dst combinations

In [29]:
# Count the number of unique combinations of ip.src_host and ip.dst_host where Attack_label == 1. Show the top 5 results
print(df[df["Attack_label"] == 1].groupby(["ip.src_host", "ip.dst_host"]).size().sort_values(ascending=False).head(15))
print(df[df["Attack_label"] == 1].groupby(["ip.src_host", "ip.dst_host"]).size().sum())  



ip.src_host    ip.dst_host    
192.168.0.170  192.168.0.128      164980
192.168.0.128  192.168.0.170      143311
               0                  110035
0              0                    5068
               0.0                   873
192.168.0.128  224.0.0.251           275
0.0.0.0        224.0.0.1             272
192.168.0.128  224.0.0.252           196
192.168.0.101  0.0                   155
192.168.0.152  0.0                   120
0.0.0.0        0.0                    58
192.168.0.128  172.217.19.42          29
172.217.19.42  192.168.0.128          22
192.168.0.128  104.16.87.20           21
               142.250.200.205        19
dtype: int64
603558


Find Invalid IP Addresses

In [30]:

# Count the number of lines with an IP of 0, 0.0, 0.0.0.0 or Null in ip.src_host or ip.dst_host
print("Total number of rows where ip.src_host or ip.dst_host is 0, 0.0, 0.0.0.0")
print(df[(df["ip.src_host"] == "0") | (df["ip.src_host"] == "0.0") | (df["ip.src_host"] == "0.0.0.0") | (df["ip.src_host"].isnull()) | (df["ip.dst_host"] == "0") | (df["ip.dst_host"] == "0.0") | (df["ip.dst_host"] == "0.0.0.0") | (df["ip.dst_host"].isnull())].shape)


Total number of rows where ip.src_host or ip.dst_host is 0, 0.0, 0.0.0.0
(396939, 63)


In [31]:
# # Remove those rows from the df
# df = df[(df["ip.src_host"] != "0") & (df["ip.src_host"] != "0.0") & (df["ip.src_host"] != "0.0.0.0") & (df["ip.src_host"].notnull()) & (df["ip.dst_host"] != "0") & (df["ip.dst_host"] != "0.0") & (df["ip.dst_host"] != "0.0.0.0") & (df["ip.dst_host"].notnull())]


In [35]:
# Count the number of unique combinations of ip.src_host and ip.dst_host where Attack_label == 1. Show the top 5 results
print("The top 15 unique combinations of ip.src_host and ip.dst_host where Attack_label == 1")
print(df[df["Attack_label"] == 1].groupby(["ip.src_host", "ip.dst_host", "Attack_type"]).size().sort_values(ascending=False).head(15))                                       
print(df[df["Attack_label"] == 1].groupby(["ip.src_host", "ip.dst_host"]).size().sum()) 

The top 15 unique combinations of ip.src_host and ip.dst_host where Attack_label == 1
ip.src_host    ip.dst_host    Attack_type          
192.168.0.128  0              DDoS_UDP                 110035
192.168.0.170  192.168.0.128  DDoS_HTTP                 33330
                              SQL_injection             26375
192.168.0.128  192.168.0.170  Password                  25265
                              Vulnerability_scanner     25085
192.168.0.170  192.168.0.128  Vulnerability_scanner     24886
                              Password                  24814
192.168.0.128  192.168.0.170  SQL_injection             24771
192.168.0.170  192.168.0.128  Uploading                 20470
192.168.0.128  192.168.0.170  Uploading                 17101
                              DDoS_HTTP                 16564
                              Backdoor                  12200
192.168.0.170  192.168.0.128  Backdoor                  11826
                              Port_Scanning             

In [37]:
# Count the number of unique IP address in the ip.src_host column that contains 192.168
print("Number of unique IP addresses in the ip.src_host column that contains 192.168:")
print(df[df["ip.src_host"].str.contains("192.168")]["ip.src_host"].nunique())

# Show them
print(df[df["ip.src_host"].str.contains("192.168")]["ip.src_host"].unique())

# Count the number of unique IP addresses in the ip.src_host column that contains 192.168 and Attack_label == 1
print("\nNumber of unique IP addresses in the ip.src_host column that contains 192.168 and Attack_label == 1:")
print(df[(df["ip.src_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.src_host"].nunique())
print(df[(df["ip.src_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.src_host"].unique())
# Show the count of each unique value in the ip.src_host column that contains 192.168 and Attack_label == 1
print(df[(df["ip.src_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.src_host"].value_counts())

# Do the same again for the ip.dst_host column
print("\nNumber of unique IP addresses in the ip.dst_host column that contains 192.168:")
print(df[df["ip.dst_host"].str.contains("192.168")]["ip.dst_host"].nunique())

# Show them
print("\nUnique IP addresses in the ip.dst_host column that contains 192.168:")
print(df[df["ip.dst_host"].str.contains("192.168")]["ip.dst_host"].unique())

# Count the number of unique IP addresses in the ip.dst_host column that contains 192.168 and Attack_label == 1 
print("\nNumber of unique IP addresses in the ip.dst_host column that contains 192.168 and Attack_label == 1:")
print(df[(df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.dst_host"].nunique())
print(df[(df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.dst_host"].unique())
# Show the count of each unique value in the ip.dst_host column that contains 192.168 and Attack_label == 1
print(df[(df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.dst_host"].value_counts())

# Count the number of unique ip.src_host and ip.dst_host combinations where Attack_label == 1 and neither column contains 192.168
print("\nNumber of unique ip.src_host and ip.dst_host combinations where Attack_label == 1 and neither column contains 192.168:")
print(df[(~df["ip.src_host"].str.contains("192.168")) & (~df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)][["ip.src_host", "ip.dst_host"]].nunique())
# Show counts of each unique combination
print(df[(~df["ip.src_host"].str.contains("192.168")) & (~df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)][["ip.src_host", "ip.dst_host"]].value_counts())

# Show the attack types for the unique combinations of ip.src_host and ip.dst_host where Attack_label == 1 and neither column contains 192.168
print("\nAttack types for the unique combinations of ip.src_host and ip.dst_host where Attack_label == 1 and neither column contains 192.168:")
print(df[(~df["ip.src_host"].str.contains("192.168")) & (~df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)][["ip.src_host", "ip.dst_host", "Attack_type"]].value_counts())



Number of unique IP addresses in the ip.src_host column that contains 192.168:
5
['192.168.0.128' '192.168.0.101' '192.168.0.1' '192.168.0.152'
 '192.168.0.170']

Number of unique IP addresses in the ip.src_host column that contains 192.168 and Attack_label == 1:
5
['192.168.0.152' '192.168.0.101' '192.168.0.128' '192.168.0.1'
 '192.168.0.170']
192.168.0.128    294788
192.168.0.170    164982
192.168.0.101       155
192.168.0.152       120
192.168.0.1           2
Name: ip.src_host, dtype: int64

Number of unique IP addresses in the ip.dst_host column that contains 192.168:
5

Unique IP addresses in the ip.dst_host column that contains 192.168:
['192.168.0.101' '192.168.0.128' '192.168.0.1' '192.168.0.255'
 '192.168.0.170']

Number of unique IP addresses in the ip.dst_host column that contains 192.168 and Attack_label == 1:
3
['192.168.0.128' '192.168.0.170' '192.168.0.101']
192.168.0.128    290687
192.168.0.170    143311
192.168.0.101        12
Name: ip.dst_host, dtype: int64

Number of

In [40]:
# How many rows have the same ip.src_host and ip.dst_host value in the same row
print("Number of rows where ip.src_host and ip.dst_host are the same:")
print(df[df["ip.src_host"] == df["ip.dst_host"]].shape[0])

# Show the first 10 rows where ip.src_host and ip.dst_host are the same
print(df[df["ip.src_host"] == df["ip.dst_host"]].head(10))


Number of rows where ip.src_host and ip.dst_host are the same:
296264
                    frame.time ip.src_host ip.dst_host arp.dst.proto_ipv4  \
386   2021 11:44:32.409815000            0           0      192.168.0.128   
387   2021 11:44:32.409849000            0           0        192.168.0.1   
693   2021 11:44:51.587220000            0           0                  0   
694   2021 11:44:51.993461000            0           0                  0   
712   2021 11:44:52.990740000            0           0                  0   
761   2021 11:44:56.487014000            0           0                  0   
810   2021 11:44:58.721757000            0           0                  0   
811   2021 11:44:59.361750000            0           0                  0   
908   2021 11:45:04.641742000            0           0        192.168.0.1   
909   2021 11:45:04.642986000            0           0      192.168.0.128   

     arp.opcode  arp.hw.size arp.src.proto_ipv4  icmp.checksum  icmp.seq_le  \
386

In [43]:
# Show row 1 of the dataframe in full without any truncation
pd.set_option("display.max_columns", None)
print(df.iloc[1])


frame.time             2021 11:44:10.162218000 
ip.src_host                       192.168.0.101
ip.dst_host                       192.168.0.128
arp.dst.proto_ipv4                            0
arp.opcode                                  0.0
                                ...            
mbtcp.len                                   0.0
mbtcp.trans_id                              0.0
mbtcp.unit_id                               0.0
Attack_label                                  0
Attack_type                              Normal
Name: 1, Length: 63, dtype: object
