In [36]:
import os
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [37]:
dataset_path = "../datasets/Edge-IIoT/"

df = pd.read_pickle(dataset_path + "Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.pkl")   

<a id="Explore"></a>
## Explore the dataset

In [38]:
print("Total number of samples: ", df.shape[0])

Total number of samples:  157800


### Attack types and counts

In [39]:
print(df['Attack_type'].value_counts()) # print the number of each type of attack

# Sum the counts of each type of attack
print(df['Attack_type'].value_counts().sum())


Normal                   24301
DDoS_UDP                 14498
DDoS_ICMP                14090
Ransomware               10925
DDoS_HTTP                10561
SQL_injection            10311
Uploading                10269
DDoS_TCP                 10247
Backdoor                 10195
Vulnerability_scanner    10076
Port_Scanning            10071
XSS                      10052
Password                  9989
MITM                      1214
Fingerprinting            1001
Name: Attack_type, dtype: int64
157800


In [40]:
# Sum the number of rows where Attack_type != 'Normal'
print(f"Total number of attack rows = {df[df['Attack_type'] != 'Normal'].shape[0]}")
print(f"Total number of normal rows = {df[df['Attack_type'] == 'Normal'].shape[0]}")

print(f"Total number of rows = {df.shape[0]} - total Attack row {df[df['Attack_type'] != 'Normal'].shape[0]} = {df.shape[0] - df[df['Attack_type'] != 'Normal'].shape[0]}")


Total number of attack rows = 133499
Total number of normal rows = 24301
Total number of rows = 157800 - total Attack row 133499 = 24301


## IP Addresses

Define the Known IP Addresses, including attacker IPs.

In [55]:
known_sensor_ip_addresses = [ '192.168.0.101', '192.168.2.194', '192.168.3.18', '192.168.4.73', '192.168.5.47', '192.168.6.56', '192.768.7.62', '192.168.8.163']
print ("known_sensor_ip_addresses:", known_sensor_ip_addresses)

tcp_dos_attack_ip_addresses = [ '207.192.25.133', '94.196.109.185', '133.149.252.77', '220.146.94.148' ]
tdp_dos_atack_ip_addresses = [ '190.123.219.128', '16.226.184.201', '153.125.214.15', '91.184.12.91' ]
http_attack_ip_addresses = [ '192.168.0.170', '216.58.198.74' ]
icmp_flood_attack_ip_addresses = [ '213.117.18.213', '183.223.100.122', '166.153.227.121', '49.81.59.152', '227.117.33.125' ]
port_scan_attack_ip_addresses = [ '192.168.0.170' ]
os_fingerprinting_attack_ip_addresses = [ '192.168.0.170' ]
vuln_scan_attack_ip_addresses = [ '192.168.0.170', '142.250.200.205', '172.217.19.35', '142.250.201.10' ]
dns_spoof_attack_ip_addresses = [ '192.168.0.101', '192.168.0.152', '172.217.19.35', '192.168.0.170' ]
arp_spoof_attack_ip_addresses = [ '192.168.0.101', '192.168.0.152', '172.217.19.35', '192.168.0.170' ]
xss_attack_ip_addresses = [ '192.168.0.170', '172.217.19.42', '104.16.87.20' ]
sql_injection_attack_ip_addresses = [ '192.168.0.170' ]
upload_attack_ip_addresses = [ '192.168.0.170' ]
backdoor_attack_ip_addresses = [ '192.168.0.170' ]
password_attack_ip_addresses = [ '192.168.0.170' ]
ransomware_attack_ip_addresses = [ '192.168.0.170' ] 

# Combine all attack IP addresses into one list, ensuring no duplicates
known_attacker_ip_addresses = list(set(tcp_dos_attack_ip_addresses + tdp_dos_atack_ip_addresses + http_attack_ip_addresses + icmp_flood_attack_ip_addresses + port_scan_attack_ip_addresses + os_fingerprinting_attack_ip_addresses + vuln_scan_attack_ip_addresses + dns_spoof_attack_ip_addresses + arp_spoof_attack_ip_addresses + xss_attack_ip_addresses + sql_injection_attack_ip_addresses + upload_attack_ip_addresses + backdoor_attack_ip_addresses + password_attack_ip_addresses + ransomware_attack_ip_addresses))
print (f"known_attacker_ip_addresses: \nNumber of IPs {len(known_attacker_ip_addresses)}\n{known_attacker_ip_addresses}")


known_sensor_ip_addresses: ['192.168.0.101', '192.168.2.194', '192.168.3.18', '192.168.4.73', '192.168.5.47', '192.168.6.56', '192.768.7.62', '192.168.8.163']
known_attacker_ip_addresses: 
Number of IPs 22
['192.168.0.170', '172.217.19.42', '133.149.252.77', '207.192.25.133', '49.81.59.152', '142.250.201.10', '216.58.198.74', '227.117.33.125', '192.168.0.152', '166.153.227.121', '183.223.100.122', '16.226.184.201', '142.250.200.205', '220.146.94.148', '213.117.18.213', '91.184.12.91', '104.16.87.20', '192.168.0.101', '172.217.19.35', '153.125.214.15', '190.123.219.128', '94.196.109.185']


Show the different IP addresses in the dataset for Attack Type 0 (Benign) and Attack Type 1 (Malicious)

In [42]:
# Filter the DataFrame to only include rows where attack_label is 0
attack_label_0 = df[df["Attack_label"] == 0]

# Get the unique IP addresses in the ip.src_host and ip.dst_host columns
benign_unique_ips = pd.concat([attack_label_0["ip.src_host"], attack_label_0["ip.dst_host"]]).unique()

# Get the number of unique IP addresses
num_benign_unique_ips = len(benign_unique_ips)

# Print the number of unique IP addresses
print("Number of unique IP address pairs for attack_label 0:", num_benign_unique_ips)

# print out the unique IPs
print("Attack type 0 unique IPs:")
print(benign_unique_ips)

Number of unique IP addresses for attack_label 0: 13
Attack type 0 unique IPs:
['192.168.0.128' '192.168.0.101' '0' '0.0.0.0' '143.107.229.210'
 '85.254.217.5' '197.84.150.123' '160.119.193.252' '192.168.0.1'
 '224.0.0.251' '224.0.0.1' '94.228.220.14' '196.200.160.123']


In [53]:
# Filter the DataFrame to only include rows where attack_label is 1
attack_label_1 = df[df["Attack_label"] == 1]

# Get the unique IP addresses in the ip.src_host and ip.dst_host columns
malicious_unique_ips = pd.concat([attack_label_1["ip.src_host"], attack_label_1["ip.dst_host"]]).unique()

# Get the number of unique IP addresses
num_malicious_unique_ips = len(malicious_unique_ips)

# Print the number of unique IP addresses
print("Number of unique IP address pairs for attack_label 1:", num_malicious_unique_ips)

# print out the unique IPs
print("Attack type 1 unique IPs:")
print(malicious_unique_ips)

Number of unique IP addresses for attack_label 1: 25712
Attack type 1 unique IPs:
['192.168.0.152' '192.168.0.101' '0' ... '26.223.146.121' '188.160.30.160'
 '126.188.47.126']


See if any IPs are in both benign and malicious datasets.

In [44]:
# Determine how many ips are in both lists
print("Number of IPs in both lists:", len(set(benign_unique_ips).intersection(malicious_unique_ips)))

# Show the IPs that are in both lists
print("IPs in both lists:")
print(set(benign_unique_ips).intersection(malicious_unique_ips))


Number of IPs in both lists: 7
IPs in both lists:
{'0', '224.0.0.1', '192.168.0.1', '224.0.0.251', '192.168.0.101', '0.0.0.0', '192.168.0.128'}


Examine the ip.src.host and ip.dst_host columns where attack_label is 1. 
If one of the IPs is in the known_attacker_ip_addresses list, add the other IP address to a list of attacked_ips, ensuring the list is unique. This should tell us what is attacked


In [59]:
# look at the ip.src.host and ip.dst_host columns where attack_label is 1. If one of the IPs is in the known_attacker_ip_addresses list, add the other IP address to a list of attacked_ips. Ensure the list is unique.
attacked_ips = []
for index, row in attack_label_1.iterrows():
    if row["ip.src_host"] in known_attacker_ip_addresses:
        attacked_ips.append(row["ip.dst_host"])
    elif row["ip.dst_host"] in known_attacker_ip_addresses:
        attacked_ips.append(row["ip.src_host"])

# Ensure the list is unique
attacked_ips = list(set(attacked_ips))

# Print the list of attacked IPs
print("Attacked IPs:")
print(attacked_ips)


Attacked IPs:
['192.168.0.128', '0.0']


It appears that the attacked IP is `192.168.0.128` which is an `edge server ip.`

### Explore the src/dst combinations
#### Benign

In [45]:
# Count the number of unique combinations of ip.src_host and ip.dst_host where Attack_label == 0. Show the top 15 results
print(df[df["Attack_label"] == 0].groupby(["ip.src_host", "ip.dst_host"]).size().sort_values(ascending=False).head(15))
print(df[df["Attack_label"] == 0].groupby(["ip.src_host", "ip.dst_host"]).size().sum())  

ip.src_host      ip.dst_host    
192.168.0.128    192.168.0.101      10089
192.168.0.101    192.168.0.128       9970
0                0                   4190
192.168.0.128    192.168.0.1           21
                 224.0.0.251           12
192.168.0.1      192.168.0.128          7
0.0.0.0          224.0.0.1              4
143.107.229.210  192.168.0.128          1
160.119.193.252  192.168.0.128          1
192.168.0.101    224.0.0.251            1
192.168.0.128    196.200.160.123        1
                 197.84.150.123         1
                 94.228.220.14          1
197.84.150.123   192.168.0.128          1
85.254.217.5     192.168.0.128          1
dtype: int64
24301


#### Malicious

In [46]:
# Count the number of unique combinations of ip.src_host and ip.dst_host where Attack_label == 1. Show the top 15 results
print(df[df["Attack_label"] == 1].groupby(["ip.src_host", "ip.dst_host"]).size().sort_values(ascending=False).head(15))
print(df[df["Attack_label"] == 1].groupby(["ip.src_host", "ip.dst_host"]).size().sum())  

ip.src_host    ip.dst_host  
192.168.0.170  192.168.0.128    47686
192.168.0.128  192.168.0.170    42290
               0                13096
0              0                 2928
               0.0                873
192.168.0.128  224.0.0.251        176
192.168.0.101  0.0                155
0.0.0.0        224.0.0.1          146
192.168.0.128  224.0.0.252        138
192.168.0.152  0.0                120
0.0.0.0        0.0                 58
192.168.0.128  172.217.19.42       19
172.217.19.42  192.168.0.128       17
192.168.0.128  104.16.87.20        17
               192.168.0.101        8
dtype: int64
133499


In [47]:
# Examine the ip.src_host and ip.dst_host columns, if one of the IPs is in the known_attacker_ip_addresses list, then show the other IP address
print(df[(df["ip.src_host"].isin(known_attacker_ip_addresses)) & (~df["ip.dst_host"].isin(known_attacker_ip_addresses))][["ip.src_host", "ip.dst_host"]].head(15))

# Create a list of unique iP addresses that match the previous criteria
ip_addresses = df[(df["ip.src_host"].isin(known_attacker_ip_addresses)) & (~df["ip.dst_host"].isin(known_attacker_ip_addresses))]["ip.dst_host"].unique()

# Print out the list
print(ip_addresses)


      ip.src_host ip.dst_host
0   192.168.0.152         0.0
1   192.168.0.101         0.0
2   192.168.0.152         0.0
3   192.168.0.101         0.0
4   192.168.0.152         0.0
5   192.168.0.101         0.0
6   192.168.0.152         0.0
7   192.168.0.101         0.0
16  192.168.0.152         0.0
17  192.168.0.101         0.0
22  192.168.0.152         0.0
23  192.168.0.101         0.0
24  192.168.0.152         0.0
25  192.168.0.101         0.0
26  192.168.0.152         0.0
['0.0' '192.168.0.128' '224.0.0.251']


Find Invalid IP Addresses

In [48]:

# Count the number of lines with an IP of 0, 0.0, 0.0.0.0 or Null in ip.src_host or ip.dst_host
print("Total number of rows where ip.src_host or ip.dst_host is 0, 0.0, 0.0.0.0")
print(df[(df["ip.src_host"] == "0") | (df["ip.src_host"] == "0.0") | (df["ip.src_host"] == "0.0.0.0") | (df["ip.src_host"].isnull()) | (df["ip.dst_host"] == "0") | (df["ip.dst_host"] == "0.0") | (df["ip.dst_host"] == "0.0.0.0") | (df["ip.dst_host"].isnull())].shape)


Total number of rows where ip.src_host or ip.dst_host is 0, 0.0, 0.0.0.0
(21578, 63)


In [49]:
# # Remove those rows from the df
# df = df[(df["ip.src_host"] != "0") & (df["ip.src_host"] != "0.0") & (df["ip.src_host"] != "0.0.0.0") & (df["ip.src_host"].notnull()) & (df["ip.dst_host"] != "0") & (df["ip.dst_host"] != "0.0") & (df["ip.dst_host"] != "0.0.0.0") & (df["ip.dst_host"].notnull())]


In [50]:
# Count the number of unique combinations of ip.src_host and ip.dst_host where Attack_label == 1. Show the top 5 results
print("The top 15 unique combinations of ip.src_host and ip.dst_host where Attack_label == 1")
print(df[df["Attack_label"] == 1].groupby(["ip.src_host", "ip.dst_host", "Attack_type"]).size().sort_values(ascending=False).head(15))                                       
print(df[df["Attack_label"] == 1].groupby(["ip.src_host", "ip.dst_host"]).size().sum()) 

The top 15 unique combinations of ip.src_host and ip.dst_host where Attack_label == 1
ip.src_host    ip.dst_host    Attack_type          
192.168.0.128  0              DDoS_UDP                 13096
192.168.0.170  192.168.0.128  DDoS_HTTP                 7068
                              Uploading                 5657
                              SQL_injection             5293
                              XSS                       5109
192.168.0.128  192.168.0.170  Vulnerability_scanner     5030
192.168.0.170  192.168.0.128  Password                  5024
                              Vulnerability_scanner     5014
192.168.0.128  192.168.0.170  SQL_injection             5001
                              Backdoor                  4997
                              Password                  4951
                              Ransomware                4892
192.168.0.170  192.168.0.128  Backdoor                  4864
                              Ransomware                4796
192.168.

In [51]:
# Count the number of unique IP address in the ip.src_host column that contains 192.168
print("Number of unique IP addresses in the ip.src_host column that contains 192.168:")
print(df[df["ip.src_host"].str.contains("192.168")]["ip.src_host"].nunique())

# Show them
print(df[df["ip.src_host"].str.contains("192.168")]["ip.src_host"].unique())

# Count the number of unique IP addresses in the ip.src_host column that contains 192.168 and Attack_label == 1
print("\nNumber of unique IP addresses in the ip.src_host column that contains 192.168 and Attack_label == 1:")
print(df[(df["ip.src_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.src_host"].nunique())
print(df[(df["ip.src_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.src_host"].unique())
# Show the count of each unique value in the ip.src_host column that contains 192.168 and Attack_label == 1
print(df[(df["ip.src_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.src_host"].value_counts())

# Do the same again for the ip.dst_host column
print("\nNumber of unique IP addresses in the ip.dst_host column that contains 192.168:")
print(df[df["ip.dst_host"].str.contains("192.168")]["ip.dst_host"].nunique())

# Show them
print("\nUnique IP addresses in the ip.dst_host column that contains 192.168:")
print(df[df["ip.dst_host"].str.contains("192.168")]["ip.dst_host"].unique())

# Count the number of unique IP addresses in the ip.dst_host column that contains 192.168 and Attack_label == 1 
print("\nNumber of unique IP addresses in the ip.dst_host column that contains 192.168 and Attack_label == 1:")
print(df[(df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.dst_host"].nunique())
print(df[(df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.dst_host"].unique())
# Show the count of each unique value in the ip.dst_host column that contains 192.168 and Attack_label == 1
print(df[(df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)]["ip.dst_host"].value_counts())

# Count the number of unique ip.src_host and ip.dst_host combinations where Attack_label == 1 and neither column contains 192.168
print("\nNumber of unique ip.src_host and ip.dst_host combinations where Attack_label == 1 and neither column contains 192.168:")
print(df[(~df["ip.src_host"].str.contains("192.168")) & (~df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)][["ip.src_host", "ip.dst_host"]].nunique())
# Show counts of each unique combination
print(df[(~df["ip.src_host"].str.contains("192.168")) & (~df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)][["ip.src_host", "ip.dst_host"]].value_counts())

# Show the attack types for the unique combinations of ip.src_host and ip.dst_host where Attack_label == 1 and neither column contains 192.168
print("\nAttack types for the unique combinations of ip.src_host and ip.dst_host where Attack_label == 1 and neither column contains 192.168:")
print(df[(~df["ip.src_host"].str.contains("192.168")) & (~df["ip.dst_host"].str.contains("192.168")) & (df["Attack_label"] == 1)][["ip.src_host", "ip.dst_host", "Attack_type"]].value_counts())



Number of unique IP addresses in the ip.src_host column that contains 192.168:
5
['192.168.0.152' '192.168.0.101' '192.168.0.128' '192.168.0.1'
 '192.168.0.170']

Number of unique IP addresses in the ip.src_host column that contains 192.168 and Attack_label == 1:
5
['192.168.0.152' '192.168.0.101' '192.168.0.128' '192.168.0.1'
 '192.168.0.170']
192.168.0.128    62421
192.168.0.170    47688
192.168.0.101      155
192.168.0.152      120
192.168.0.1          2
Name: ip.src_host, dtype: int64

Number of unique IP addresses in the ip.dst_host column that contains 192.168:
4

Unique IP addresses in the ip.dst_host column that contains 192.168:
['192.168.0.128' '192.168.0.170' '192.168.0.101' '192.168.0.1']

Number of unique IP addresses in the ip.dst_host column that contains 192.168 and Attack_label == 1:
3
['192.168.0.128' '192.168.0.170' '192.168.0.101']
192.168.0.128    65392
192.168.0.170    42290
192.168.0.101        8
Name: ip.dst_host, dtype: int64

Number of unique ip.src_host and i

In [52]:
# How many rows have the same ip.src_host and ip.dst_host value in the same row
print("Number of rows where ip.src_host and ip.dst_host are the same:")
print(df[df["ip.src_host"] == df["ip.dst_host"]].shape[0])

# Show the first 10 rows where ip.src_host and ip.dst_host are the same
print(df[df["ip.src_host"] == df["ip.dst_host"]].head(10))


Number of rows where ip.src_host and ip.dst_host are the same:
8520
                     frame.time ip.src_host ip.dst_host arp.dst.proto_ipv4  \
1221   2021 22:14:35.855090000            0           0      192.168.0.128   
1222   2021 22:14:35.855154000            0           0      192.168.0.170   
1223   2021 22:14:35.995475000            0           0      192.168.0.170   
1224   2021 22:14:35.999272000            0           0      192.168.0.128   
1242   2021 22:15:13.959913000            0           0      192.168.0.128   
1243   2021 22:15:13.959976000            0           0      192.168.0.170   
1244   2021 22:15:14.005505000            0           0      192.168.0.170   
1245   2021 22:15:14.008899000            0           0      192.168.0.128   
1251   2021 22:15:16.545755000            0           0        192.168.0.1   
1256   2021 22:15:28.561712000            0           0        192.168.0.1   

      arp.opcode  arp.hw.size arp.src.proto_ipv4  icmp.checksum  icmp.seq

zero IP addresses appear to be ARP messages.