**Edge-IIoTset Dataset Exploration and Preprocessing**

In [None]:
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()  # apply seaborn's default theme to every matplotlib figure drawn below

**Import the data into a pandas data-frame and inspect its properties.**

In [None]:
# Load the raw Edge-IIoTset CSV. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv('/content/drive/MyDrive/DNN-EdgeIIoT-dataset.csv', low_memory=False)

# A bare `df.shape` in the middle of a cell is evaluated and thrown away;
# print it so the dimensions actually appear in the cell output.
print(df.shape)  # expected: (2219201, 63)
df.head()

Unnamed: 0,frame.time,ip.src_host,ip.dst_host,arp.dst.proto_ipv4,arp.opcode,arp.hw.size,arp.src.proto_ipv4,icmp.checksum,icmp.seq_le,icmp.transmit_timestamp,...,mqtt.proto_len,mqtt.protoname,mqtt.topic,mqtt.topic_len,mqtt.ver,mbtcp.len,mbtcp.trans_id,mbtcp.unit_id,Attack_label,Attack_type
0,2021 11:44:10.081753000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
1,2021 11:44:10.162218000,192.168.0.101,192.168.0.128,0,0.0,0.0,0,0.0,0.0,0.0,...,4.0,MQTT,0,0.0,4.0,0.0,0.0,0.0,0,Normal
2,2021 11:44:10.162271000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
3,2021 11:44:10.162641000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
4,2021 11:44:10.166132000,192.168.0.101,192.168.0.128,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,Temperature_and_Humidity,24.0,0.0,0.0,0.0,0.0,0,Normal


**The data is in tabular form, with 2219201 entries, each having 63 distinct attributes.**

In [None]:
# Materialize the column labels as a plain Python list and display them.
columns = df.columns.tolist()
columns


['frame.time',
 'ip.src_host',
 'ip.dst_host',
 'arp.dst.proto_ipv4',
 'arp.opcode',
 'arp.hw.size',
 'arp.src.proto_ipv4',
 'icmp.checksum',
 'icmp.seq_le',
 'icmp.transmit_timestamp',
 'icmp.unused',
 'http.file_data',
 'http.content_length',
 'http.request.uri.query',
 'http.request.method',
 'http.referer',
 'http.request.full_uri',
 'http.request.version',
 'http.response',
 'http.tls_port',
 'tcp.ack',
 'tcp.ack_raw',
 'tcp.checksum',
 'tcp.connection.fin',
 'tcp.connection.rst',
 'tcp.connection.syn',
 'tcp.connection.synack',
 'tcp.dstport',
 'tcp.flags',
 'tcp.flags.ack',
 'tcp.len',
 'tcp.options',
 'tcp.payload',
 'tcp.seq',
 'tcp.srcport',
 'udp.port',
 'udp.stream',
 'udp.time_delta',
 'dns.qry.name',
 'dns.qry.name.len',
 'dns.qry.qu',
 'dns.qry.type',
 'dns.retransmission',
 'dns.retransmit_request',
 'dns.retransmit_request_in',
 'mqtt.conack.flags',
 'mqtt.conflag.cleansess',
 'mqtt.conflags',
 'mqtt.hdrflags',
 'mqtt.len',
 'mqtt.msg_decoded_as',
 'mqtt.msg',
 'mqtt.m

**Information About Data**

In [None]:
# Summarize the frame: index range, column names and per-column dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2219201 entries, 0 to 2219200
Data columns (total 63 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   frame.time                 object 
 1   ip.src_host                object 
 2   ip.dst_host                object 
 3   arp.dst.proto_ipv4         object 
 4   arp.opcode                 float64
 5   arp.hw.size                float64
 6   arp.src.proto_ipv4         object 
 7   icmp.checksum              float64
 8   icmp.seq_le                float64
 9   icmp.transmit_timestamp    float64
 10  icmp.unused                float64
 11  http.file_data             object 
 12  http.content_length        float64
 13  http.request.uri.query     object 
 14  http.request.method        object 
 15  http.referer               object 
 16  http.request.full_uri      object 
 17  http.request.version       object 
 18  http.response              float64
 19  http.tls_port              float64
 20  tc

**Let’s see how many unique values there are in each column.**

In [None]:
# Distinct-value count per column (column-wise is the default axis).
df.nunique()

frame.time            2206364
ip.src_host            137167
ip.dst_host             52425
arp.dst.proto_ipv4          9
arp.opcode                  3
                       ...   
mbtcp.len                   8
mbtcp.trans_id            151
mbtcp.unit_id               6
Attack_label                2
Attack_type                15
Length: 63, dtype: int64

**Now, inspect for NaNs in the data.**

In [None]:
# Per-column count of missing values.
df.isnull().sum()

frame.time            0
ip.src_host           0
ip.dst_host           0
arp.dst.proto_ipv4    0
arp.opcode            0
                     ..
mbtcp.len             0
mbtcp.trans_id        0
mbtcp.unit_id         0
Attack_label          0
Attack_type           0
Length: 63, dtype: int64

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

False

**Let’s see how many duplicate entries there are and drop them if there are any.**

In [None]:
# Count rows that are exact duplicates of an earlier row.
# This cell only reports the number; nothing is removed here.
print(f'Total duplicate rows: {df.duplicated().sum()}')

Total duplicate rows: 815


In [None]:
# Keep the first occurrence of every row and discard later exact repeats.
df = df.drop_duplicates()

In [None]:
# Dimensions after removing the duplicate rows.
df.shape

(2218386, 63)

In [None]:
# Re-count exact duplicate rows to verify the removal.
print(f'total duplicate rows: {df.duplicated().sum()}')

total duplicate rows: 0


In [None]:
# Preview the first rows of the de-duplicated frame.
df.head()

Unnamed: 0,frame.time,ip.src_host,ip.dst_host,arp.dst.proto_ipv4,arp.opcode,arp.hw.size,arp.src.proto_ipv4,icmp.checksum,icmp.seq_le,icmp.transmit_timestamp,...,mqtt.proto_len,mqtt.protoname,mqtt.topic,mqtt.topic_len,mqtt.ver,mbtcp.len,mbtcp.trans_id,mbtcp.unit_id,Attack_label,Attack_type
0,2021 11:44:10.081753000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
1,2021 11:44:10.162218000,192.168.0.101,192.168.0.128,0,0.0,0.0,0,0.0,0.0,0.0,...,4.0,MQTT,0,0.0,4.0,0.0,0.0,0.0,0,Normal
2,2021 11:44:10.162271000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
3,2021 11:44:10.162641000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
4,2021 11:44:10.166132000,192.168.0.101,192.168.0.128,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,Temperature_and_Humidity,24.0,0.0,0.0,0.0,0.0,0,Normal


**Remove insignificant columns from data frame**

In [None]:
from sklearn.utils import shuffle

# Fixed seed so the shuffled row order (and therefore the exported CSV) is
# the same on every Restart & Run All.
SHUFFLE_SEED = 42

# Columns removed before encoding: the timestamp, host/address fields, port
# fields, payload/URI/message text, and the Attack_label column.
# Attack_type is kept as the label.
drop_columns = ["frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4","arp.dst.proto_ipv4", 
                "http.file_data","http.request.full_uri","icmp.transmit_timestamp",
                "http.request.uri.query", "tcp.options","tcp.payload","tcp.srcport",
                "tcp.dstport", "udp.port", "mqtt.msg", "Attack_label"]

# Build the cleaned frame with a method chain instead of mutating in place:
#   1. drop the columns listed above
#   2. drop any row containing a NaN
#   3. drop rows that became exact duplicates once those columns were removed
df = (
    df.drop(columns=drop_columns)
      .dropna(axis=0, how='any')
      .drop_duplicates(subset=None, keep="first")
)

# Randomize row order reproducibly.
df = shuffle(df, random_state=SHUFFLE_SEED)

# The original bare `df.isna().sum()` here was evaluated and discarded;
# assert the invariant instead so a regression fails loudly.
assert not df.isna().values.any(), "NaN values remain after dropna"
df.head()

Unnamed: 0,arp.opcode,arp.hw.size,icmp.checksum,icmp.seq_le,icmp.unused,http.content_length,http.request.method,http.referer,http.request.version,http.response,...,mqtt.msgtype,mqtt.proto_len,mqtt.protoname,mqtt.topic,mqtt.topic_len,mqtt.ver,mbtcp.len,mbtcp.trans_id,mbtcp.unit_id,Attack_type
1860485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Password
1526476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
1796404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DDoS_TCP
143399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal
859515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal


In [None]:
# Dimensions after column removal, NaN/duplicate filtering and shuffling.
df.shape

(1909671, 47)

In [None]:
# Re-inspect dtypes to see which remaining columns are still `object`.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1909671 entries, 1860485 to 1293281
Data columns (total 47 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   arp.opcode                 float64
 1   arp.hw.size                float64
 2   icmp.checksum              float64
 3   icmp.seq_le                float64
 4   icmp.unused                float64
 5   http.content_length        float64
 6   http.request.method        object 
 7   http.referer               object 
 8   http.request.version       object 
 9   http.response              float64
 10  http.tls_port              float64
 11  tcp.ack                    float64
 12  tcp.ack_raw                float64
 13  tcp.checksum               float64
 14  tcp.connection.fin         float64
 15  tcp.connection.rst         float64
 16  tcp.connection.syn         float64
 17  tcp.connection.synack      float64
 18  tcp.flags                  float64
 19  tcp.flags.ack              float64
 

**Visualize the columns with object dtype.**

In [None]:
# Frequency of each distinct value in http.request.method.
df['http.request.method'].value_counts()

0.0         1658469
0            219116
GET           30473
POST           1212
TRACE           387
OPTIONS           6
PROPFIND          4
PUT               2
SEARCH            2
Name: http.request.method, dtype: int64

In [None]:
# Frequency of each distinct value in http.referer.
df['http.referer'].value_counts() 

0.0                                                                     1796035
0                                                                        113066
127.0.0.1                                                                   383
() { _; } >_[$($())] { echo 93e4r0-CVE-2014-6278: true; echo;echo; }        185
TESTING_PURPOSES_ONLY                                                         2
Name: http.referer, dtype: int64

In [None]:
# Frequency of each distinct value in http.request.version.
df['http.request.version'].value_counts() 

0.0                                                                                1658111
0                                                                                   219474
HTTP/1.1                                                                             28681
HTTP/1.0                                                                              3382
-a HTTP/1.1                                                                              4
Src=javascript:alert('Vulnerable')><Img Src=\" HTTP/1.1                                  4
-al&ABSOLUTE_PATH_STUDIP=http://cirt.net/rfiinc.txt?? HTTP/1.1                           3
script>alert(1)/script><\" HTTP/1.1                                                      3
-al&_PHPLIB[libdir]=http://cirt.net/rfiinc.txt?? HTTP/1.1                                2
> HTTP/1.1                                                                               2
/etc/passwd|?data=Download HTTP/1.1                                                      2

In [None]:
# Frequency of each distinct value in dns.qry.name.len.
# NOTE(review): the output lists host names (e.g. '0.debian.pool.ntp.org') in
# this nominally numeric "len" column; possibly a column shift in the source
# CSV. Confirm before relying on this feature.
df['dns.qry.name.len'].value_counts() 

0                         1362163
0.0                        545446
0.debian.pool.ntp.org         450
3.debian.pool.ntp.org         448
2.debian.pool.ntp.org         444
1.debian.pool.ntp.org         443
1.0                           227
raspberrypi.local              38
_googlecast._tcp.local          6
null-null.local                 6
Name: dns.qry.name.len, dtype: int64

In [None]:
# Frequency of each distinct value in mqtt.conack.flags.
# NOTE(review): besides '0' / '0x00000000' the output contains large integers
# such as 1574358 that do not look like flag values. Verify against the raw data.
df['mqtt.conack.flags'].value_counts()

0             1280941
0.0            545673
0x00000000      83017
1574358             9
1574359             9
1461383             4
1461384             4
1461589             3
1461591             3
1461074             2
1471198             2
1471199             2
1461073             2
Name: mqtt.conack.flags, dtype: int64

In [None]:
# Frequency of each distinct value in mqtt.protoname.
df['mqtt.protoname'].value_counts()

0       1280980
0.0      545673
MQTT      83018
Name: mqtt.protoname, dtype: int64

In [None]:
# Frequency of each distinct value in mqtt.topic.
df['mqtt.topic'].value_counts()

0                           1280983
0.0                          545673
Temperature_and_Humidity      83015
Name: mqtt.topic, dtype: int64

**Apply Dummy Encoding for Text**

In [None]:
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn import preprocessing

def encode_text_dummy(df, name):
    """Replace column ``name`` of ``df`` with one indicator column per value.

    Each distinct value ``v`` found in ``df[name]`` becomes a new column
    called ``"<name>-<v>"``. The source column is then removed.

    The frame is modified in place and nothing is returned.
    """
    indicator_frame = pd.get_dummies(df[name])

    # Attach every indicator column under its prefixed label.
    for category, indicator in indicator_frame.items():
        df[f"{name}-{category}"] = indicator

    # The raw text column is no longer needed once it has been expanded.
    df.drop(columns=name, inplace=True)

# Dummy-encode each of the text columns inspected above, in this order.
# NOTE(review): the value_counts outputs list '0' and '0.0' as separate
# labels, so each column produces two distinct "zero" indicator columns
# (e.g. mqtt.protoname-0 and mqtt.protoname-0.0). Consider normalizing
# these values before encoding.
text_columns = (
    'http.request.method',
    'http.referer',
    'http.request.version',
    'dns.qry.name.len',
    'mqtt.conack.flags',
    'mqtt.protoname',
    'mqtt.topic',
)

for text_column in text_columns:
    encode_text_dummy(df, text_column)

In [None]:
# Preview the frame with the newly added indicator columns.
df.head()

Unnamed: 0,arp.opcode,arp.hw.size,icmp.checksum,icmp.seq_le,icmp.unused,http.content_length,http.response,http.tls_port,tcp.ack,tcp.ack_raw,...,mqtt.conack.flags-1471198,mqtt.conack.flags-1471199,mqtt.conack.flags-1574358,mqtt.conack.flags-1574359,mqtt.protoname-0,mqtt.protoname-0.0,mqtt.protoname-MQTT,mqtt.topic-0,mqtt.topic-0.0,mqtt.topic-Temperature_and_Humidity
1860485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87.0,1093946000.0,...,0,0,0,0,0,1,0,0,1,0
1526476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4189800000.0,...,0,0,0,0,1,0,0,1,0,0
1796404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1069571000.0,1069571000.0,...,0,0,0,0,0,1,0,0,1,0
143399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,1,0,0
859515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0,1200409000.0,...,0,0,0,0,1,0,0,1,0,0


In [None]:
# Dimensions after dummy encoding (row count unchanged, more columns).
df.shape

(1909671, 96)

**Move Attack_type to the End**

In [None]:
# Re-order the columns so that the Attack_type column is the last one.
cols = df.columns.tolist()
cols.remove('Attack_type')
df = df.loc[:, cols + ['Attack_type']]

In [None]:
# Confirm that Attack_type now appears as the final column.
df.head()

Unnamed: 0,arp.opcode,arp.hw.size,icmp.checksum,icmp.seq_le,icmp.unused,http.content_length,http.response,http.tls_port,tcp.ack,tcp.ack_raw,...,mqtt.conack.flags-1471199,mqtt.conack.flags-1574358,mqtt.conack.flags-1574359,mqtt.protoname-0,mqtt.protoname-0.0,mqtt.protoname-MQTT,mqtt.topic-0,mqtt.topic-0.0,mqtt.topic-Temperature_and_Humidity,Attack_type
1860485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87.0,1093946000.0,...,0,0,0,0,1,0,0,1,0,Password
1526476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4189800000.0,...,0,0,0,1,0,0,1,0,0,Normal
1796404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1069571000.0,1069571000.0,...,0,0,0,0,1,0,0,1,0,DDoS_TCP
143399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,1,0,0,Normal
859515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0,1200409000.0,...,0,0,0,1,0,0,1,0,0,Normal


In [None]:
# Final overview of column names and dtypes before export.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1909671 entries, 1860485 to 1293281
Data columns (total 96 columns):
 #   Column                                                                                                Dtype  
---  ------                                                                                                -----  
 0   arp.opcode                                                                                            float64
 1   arp.hw.size                                                                                           float64
 2   icmp.checksum                                                                                         float64
 3   icmp.seq_le                                                                                           float64
 4   icmp.unused                                                                                           float64
 5   http.content_length                                                    

**Export Complete Preprocessed Dataset**

In [None]:
# Write the preprocessed frame as UTF-8 CSV; index=False leaves the
# (shuffled) row index out of the file.
df.to_csv('/content/drive/MyDrive/processed_IIoT.csv', encoding='utf-8', index=False)