## IoT Botnet Detection

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from glob import glob

In [2]:
os.getcwd()

'C:\\Users\\avnee\\OneDrive\\Desktop\\IoT-botnet-attack-detection-master'

In [3]:
# Data folder path
base_directory = 'C:/Users/avnee/OneDrive/Desktop/IoT-botnet-attack-detection-master/raw_data'
file_extension = "*.csv"

In [4]:
arr= os.listdir('C:/Users/avnee/OneDrive/Desktop/IoT-botnet-attack-detection-master/raw_data')
arr

['Danmini_Doorbell',
 'Ecobee_Thermostat',
 'Ennio_Doorbell',
 'Philips_B120N10_Baby_Monitor',
 'Provision_PT_737E_Security_Camera',
 'Provision_PT_838_Security_Camera',
 'Samsung_SNH_1011_N_Webcam',
 'SimpleHome_XCS7_1002_WHT_Security_Camera',
 'SimpleHome_XCS7_1003_WHT_Security_Camera']

In [5]:
# This function will read all the bengin data and load into bengin dataframe.
def load_iot_device_data(PATH, EXT):
    """
    Creates a data frame consisting of all the .csv-files in a given directory. The directory should
    be where the unzipped data files are stored. Assumes the file structurce is
        device name
            mirai_attacks(folder)
            gafgyt_attacks(folder)
            benign_traffic.csv
    Parameters
    ----------
    PATH : str
        The directory in which the data files are stored. 
    EXT : str
        Extension of the file
        
    Returns
    -------
    benign_data : pandas data frame 
        consisting of all the bengin data.
            
    """
    benign_dfs = []
    mirai_dfs = []
    gafgyt_dfs = []
    for path, subdir, files in os.walk(PATH):
        for file in glob(os.path.join(path, EXT)):
            if 'benign_traffic' in file:
                data = pd.read_csv(file)
                data['label'] = 'Benign'
                data['device'] = file.split('\\')[1]
                benign_dfs.append(data)
            if 'mirai_attacks' in file:
                data = pd.read_csv(file)
                data['label'] = 'Mirai'
                data['device'] = file.split('\\')[1]
                mirai_dfs.append(data)
            if 'gafgyt_attacks' in file:
                data = pd.read_csv(file)
                data['label'] = 'Gafgyt'
                data['device'] = file.split('\\')[1]
                gafgyt_dfs.append(data)

    benign_data = pd.concat(benign_dfs, ignore_index=True)
    mirai_data = pd.concat(mirai_dfs, ignore_index=True)
    gafgyt_data = pd.concat(gafgyt_dfs, ignore_index=True)

    return benign_data, mirai_data, gafgyt_data

In [6]:
benign_df, mirai_df, gafgyt_df = load_iot_device_data(base_directory, file_extension)

In [7]:
type(benign_df)

pandas.core.frame.DataFrame

In [8]:
# Checking first few row of the data
benign_df.head(3)

Unnamed: 0,MI_dir_L5_weight,MI_dir_L5_mean,MI_dir_L5_variance,MI_dir_L3_weight,MI_dir_L3_mean,MI_dir_L3_variance,MI_dir_L1_weight,MI_dir_L1_mean,MI_dir_L1_variance,MI_dir_L0.1_weight,...,HpHp_L0.1_pcc,HpHp_L0.01_weight,HpHp_L0.01_mean,HpHp_L0.01_std,HpHp_L0.01_magnitude,HpHp_L0.01_radius,HpHp_L0.01_covariance,HpHp_L0.01_pcc,label,device
0,1.0,60.0,0.0,1.0,60.0,0.0,1.0,60.0,0.0,1.0,...,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,Benign,Danmini_Doorbell
1,1.0,354.0,0.0,1.0,354.0,0.0,1.0,354.0,0.0,1.0,...,0.0,5.319895,344.262695,4.710446,344.262695,22.188299,0.0,0.0,Benign,Danmini_Doorbell
2,1.857879,360.45898,35.789338,1.912127,360.275733,35.923972,1.969807,360.091968,35.991542,1.996939,...,0.0,6.318264,347.703087,9.03466,347.703087,81.625077,0.0,0.0,Benign,Danmini_Doorbell


In [9]:
# Dimention of the data
benign_df.shape

(555932, 117)

In [10]:
# Information about Dataframe
benign_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555932 entries, 0 to 555931
Columns: 117 entries, MI_dir_L5_weight to device
dtypes: float64(115), object(2)
memory usage: 496.2+ MB


In [11]:
# Individual device data count
benign_df['device'].value_counts()

Philips_B120N10_Baby_Monitor                175240
Provision_PT_838_Security_Camera             98514
Provision_PT_737E_Security_Camera            62154
Samsung_SNH_1011_N_Webcam                    52150
Danmini_Doorbell                             49548
SimpleHome_XCS7_1002_WHT_Security_Camera     46585
Ennio_Doorbell                               39100
SimpleHome_XCS7_1003_WHT_Security_Camera     19528
Ecobee_Thermostat                            13113
Name: device, dtype: int64

In [12]:
# Checking data contain any null value
benign_df.isnull().values.any()

False

In [13]:
list(benign_df.columns)

['MI_dir_L5_weight',
 'MI_dir_L5_mean',
 'MI_dir_L5_variance',
 'MI_dir_L3_weight',
 'MI_dir_L3_mean',
 'MI_dir_L3_variance',
 'MI_dir_L1_weight',
 'MI_dir_L1_mean',
 'MI_dir_L1_variance',
 'MI_dir_L0.1_weight',
 'MI_dir_L0.1_mean',
 'MI_dir_L0.1_variance',
 'MI_dir_L0.01_weight',
 'MI_dir_L0.01_mean',
 'MI_dir_L0.01_variance',
 'H_L5_weight',
 'H_L5_mean',
 'H_L5_variance',
 'H_L3_weight',
 'H_L3_mean',
 'H_L3_variance',
 'H_L1_weight',
 'H_L1_mean',
 'H_L1_variance',
 'H_L0.1_weight',
 'H_L0.1_mean',
 'H_L0.1_variance',
 'H_L0.01_weight',
 'H_L0.01_mean',
 'H_L0.01_variance',
 'HH_L5_weight',
 'HH_L5_mean',
 'HH_L5_std',
 'HH_L5_magnitude',
 'HH_L5_radius',
 'HH_L5_covariance',
 'HH_L5_pcc',
 'HH_L3_weight',
 'HH_L3_mean',
 'HH_L3_std',
 'HH_L3_magnitude',
 'HH_L3_radius',
 'HH_L3_covariance',
 'HH_L3_pcc',
 'HH_L1_weight',
 'HH_L1_mean',
 'HH_L1_std',
 'HH_L1_magnitude',
 'HH_L1_radius',
 'HH_L1_covariance',
 'HH_L1_pcc',
 'HH_L0.1_weight',
 'HH_L0.1_mean',
 'HH_L0.1_std',
 'HH_L0.1

### Mirai data

In [14]:
type(mirai_df)

pandas.core.frame.DataFrame

In [15]:
mirai_df.shape

(3845085, 117)

In [16]:
mirai_df.head()

Unnamed: 0,MI_dir_L5_weight,MI_dir_L5_mean,MI_dir_L5_variance,MI_dir_L3_weight,MI_dir_L3_mean,MI_dir_L3_variance,MI_dir_L1_weight,MI_dir_L1_mean,MI_dir_L1_variance,MI_dir_L0.1_weight,...,HpHp_L0.1_pcc,HpHp_L0.01_weight,HpHp_L0.01_mean,HpHp_L0.01_std,HpHp_L0.01_magnitude,HpHp_L0.01_radius,HpHp_L0.01_covariance,HpHp_L0.01_pcc,label,device
0,1.0,98.0,0.0,1.0,98.0,0.0,1.0,98.0,0.0,1.0,...,0.0,1.0,98.0,0.0,98.0,0.0,0.0,0.0,Mirai,Danmini_Doorbell
1,1.029,98.0,1.818989e-12,1.11952,98.0,0.0,1.492583,98.0,3.637979e-12,1.93164,...,0.0,1.992944,98.0,1e-06,138.592929,1.818989e-12,0.0,0.0,Mirai,Danmini_Doorbell
2,1.504156,76.725612,228.1808,1.729662,79.499272,249.746357,2.294102,84.051188,251.7926,2.904273,...,0.0,1.0,66.0,0.0,114.856432,0.0,0.0,0.0,Mirai,Danmini_Doorbell
3,2.460087,75.617679,137.22,2.699075,77.461807,164.269331,3.280499,80.987267,196.4467,3.902546,...,0.0,1.0,74.0,0.0,74.0,0.0,0.0,0.0,Mirai,Danmini_Doorbell
4,3.460055,75.150149,98.09937,3.699054,76.525944,122.224798,4.28049,79.354915,159.2943,4.902545,...,0.0,1.0,74.0,0.0,74.0,0.0,0.0,0.0,Mirai,Danmini_Doorbell


In [17]:
mirai_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3845085 entries, 0 to 3845084
Columns: 117 entries, MI_dir_L5_weight to device
dtypes: float64(115), object(2)
memory usage: 3.4+ GB


In [18]:
mirai_df['device'].value_counts()

Philips_B120N10_Baby_Monitor                610714
SimpleHome_XCS7_1003_WHT_Security_Camera    514860
SimpleHome_XCS7_1002_WHT_Security_Camera    513248
Ecobee_Thermostat                           512133
Ennio_Doorbell                              512133
Provision_PT_737E_Security_Camera           436010
Provision_PT_838_Security_Camera            429337
Danmini_Doorbell                            316650
Name: device, dtype: int64

In [19]:
# Checking data contain any null value
mirai_df.isnull().values.any()

False

### Gafgyt Data

In [20]:
type(gafgyt_df)

pandas.core.frame.DataFrame

In [21]:
gafgyt_df.head(3)

Unnamed: 0,MI_dir_L5_weight,MI_dir_L5_mean,MI_dir_L5_variance,MI_dir_L3_weight,MI_dir_L3_mean,MI_dir_L3_variance,MI_dir_L1_weight,MI_dir_L1_mean,MI_dir_L1_variance,MI_dir_L0.1_weight,...,HpHp_L0.1_pcc,HpHp_L0.01_weight,HpHp_L0.01_mean,HpHp_L0.01_std,HpHp_L0.01_magnitude,HpHp_L0.01_radius,HpHp_L0.01_covariance,HpHp_L0.01_pcc,label,device
0,1.0,98.0,0.0,1.0,98.0,0.0,1.0,98.0,0.0,1.0,...,0.0,1.0,98.0,0.0,98.0,0.0,0.0,0.0,Gafgyt,Danmini_Doorbell
1,1.029,98.0,1.818989e-12,1.11952,98.0,0.0,1.492583,98.0,3.637979e-12,1.93164,...,0.0,1.992944,98.0,1e-06,138.592929,1.818989e-12,0.0,0.0,Gafgyt,Danmini_Doorbell
2,1.504156,76.725612,228.1808,1.729662,79.499272,249.746357,2.294102,84.051188,251.7926,2.904273,...,0.0,1.0,66.0,0.0,114.856432,0.0,0.0,0.0,Gafgyt,Danmini_Doorbell


In [22]:
gafgyt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1895539 entries, 0 to 1895538
Columns: 117 entries, MI_dir_L5_weight to device
dtypes: float64(115), object(2)
memory usage: 1.7+ GB


In [23]:
gafgyt_df.shape

(1895539, 117)

In [24]:
gafgyt_df['device'].value_counts()

Provision_PT_737E_Security_Camera    330096
Danmini_Doorbell                     316650
Ennio_Doorbell                       316400
Philips_B120N10_Baby_Monitor         312723
Ecobee_Thermostat                    310630
Provision_PT_838_Security_Camera     309040
Name: device, dtype: int64

In [26]:
gafgyt_df.isnull().values.any()

False

### Combin all data into one CSV File

In [27]:
dfs = [benign_df, mirai_df, gafgyt_df]

In [28]:
df = pd.concat(dfs)

In [29]:
df.head()

Unnamed: 0,MI_dir_L5_weight,MI_dir_L5_mean,MI_dir_L5_variance,MI_dir_L3_weight,MI_dir_L3_mean,MI_dir_L3_variance,MI_dir_L1_weight,MI_dir_L1_mean,MI_dir_L1_variance,MI_dir_L0.1_weight,...,HpHp_L0.1_pcc,HpHp_L0.01_weight,HpHp_L0.01_mean,HpHp_L0.01_std,HpHp_L0.01_magnitude,HpHp_L0.01_radius,HpHp_L0.01_covariance,HpHp_L0.01_pcc,label,device
0,1.0,60.0,0.0,1.0,60.0,0.0,1.0,60.0,0.0,1.0,...,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,Benign,Danmini_Doorbell
1,1.0,354.0,0.0,1.0,354.0,0.0,1.0,354.0,0.0,1.0,...,0.0,5.319895,344.262695,4.710446,344.262695,22.188299,0.0,0.0,Benign,Danmini_Doorbell
2,1.857879,360.45898,35.789338,1.912127,360.275733,35.923972,1.969807,360.091968,35.991542,1.996939,...,0.0,6.318264,347.703087,9.03466,347.703087,81.625077,0.0,0.0,Benign,Danmini_Doorbell
3,1.0,337.0,0.0,1.0,337.0,0.0,1.0,337.0,0.0,1.0,...,0.0,1.0,337.0,0.0,337.0,0.0,0.0,0.0,Benign,Danmini_Doorbell
4,1.680223,172.140917,18487.44875,1.79358,182.560279,18928.1753,1.925828,193.165753,19153.79581,1.992323,...,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,Benign,Danmini_Doorbell


In [30]:
df.sample(10)

Unnamed: 0,MI_dir_L5_weight,MI_dir_L5_mean,MI_dir_L5_variance,MI_dir_L3_weight,MI_dir_L3_mean,MI_dir_L3_variance,MI_dir_L1_weight,MI_dir_L1_mean,MI_dir_L1_variance,MI_dir_L0.1_weight,...,HpHp_L0.1_pcc,HpHp_L0.01_weight,HpHp_L0.01_mean,HpHp_L0.01_std,HpHp_L0.01_magnitude,HpHp_L0.01_radius,HpHp_L0.01_covariance,HpHp_L0.01_pcc,label,device
721204,160.32511,303.846223,60999.05,239.998449,321.437576,60800.56,659.754845,381.588669,55445.51817,6549.568571,...,0.0,1.0,554.0,0.0,554.0,0.0,0.0,0.0,Mirai,Ecobee_Thermostat
3113584,166.188993,375.791733,56276.7,246.891207,410.413506,50314.65,671.351182,436.390853,44266.998569,6177.807133,...,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,Mirai,SimpleHome_XCS7_1002_WHT_Security_Camera
186288,1.043036,66.0,1.82e-12,1.151561,66.0,2.64e-08,1.614444,66.004526,0.103274,10.118946,...,2.25e-09,18.972952,66.000799,0.097899,115.779841,229.3482,-0.002787989,-0.00188,Benign,Philips_B120N10_Baby_Monitor
1526198,134.379207,60.008925,0.1337945,154.929177,60.021265,0.3185232,293.192253,60.030871,0.46221,2337.759301,...,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,Mirai,Philips_B120N10_Baby_Monitor
1200856,153.922215,432.700655,45199.14,251.584247,417.328045,48813.86,669.407898,418.513076,48538.810472,6464.435908,...,0.0,1.0,554.0,0.0,554.0,0.0,0.0,0.0,Mirai,Ennio_Doorbell
96422,1.0,60.000003,8.06e-05,1.000092,60.002749,0.08247508,1.093785,62.57231,70.552528,3.639087,...,0.0,4.427358,60.0,0.0,84.852814,9.09e-13,3.99e-29,0.0,Benign,Ennio_Doorbell
1882564,112.019526,319.110732,60862.33,188.438884,313.247289,60969.97,538.946236,325.806865,60654.908605,5003.402316,...,0.0,9541.363074,554.0,3.7e-05,554.0,1.396984e-09,0.0,0.0,Mirai,Philips_B120N10_Baby_Monitor
1255271,99.574914,450.703789,40358.22,174.991384,422.309877,47712.63,500.790988,400.234926,52316.244977,3654.733183,...,0.0,4667.383147,554.0,1.7e-05,554.0,2.910383e-10,0.0,0.0,Mirai,Ennio_Doorbell
1081487,104.268081,73.101524,11.77141,183.29527,72.117398,22.81223,639.922757,71.054282,32.5628,7040.355215,...,0.0,1.0,74.0,0.0,74.0,0.0,0.0,0.0,Mirai,Ennio_Doorbell
662580,165.154846,74.005883,0.1411569,275.118864,74.014123,0.3387567,816.522022,74.030556,0.966024,7739.517653,...,0.0,1.701902,74.0,0.0,95.268043,0.0,0.0,0.0,Gafgyt,Ennio_Doorbell


In [31]:
df.shape

(6296556, 117)

In [32]:
sample_data = df.sample(500)

In [33]:
sample_data.to_csv('sample.csv')