# IoT Intrusion Detection

The N-BaIoT Dataset contains traffic data for 9 IoT devices. The data comprise of both benign traffic and of a variety of malicious attacks. Here we run three deep neural networks to identify cyberattacks on a Provision PT-737E Security Camera.

In [1]:
import numpy as np
import pandas as pd

In [27]:
benign=pd.read_csv('../input/nbaiot-dataset/5.benign.csv')
g_c=pd.read_csv('../input/nbaiot-dataset/5.gafgyt.combo.csv')
g_j=pd.read_csv('../input/nbaiot-dataset/5.gafgyt.junk.csv')
g_s=pd.read_csv('../input/nbaiot-dataset/5.gafgyt.scan.csv')
g_t=pd.read_csv('../input/nbaiot-dataset/5.gafgyt.tcp.csv')
g_u=pd.read_csv('../input/nbaiot-dataset/5.gafgyt.udp.csv')
m_a=pd.read_csv('../input/nbaiot-dataset/5.mirai.ack.csv')
m_sc=pd.read_csv('../input/nbaiot-dataset/5.mirai.scan.csv')
m_sy=pd.read_csv('../input/nbaiot-dataset/5.mirai.syn.csv')
m_u=pd.read_csv('../input/nbaiot-dataset/5.mirai.udp.csv')
m_u_p=pd.read_csv('../input/nbaiot-dataset/5.mirai.udpplain.csv')

benign=benign.sample(frac=0.25,replace=False)
g_c=g_c.sample(frac=0.25,replace=False)
g_j=g_j.sample(frac=0.5,replace=False)
g_s=g_s.sample(frac=0.5,replace=False)
g_t=g_t.sample(frac=0.15,replace=False)
g_u=g_u.sample(frac=0.15,replace=False)
m_a=m_a.sample(frac=0.25,replace=False)
m_sc=m_sc.sample(frac=0.15,replace=False)
m_sy=m_sy.sample(frac=0.25,replace=False)
m_u=m_u.sample(frac=0.1,replace=False)
m_u_p=m_u_p.sample(frac=0.27,replace=False)

benign['type']='benign'
m_u['type']='mirai_udp'
g_c['type']='gafgyt_combo'
g_j['type']='gafgyt_junk'
g_s['type']='gafgyt_scan'
g_t['type']='gafgyt_tcp'
g_u['type']='gafgyt_udp'
m_a['type']='mirai_ack'
m_sc['type']='mirai_scan'
m_sy['type']='mirai_syn'
m_u_p['type']='mirai_udpplain'

data=pd.concat([benign,m_u,g_c,g_j,g_s,g_t,g_u,m_a,m_sc,m_sy,m_u_p],
               axis=0, sort=False, ignore_index=True)

In [28]:
#how many instances of each class
data.groupby('type')['type'].count()

type
benign            15538
gafgyt_combo      15345
gafgyt_junk       15449
gafgyt_scan       14648
gafgyt_tcp        15676
gafgyt_udp        15602
mirai_ack         15138
mirai_scan        14517
mirai_syn         16436
mirai_udp         15625
mirai_udpplain    15304
Name: type, dtype: int64

In [29]:
#shuffle rows of dataframe 
sampler=np.random.permutation(len(data))
data=data.take(sampler)
data.head()

Unnamed: 0,MI_dir_L5_weight,MI_dir_L5_mean,MI_dir_L5_variance,MI_dir_L3_weight,MI_dir_L3_mean,MI_dir_L3_variance,MI_dir_L1_weight,MI_dir_L1_mean,MI_dir_L1_variance,MI_dir_L0.1_weight,...,HpHp_L0.1_covariance,HpHp_L0.1_pcc,HpHp_L0.01_weight,HpHp_L0.01_mean,HpHp_L0.01_std,HpHp_L0.01_magnitude,HpHp_L0.01_radius,HpHp_L0.01_covariance,HpHp_L0.01_pcc,type
60031,178.995599,74.050143,1.200907,290.819612,74.05109,1.22355,842.04511,74.046596,1.30069,7133.068383,...,0.0,0.0,1.0,74.0,0.0,74.0,0.0,0.0,0.0,gafgyt_junk
64001,101.851537,74.224578,1.895037,145.156551,74.201216,2.085855,235.881182,74.232467,4.439082,1145.050353,...,0.0,0.0,2.898556,74.0,1e-06,74.0,1.818989e-12,0.0,0.0,gafgyt_scan
396,1.0,89.999995,0.000142,1.000083,89.997512,0.074627,1.044969,88.725856,36.314277,3.859209,...,-1.4100000000000001e-22,-8.19e-14,4.144635,88.766027,5.957895,107.166871,35.49662,-0.051091,-0.0291,benign
160761,117.712393,241.659409,56738.482239,148.313014,268.062774,59487.822578,327.181582,298.674125,60918.005347,3168.853599,...,0.0,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0,mirai_udpplain
16788,72.610169,396.720117,52959.175999,132.006313,366.940755,57414.813814,385.989764,350.781922,59080.172651,3912.156872,...,0.0,0.0,1.0,554.0,0.0,554.0,0.0,0.0,0.0,mirai_udp


In [30]:
#dummy encode labels, store separately
labels_full=pd.get_dummies(data['type'], prefix='type')
labels_full.head()

Unnamed: 0,type_benign,type_gafgyt_combo,type_gafgyt_junk,type_gafgyt_scan,type_gafgyt_tcp,type_gafgyt_udp,type_mirai_ack,type_mirai_scan,type_mirai_syn,type_mirai_udp,type_mirai_udpplain
60031,0,0,1,0,0,0,0,0,0,0,0
64001,0,0,0,1,0,0,0,0,0,0,0
396,1,0,0,0,0,0,0,0,0,0,0
160761,0,0,0,0,0,0,0,0,0,0,1
16788,0,0,0,0,0,0,0,0,0,1,0


In [31]:
#drop labels from training dataset
data=data.drop(columns='type')
data.head()

Unnamed: 0,MI_dir_L5_weight,MI_dir_L5_mean,MI_dir_L5_variance,MI_dir_L3_weight,MI_dir_L3_mean,MI_dir_L3_variance,MI_dir_L1_weight,MI_dir_L1_mean,MI_dir_L1_variance,MI_dir_L0.1_weight,...,HpHp_L0.1_radius,HpHp_L0.1_covariance,HpHp_L0.1_pcc,HpHp_L0.01_weight,HpHp_L0.01_mean,HpHp_L0.01_std,HpHp_L0.01_magnitude,HpHp_L0.01_radius,HpHp_L0.01_covariance,HpHp_L0.01_pcc
60031,178.995599,74.050143,1.200907,290.819612,74.05109,1.22355,842.04511,74.046596,1.30069,7133.068383,...,0.0,0.0,0.0,1.0,74.0,0.0,74.0,0.0,0.0,0.0
64001,101.851537,74.224578,1.895037,145.156551,74.201216,2.085855,235.881182,74.232467,4.439082,1145.050353,...,9.094947e-13,0.0,0.0,2.898556,74.0,1e-06,74.0,1.818989e-12,0.0,0.0
396,1.0,89.999995,0.000142,1.000083,89.997512,0.074627,1.044969,88.725856,36.314277,3.859209,...,6.55e-06,-1.4100000000000001e-22,-8.19e-14,4.144635,88.766027,5.957895,107.166871,35.49662,-0.051091,-0.0291
160761,117.712393,241.659409,56738.482239,148.313014,268.062774,59487.822578,327.181582,298.674125,60918.005347,3168.853599,...,0.0,0.0,0.0,1.0,60.0,0.0,60.0,0.0,0.0,0.0
16788,72.610169,396.720117,52959.175999,132.006313,366.940755,57414.813814,385.989764,350.781922,59080.172651,3912.156872,...,0.0,0.0,0.0,1.0,554.0,0.0,554.0,0.0,0.0,0.0


In [32]:
#standardize numerical columns
def standardize(df,col):
    df[col]= (df[col]-df[col].mean())/df[col].std()

data_st=data.copy()
for i in (data_st.iloc[:,:-1].columns):
    standardize (data_st,i)

data_st.head()

Unnamed: 0,MI_dir_L5_weight,MI_dir_L5_mean,MI_dir_L5_variance,MI_dir_L3_weight,MI_dir_L3_mean,MI_dir_L3_variance,MI_dir_L1_weight,MI_dir_L1_mean,MI_dir_L1_variance,MI_dir_L0.1_weight,...,HpHp_L0.1_radius,HpHp_L0.1_covariance,HpHp_L0.1_pcc,HpHp_L0.01_weight,HpHp_L0.01_mean,HpHp_L0.01_std,HpHp_L0.01_magnitude,HpHp_L0.01_radius,HpHp_L0.01_covariance,HpHp_L0.01_pcc
60031,1.59945,-0.507971,-0.604957,1.712892,-0.53279,-0.626508,1.874677,-0.561921,-0.643784,1.894803,...,-0.100527,-0.098394,-0.0714,-0.1991,-0.384781,-0.129166,-0.406459,-0.099967,-0.095797,0.0
64001,0.407907,-0.506628,-0.604927,0.280967,-0.531579,-0.626474,-0.282697,-0.560348,-0.643667,-0.651224,...,-0.100527,-0.098394,-0.0714,-0.198116,-0.384781,-0.129166,-0.406459,-0.099967,-0.095797,0.0
396,-1.149815,-0.385175,-0.605008,-1.136147,-0.404194,-0.626554,-1.118493,-0.437698,-0.642472,-1.136444,...,-0.100527,-0.098394,-0.0714,-0.197469,-0.300168,0.061202,-0.218981,-0.097936,-0.095812,-0.0291
160761,0.652889,0.782431,1.81822,0.311997,1.031775,1.750574,0.042247,1.338988,1.639342,0.209271,...,-0.100527,-0.098394,-0.0714,-0.1991,-0.465006,-0.129166,-0.485594,-0.099967,-0.095797,0.0
16788,-0.043747,1.976223,1.656811,0.151695,1.829155,1.667737,0.251548,1.77995,1.570461,0.525314,...,-0.100527,-0.098394,-0.0714,-0.1991,2.365759,-0.129166,2.306766,-0.099967,-0.095797,0.0


In [33]:
#training data for the neural net
train_data_st=data_st.values
train_data_st

array([[ 1.59945016, -0.50797071, -0.60495687, ..., -0.09996714,
        -0.09579688,  0.        ],
       [ 0.40790665, -0.50662775, -0.60492723, ..., -0.09996714,
        -0.09579688,  0.        ],
       [-1.14981525, -0.38517491, -0.60500815, ..., -0.09793573,
        -0.09581209, -0.02910016],
       ...,
       [ 1.60525428,  0.9524347 ,  2.02359887, ..., -0.09996714,
        -0.09579688,  0.        ],
       [-1.14981525, -0.61614089, -0.60500816, ..., -0.09996714,
        -0.09579688,  0.        ],
       [-0.9177174 ,  0.26633739,  1.59479183, ...,  4.30946084,
         6.58235836,  0.41975018]])

In [34]:
#labels for training
labels=labels_full.values
labels

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=uint8)

### Keras model

In [35]:
#import libraries
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping


# test/train split  25% test
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(
    train_data_st, labels, test_size=0.25, random_state=42)

#  create and fit model
model = Sequential()
model.add(Dense(10, input_dim=train_data_st.shape[1], activation='relu'))
model.add(Dense(40, input_dim=train_data_st.shape[1], activation='relu'))
model.add(Dense(10, input_dim=train_data_st.shape[1], activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(labels.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto')
model.fit(x_train_st,y_train_st,validation_data=(x_test_st,y_test_st),
          callbacks=[monitor],verbose=2,epochs=500)

Epoch 1/500
3968/3968 - 6s - loss: 1.2674 - val_loss: 0.9334
Epoch 2/500
3968/3968 - 5s - loss: 0.7948 - val_loss: 0.7126
Epoch 3/500
3968/3968 - 5s - loss: 0.6303 - val_loss: 0.5619
Epoch 4/500
3968/3968 - 5s - loss: 0.5403 - val_loss: 0.4992
Epoch 5/500
3968/3968 - 5s - loss: 0.4787 - val_loss: 0.4690
Epoch 6/500
3968/3968 - 6s - loss: 0.4314 - val_loss: 0.4915
Epoch 7/500
3968/3968 - 5s - loss: 0.4025 - val_loss: 0.4030
Epoch 8/500
3968/3968 - 5s - loss: 0.3806 - val_loss: 0.4125
Epoch 9/500
3968/3968 - 5s - loss: 0.3598 - val_loss: 0.3538
Epoch 10/500
3968/3968 - 5s - loss: 0.3487 - val_loss: 0.4265
Epoch 11/500
3968/3968 - 5s - loss: 0.3412 - val_loss: 0.3331
Epoch 12/500
3968/3968 - 5s - loss: 0.3322 - val_loss: 0.3347
Epoch 13/500
3968/3968 - 5s - loss: 0.3227 - val_loss: 0.3229
Epoch 14/500
3968/3968 - 5s - loss: 0.3154 - val_loss: 0.3312
Epoch 15/500
3968/3968 - 6s - loss: 0.3102 - val_loss: 0.3388
Epoch 16/500
3968/3968 - 6s - loss: 0.3017 - val_loss: 0.3154
Epoch 17/500
3968

<tensorflow.python.keras.callbacks.History at 0x7fe9581cba50>

In [36]:
# metrics
pred_st = model.predict(x_test_st)
pred_st = np.argmax(pred_st,axis=1)
y_eval_st = np.argmax(y_test_st,axis=1)
score_st = metrics.accuracy_score(y_eval_st, pred_st)
print("accuracy: {}".format(score_st))

accuracy: 0.9005907372400757


In [39]:
#second model
model2 = Sequential()
model2.add(Dense(32, input_dim=train_data_st.shape[1], activation='relu'))
model2.add(Dense(72, input_dim=train_data_st.shape[1], activation='relu'))
model2.add(Dense(32, input_dim=train_data_st.shape[1], activation='relu'))
model2.add(Dense(1, kernel_initializer='normal'))
model2.add(Dense(labels.shape[1],activation='softmax'))
model2.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=1, mode='auto')
model2.fit(x_train_st,y_train_st,validation_data=(x_test_st,y_test_st),
          callbacks=[monitor],verbose=2,epochs=100)

Epoch 1/100
3968/3968 - 6s - loss: 1.1276 - val_loss: 0.8256
Epoch 2/100
3968/3968 - 6s - loss: 0.7210 - val_loss: 0.6107
Epoch 3/100
3968/3968 - 6s - loss: 0.5550 - val_loss: 0.4708
Epoch 4/100
3968/3968 - 5s - loss: 0.4604 - val_loss: 0.4115
Epoch 5/100
3968/3968 - 6s - loss: 0.3906 - val_loss: 0.3522
Epoch 6/100
3968/3968 - 6s - loss: 0.3606 - val_loss: 0.3247
Epoch 7/100
3968/3968 - 6s - loss: 0.3429 - val_loss: 0.3278
Epoch 8/100
3968/3968 - 6s - loss: 0.3107 - val_loss: 0.2856
Epoch 9/100
3968/3968 - 6s - loss: 0.2837 - val_loss: 0.2830
Epoch 10/100
3968/3968 - 6s - loss: 0.2680 - val_loss: 0.2414
Epoch 11/100
3968/3968 - 6s - loss: 0.2535 - val_loss: 0.2352
Epoch 12/100
3968/3968 - 6s - loss: 0.2162 - val_loss: 0.2018
Epoch 13/100
3968/3968 - 6s - loss: 0.2297 - val_loss: 0.2170
Epoch 14/100
3968/3968 - 6s - loss: 0.2098 - val_loss: 0.3826
Epoch 15/100
3968/3968 - 6s - loss: 0.1917 - val_loss: 0.2199
Epoch 16/100
3968/3968 - 6s - loss: 0.2061 - val_loss: 0.3002
Epoch 17/100
3968

<tensorflow.python.keras.callbacks.History at 0x7fe95a786310>

In [40]:
# metrics
pred_st = model2.predict(x_test_st)
pred_st = np.argmax(pred_st,axis=1)
y_eval_st = np.argmax(y_test_st,axis=1)
score_st = metrics.accuracy_score(y_eval_st, pred_st)
print("accuracy: {}".format(score_st))

accuracy: 0.8926748582230624
