# Multi-class Classification with DNS data, Combined Dataset and K-Fold Cross Validation

This model will use the combined dataset and predict specific IoT devices.  The following parameters apply:

 - Using DNS
 - Dropping srcport and dstport 
 - Use categorical cross entropy




Modified version created on Google drive

In [43]:
from pathlib import Path
import os
import re
import apsw
import pandas as pd
import time

# The fun starts here...

In [44]:
cwd = Path.cwd().parent.parent
print(cwd)

/home/ricdeez/uni/projects/iotnetlearn


In [45]:
dbs = [
    os.path.join(cwd, 'db', 'db01', 'NetCollector.sqlite'),
    os.path.join(cwd, 'db', 'db02', 'NetCollector.sqlite'),
    os.path.join(cwd, 'db', 'db03', 'NetCollector.sqlite'),
]

In [46]:
from sqlalchemy import create_engine
df1 = None
df2 = None
df3 = None
for idx, db_path in enumerate(dbs):
    engine = create_engine(f'sqlite:////{db_path}')

    print(engine)

    sql = """

    select d.srcPort as srcport, 
           d.dstPort as dstport, 
           sum(d.frameSize) as totalframesize, 
           min(d.frameSize) as minframesize,
           max(d.frameSize) as maxframesize,
           avg(d.frameTTL) as framettl,
           dns.dnsquery,
           d.manufacturer,
           d.device_type
    from deviceLog d inner join dnsqueries dns on d.frameNum = dns.pktnum
    where d.device_type is not null
    group by d.srcAddr, d.dstAddr, d.srcPort, d.dstPort

    """

    if idx == 0:
        df1 = pd.read_sql_query(sql, engine)
    elif idx == 1:
        df2 = pd.read_sql_query(sql, engine)
    elif idx == 2:
        df3 = pd.read_sql_query(sql, engine)


Engine(sqlite://///home/ricdeez/uni/projects/iotnetlearn/db/db01/NetCollector.sqlite)
Engine(sqlite://///home/ricdeez/uni/projects/iotnetlearn/db/db02/NetCollector.sqlite)
Engine(sqlite://///home/ricdeez/uni/projects/iotnetlearn/db/db03/NetCollector.sqlite)


In [47]:
df1 = pd.concat([df1, df2], axis='rows', ignore_index=True)
df1 = pd.concat([df1, df3], axis='rows', ignore_index=True)
df = df1
df.describe()


Unnamed: 0,totalframesize,minframesize,maxframesize,framettl
count,32799.0,32799.0,32799.0,32799.0
mean,164.587701,84.173207,85.463063,92.856752
std,3267.692046,8.832032,9.917332,48.956212
min,65.0,65.0,65.0,1.0
25%,81.0,78.0,78.0,64.0
50%,91.0,81.0,81.0,64.0
75%,146.0,91.0,93.0,128.0
max,481258.0,182.0,366.0,255.0


In [48]:
from keras.preprocessing.text import Tokenizer
dns = df['dnsquery']

tk = Tokenizer()
tk.fit_on_texts(dns)
num_words = 10
tk.word_index = {e:i for e,i in tk.word_index.items() if i < num_words} 

encoded_dns=tk.texts_to_matrix(dns, mode='binary')
encoded_dns.shape

cols = [f'word_idx{i+1}' for i in range(num_words)]

df2 = pd.DataFrame(data=encoded_dns, columns=cols)
df = pd.concat([df, df2], axis='columns')


In [49]:
df.head()

Unnamed: 0,srcport,dstport,totalframesize,minframesize,maxframesize,framettl,dnsquery,manufacturer,device_type,word_idx1,word_idx2,word_idx3,word_idx4,word_idx5,word_idx6,word_idx7,word_idx8,word_idx9,word_idx10
0,5353,5353,13758,70,82,1.0,_googlecast._tcp.local.,Rivet Networks,Other,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5353,5353,481258,82,132,255.0,chromecast-audio-46eabec7bd7a728a79fb6231c4ae5...,"Google, Inc.",Chromecast-Audio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5353,5353,1891,120,127,255.0,_00000000-06a1-86ad-4c60-2ffbf090480e._sub._ho...,"Apple, Inc.",Apple TV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5353,5353,882,104,190,255.0,_sleep-proxy._udp.local.,"Apple, Inc.",Other,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5353,5353,8755,74,190,250.637931,_sleep-proxy._udp.local.,"Apple, Inc.",Other,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Skipping this bit

We will not be processing the next cell for the model with the DNS data as most
srcport and dstport entries will be either 53 or 5353.

In [50]:
"""
import numpy as np

df['is_iot'] = np.where(df['device_type'] == 'Other', 0, 1)
ports = ['53', '5353', '443', '5000', '49152', '80', '8080', '123', '49153']
features = ['dstport', 'srcport']
for feature in features:
    for port in ports:
        exec('df[\'{}_is_{}\'] = np.where(df[\'{}\'] == \'{}\', 1, 0)'.format(
                feature, port, feature, port))
"""

"\nimport numpy as np\n\ndf['is_iot'] = np.where(df['device_type'] == 'Other', 0, 1)\nports = ['53', '5353', '443', '5000', '49152', '80', '8080', '123', '49153']\nfeatures = ['dstport', 'srcport']\nfor feature in features:\n    for port in ports:\n        exec('df['{}_is_{}'] = np.where(df['{}'] == '{}', 1, 0)'.format(\n                feature, port, feature, port))\n"

In [51]:
drop_columns = ['srcport', 'dstport']
df.drop(drop_columns, axis='columns', inplace=True)

In [52]:
df.drop('dnsquery', axis='columns', inplace=True)

## Categorical Data Mapping

For the deep learning algorithm to work, we need to get rid of all categorical data.  For the Manufacturer's we will create a 1:1 mapping of the manufacturer name as per the wireshark OUI lookup dataset and the relative position of that in our ordered array of unique entries

In [53]:
mapping = {k: v for v, k in enumerate((x for x in df['manufacturer'].unique() if len(x)>0),1)}
print(mapping)

df['manufacturer'] = df['manufacturer'].map(mapping)


{'Rivet Networks': 1, 'Google, Inc.': 2, 'Apple, Inc.': 3, 'Technicolor CH USA Inc.': 4, 'Microsoft Corporation': 5, 'Intel Corporate': 6, 'Realtek Semiconductor Corp.': 7, 'Pegatron Corporation': 8, 'Ubiquiti Networks Inc.': 9, 'Sony Corporation': 10, 'ASUSTek COMPUTER INC.': 11, 'Microsoft': 12, 'Samsung Electro-Mechanics(Thailand)': 13, 'Netatmo': 14, 'Lifi Labs Management Pty Ltd': 15, 'Invoxia': 16, 'Withings': 17, 'Shenzhen Reecam Tech.Ltd.': 18, 'Belkin International Inc.': 19, 'Physical Graph Corporation': 20, 'AzureWave Technology Inc.': 21, 'Samsung Electronics Co.,Ltd': 22, 'Amazon Technologies Inc.': 23, 'HTC Corporation': 24}


## MinMax Scaling for Numerical Features

We now apply minmax scaling to each feature as per the formula below:

\begin{align}
\dot{x\tiny{i}} & = \frac{x\tiny{i} \small- min(X)}{max(X) - min(X)} \Large{\forall} \normalsize{x}\tiny{i} \small\in X
\end{align}



In [54]:
features = ['totalframesize', 'minframesize', 'maxframesize', 'framettl']
for feature in features:
    df[feature] = (df[feature] - df[feature].min()) / \
        (df[feature].max() - df[feature].min())

We will initially train the model with 10% of the data to see how well it performs.  Once we have tuned it a bit, we will repeat with the rest of the data.

In [55]:
df_sample = df.sample(frac=0.1)
df_sample.describe()

Unnamed: 0,totalframesize,minframesize,maxframesize,framettl,manufacturer,word_idx1,word_idx2,word_idx3,word_idx4,word_idx5,word_idx6,word_idx7,word_idx8,word_idx9,word_idx10
count,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0,3280.0
mean,0.000428,0.164613,0.068038,0.355664,9.88628,0.0,0.850305,0.358537,0.199085,0.199085,0.199085,0.19878,0.157622,0.125305,0.102134
std,0.009389,0.076225,0.032889,0.185145,3.404094,0.0,0.356827,0.479644,0.399373,0.399373,0.399373,0.399143,0.364442,0.331115,0.302871
min,6e-06,0.008547,0.009967,0.248031,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.3e-05,0.111111,0.046512,0.248031,9.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.4e-05,0.136752,0.056478,0.248031,9.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.000164,0.222222,0.089701,0.5,9.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.420091,1.0,0.734219,1.0,24.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Machine learning - session 1

Firstly, we will try to see if we can converge our model, we'll split the dataset as follows:

1. Training set (80%)
2. Test set (20%)
3. We have not set up a validation set as we will continue to validate the model against new captures

The first model will be pretty basic and will only be able to `predict` if session data corresponds to an IoT device or not.

Later runs will refine the model to try to predict the device name from the data that we pass to it.  The DNN will be compriosed of the following layers:

[Input Layer: x inputs] -> [Hidden layer 1: 32 nodes] -> [Hidden layer 2: 64 nodes] -> [Hidden layer 3: 32 nodes] -> [Output layer: 1 output]

the `relu` activation function defined as f(x) = max(0, x) will be used for each layer apart from the last layer which will use the sigmoid function.  The adam optimiser will be used and the loss function will be the binary_crossentropy which is best for binary classification problems.

The Number of inputs on the Input layer is determined by the number of words that we keep for the DNS word bag


In [56]:
from sklearn.model_selection import train_test_split


In [57]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

We drop the device_type column from the main dataframe, df to generate our set of features.  We generate labels by only indexing the device_type column.   By calling out values, we drop the headers and turn the data into numpy arrays.

In [70]:
# features = df.drop('device_type', axis=1).values
# labels = df['device_type'].values

features = df_sample.drop('device_type', axis=1).values
labels = df_sample['device_type'].values


In [71]:
features

array([[2.22363999e-04, 9.40170940e-02, 1.02990033e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.90943551e-05, 1.19658120e-01, 4.65116279e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.32506915e-05, 1.36752137e-01, 5.31561462e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.01582317e-04, 1.36752137e-01, 5.31561462e-02, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.01582317e-04, 1.36752137e-01, 5.31561462e-02, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.90943551e-05, 1.19658120e-01, 4.65116279e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [74]:
df

Unnamed: 0,totalframesize,minframesize,maxframesize,framettl,manufacturer,device_type,word_idx1,word_idx2,word_idx3,word_idx4,word_idx5,word_idx6,word_idx7,word_idx8,word_idx9,word_idx10
0,0.028456,0.042735,0.056478,0.000000,1,Other,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.000000,0.145299,0.222591,1.000000,2,Chromecast-Audio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.003795,0.470085,0.205980,1.000000,3,Apple TV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.001698,0.333333,0.415282,1.000000,3,Other,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.018059,0.076923,0.415282,0.982827,3,Other,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32794,0.000037,0.153846,0.059801,0.248031,22,Samsung SmartCam,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
32795,0.000037,0.153846,0.059801,0.248031,22,Samsung SmartCam,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
32796,0.000210,0.153846,0.059801,0.248031,22,Samsung SmartCam,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
32797,0.000172,0.076923,0.029900,0.248031,22,Samsung SmartCam,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
encoder = LabelEncoder()
encoder.fit(labels)
encoded_labels = encoder.transform(labels)

In [79]:
dummy_labels = np_utils.to_categorical(encoded_labels)
dummy_labels.shape

(3280, 16)

In [80]:
def create_model():
  model = Sequential([
    Dense(16, input_dim=features.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(dummy_labels.shape[1], activation='softmax')
  ])
  model.compile(loss= 'categorical_crossentropy' , optimizer= 'adam' , metrics=['accuracy'])
  return model


In [81]:
estimator = KerasClassifier(build_fn=create_model, epochs=50, batch_size=1, verbose=2)
kfold = KFold(n_splits=10, shuffle=True)
results = cross_val_score(estimator, features, dummy_labels, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Epoch 1/50
 - 5s - loss: 0.8058 - accuracy: 0.8008
Epoch 2/50
 - 4s - loss: 0.4094 - accuracy: 0.8740
Epoch 3/50
 - 4s - loss: 0.3207 - accuracy: 0.9072
Epoch 4/50
 - 4s - loss: 0.2895 - accuracy: 0.9143
Epoch 5/50
 - 4s - loss: 0.2772 - accuracy: 0.9177
Epoch 6/50
 - 4s - loss: 0.2638 - accuracy: 0.9204
Epoch 7/50
 - 4s - loss: 0.2468 - accuracy: 0.9197
Epoch 8/50
 - 4s - loss: 0.2416 - accuracy: 0.9251
Epoch 9/50
 - 4s - loss: 0.2381 - accuracy: 0.9228
Epoch 10/50
 - 4s - loss: 0.2288 - accuracy: 0.9224
Epoch 11/50
 - 4s - loss: 0.2226 - accuracy: 0.9248
Epoch 12/50
 - 4s - loss: 0.2191 - accuracy: 0.9255
Epoch 13/50
 - 4s - loss: 0.2165 - accuracy: 0.9238
Epoch 14/50
 - 4s - loss: 0.2125 - accuracy: 0.9268
Epoch 15/50
 - 4s - loss: 0.2134 - accuracy: 0.9262
Epoch 16/50
 - 4s - loss: 0.2061 - accuracy: 0.9241
Epoch 17/50
 - 4s - loss: 0.2058 - accuracy: 0.9275
Epoch 18/50
 - 4s - loss: 0.2088 - accuracy: 0.9255
Epoch 19/50
 - 4s - loss: 0.2021 - accuracy: 0.9285
Epoch 20/50
 - 4s - l

Epoch 10/50
 - 4s - loss: 0.2272 - accuracy: 0.9201
Epoch 11/50
 - 4s - loss: 0.2243 - accuracy: 0.9187
Epoch 12/50
 - 4s - loss: 0.2106 - accuracy: 0.9245
Epoch 13/50
 - 4s - loss: 0.2161 - accuracy: 0.9231
Epoch 14/50
 - 4s - loss: 0.2045 - accuracy: 0.9224
Epoch 15/50
 - 4s - loss: 0.2033 - accuracy: 0.9234
Epoch 16/50
 - 4s - loss: 0.2036 - accuracy: 0.9258
Epoch 17/50
 - 4s - loss: 0.1993 - accuracy: 0.9245
Epoch 18/50
 - 4s - loss: 0.1940 - accuracy: 0.9238
Epoch 19/50
 - 4s - loss: 0.1942 - accuracy: 0.9319
Epoch 20/50
 - 4s - loss: 0.1906 - accuracy: 0.9316
Epoch 21/50
 - 4s - loss: 0.1893 - accuracy: 0.9268
Epoch 22/50
 - 4s - loss: 0.1860 - accuracy: 0.9306
Epoch 23/50
 - 4s - loss: 0.1848 - accuracy: 0.9306
Epoch 24/50
 - 4s - loss: 0.1851 - accuracy: 0.9278
Epoch 25/50
 - 4s - loss: 0.1838 - accuracy: 0.9292
Epoch 26/50
 - 4s - loss: 0.1859 - accuracy: 0.9302
Epoch 27/50
 - 4s - loss: 0.1763 - accuracy: 0.9326
Epoch 28/50
 - 4s - loss: 0.1799 - accuracy: 0.9289
Epoch 29/50


 - 4s - loss: 0.2242 - accuracy: 0.9211
Epoch 19/50
 - 4s - loss: 0.2200 - accuracy: 0.9201
Epoch 20/50
 - 4s - loss: 0.2199 - accuracy: 0.9207
Epoch 21/50
 - 4s - loss: 0.2167 - accuracy: 0.9238
Epoch 22/50
 - 4s - loss: 0.2167 - accuracy: 0.9214
Epoch 23/50
 - 4s - loss: 0.2131 - accuracy: 0.9228
Epoch 24/50
 - 4s - loss: 0.2134 - accuracy: 0.9217
Epoch 25/50
 - 4s - loss: 0.2127 - accuracy: 0.9238
Epoch 26/50
 - 4s - loss: 0.2121 - accuracy: 0.9207
Epoch 27/50
 - 4s - loss: 0.2076 - accuracy: 0.9231
Epoch 28/50
 - 4s - loss: 0.2065 - accuracy: 0.9245
Epoch 29/50
 - 4s - loss: 0.2022 - accuracy: 0.9251
Epoch 30/50
 - 5s - loss: 0.2084 - accuracy: 0.9228
Epoch 31/50
 - 4s - loss: 0.1990 - accuracy: 0.9255
Epoch 32/50
 - 5s - loss: 0.2029 - accuracy: 0.9238
Epoch 33/50
 - 4s - loss: 0.2012 - accuracy: 0.9255
Epoch 34/50
 - 3s - loss: 0.1916 - accuracy: 0.9268
Epoch 35/50
 - 4s - loss: 0.1983 - accuracy: 0.9241
Epoch 36/50
 - 3s - loss: 0.1915 - accuracy: 0.9234
Epoch 37/50
 - 3s - loss

Epoch 27/50
 - 4s - loss: 0.1877 - accuracy: 0.9309
Epoch 28/50
 - 4s - loss: 0.1829 - accuracy: 0.9295
Epoch 29/50
 - 4s - loss: 0.1911 - accuracy: 0.9282
Epoch 30/50
 - 4s - loss: 0.1815 - accuracy: 0.9299
Epoch 31/50
 - 4s - loss: 0.1854 - accuracy: 0.9292
Epoch 32/50
 - 4s - loss: 0.1797 - accuracy: 0.9343
Epoch 33/50
 - 4s - loss: 0.1822 - accuracy: 0.9319
Epoch 34/50
 - 4s - loss: 0.1836 - accuracy: 0.9322
Epoch 35/50
 - 4s - loss: 0.1775 - accuracy: 0.9356
Epoch 36/50
 - 4s - loss: 0.1741 - accuracy: 0.9336
Epoch 37/50
 - 4s - loss: 0.1736 - accuracy: 0.9360
Epoch 38/50
 - 4s - loss: 0.1714 - accuracy: 0.9356
Epoch 39/50
 - 4s - loss: 0.1722 - accuracy: 0.9353
Epoch 40/50
 - 4s - loss: 0.1764 - accuracy: 0.9329
Epoch 41/50
 - 4s - loss: 0.1695 - accuracy: 0.9329
Epoch 42/50
 - 4s - loss: 0.1745 - accuracy: 0.9319
Epoch 43/50
 - 4s - loss: 0.1789 - accuracy: 0.9373
Epoch 44/50
 - 4s - loss: 0.1713 - accuracy: 0.9394
Epoch 45/50
 - 4s - loss: 0.1720 - accuracy: 0.9394
Epoch 46/50
