# 01 - Data Preparation for No DNS Classification - Fitt Dataset

This notebook is intended to show the steps required to prepare the data for the ML training.

We use NTLK (the natural language toolkit) to process just the country_name geolocation records.  Other characteristics of this notebook include:

  * Fitt dataset used as it has the most number of country matches by IP geolocation lookup
  * Geolocation uses CSV from ip2location.com imported into a sqlite database

In [1]:
from pathlib import Path
import os
import re
import apsw
import pandas as pd
import time


In [2]:
cwd = Path.cwd().parent.parent
print(cwd)

/home/ricdeez/uni/projects/iotnetlearn


In [3]:
dbs = [
    # os.path.join(cwd, 'db', 'db01', 'NetCollector.sqlite'),
    os.path.join(cwd, 'db', 'db02', 'NetCollector.sqlite'),
    # os.path.join(cwd, 'db', 'db03', 'NetCollector.sqlite'),
]

In [4]:
from sqlalchemy import create_engine
df1 = None
df2 = None
df3 = None
for idx, db_path in enumerate(dbs):
    engine = create_engine(f'sqlite:////{db_path}')

    print(engine)

    sql = """

        select d.srcPort as srcport,
               d.dstPort as dstport,
               sum(d.frameSize) as totalframesize,
               min(d.frameSize) as minframesize,
               max(d.frameSize) as maxframesize,
               avg(d.frameTTL) as framettl,
               d.country_name,
               d.manufacturer,
               d.device_type
        from deviceLog d
        where d.device_type is not null and d.country_name is not null
        group by d.srcAddr, d.dstAddr, d.srcPort, d.dstPort

    """

    if idx == 0:
        df1 = pd.read_sql_query(sql, engine)
    elif idx == 1:
        df2 = pd.read_sql_query(sql, engine)
    elif idx == 2:
        df3 = pd.read_sql_query(sql, engine)



Engine(sqlite://///home/ricdeez/uni/projects/iotnetlearn/db/db02/NetCollector.sqlite)


In [5]:
df1 = pd.concat([df1, df2], axis='rows', ignore_index=True)
df1 = pd.concat([df1, df3], axis='rows', ignore_index=True)
global df
df = df1
df.head()

Unnamed: 0,srcport,dstport,totalframesize,minframesize,maxframesize,framettl,country_name,manufacturer,device_type
0,53,36741,142,142,142,57.0,United States,ASUSTek COMPUTER INC.,Samsung Phone
1,53,53,327344,112,124,56.906792,United States,ASUSTek COMPUTER INC.,Other
2,123,31488,540,90,90,50.0,Australia,ASUSTek COMPUTER INC.,Other
3,123,30001,180,90,90,50.0,Australia,ASUSTek COMPUTER INC.,Ring Doorbell
4,123,33302,90,90,90,49.0,Australia,ASUSTek COMPUTER INC.,Amplifi mesh


## One-hot encode most common country names

This uses a simple technique to one-hot encode the most common values for the country names.

In [6]:
from keras.preprocessing.text import Tokenizer

def tokenize_and_onehotencode(df, feature_name, num_words=19):
    """
    Augments the dataframe to include a one-hot encoded
    set of columns, limited by the number of words that 
    are passed in by argument
    """
    feature = df[feature_name]    
    tk = Tokenizer()
    tk.fit_on_texts(feature)
    # https://github.com/keras-team/keras/issues/8092
    tk.word_index = {e:i for e,i in tk.word_index.items() if i < num_words} 
    encoded_feature=tk.texts_to_matrix(feature, mode='freq')
    cols = [f'word_idx{i+1}' for i in range(num_words)]
    df2 = pd.DataFrame(data=encoded_feature, columns=cols)
    return df2

Using TensorFlow backend.


In [7]:
df2 = tokenize_and_onehotencode(df, feature_name='country_name', num_words=19)
df = pd.concat([df, df2], axis='columns')
df.head()

Unnamed: 0,srcport,dstport,totalframesize,minframesize,maxframesize,framettl,country_name,manufacturer,device_type,word_idx1,...,word_idx10,word_idx11,word_idx12,word_idx13,word_idx14,word_idx15,word_idx16,word_idx17,word_idx18,word_idx19
0,53,36741,142,142,142,57.0,United States,ASUSTek COMPUTER INC.,Samsung Phone,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,53,53,327344,112,124,56.906792,United States,ASUSTek COMPUTER INC.,Other,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,123,31488,540,90,90,50.0,Australia,ASUSTek COMPUTER INC.,Other,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,123,30001,180,90,90,50.0,Australia,ASUSTek COMPUTER INC.,Ring Doorbell,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,123,33302,90,90,90,49.0,Australia,ASUSTek COMPUTER INC.,Amplifi mesh,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## One-hot encode most common Destination Ports and Source Ports

This uses a simple technique to one-hot encode the most common values for the destination ports and source ports. 

In [8]:
import numpy as np

def build_feature_attribute_columns(func, features, vals):
    """
    Augments the dataframe to include a one-hot encoded
    set of columns, limited by the actual vals that are 
    passed in by argument.  

    Also includes a new feature
    for each of the features passed in for the negative
    case where none of the values are matched.
    """
    for feature in features:
        arr = []
        for val in vals:
            func('df[\'{}_is_{}\'] = np.where(df[\'{}\'] == \'{}\', 1, 0)'.format(
                    feature, val, feature, val))
            str = '(df[\'{}\'] != \'{}\')'.format(feature, val)
            arr.append(str)
        where_clause = ' & '.join(arr)
        str = 'df[\'{}_is_other\'] = np.where(({}),1,0)'.format(feature, where_clause)
        func(str)

## Trial run

In [9]:
vals = ['80', '443', '4070']
features = ['dstport', 'srcport']

# Change func to exec to change the dataset
# Warning! this will add features to the current df
build_feature_attribute_columns(func=print,
                                    features=features,
                                    vals=vals)

df['dstport_is_80'] = np.where(df['dstport'] == '80', 1, 0)
df['dstport_is_443'] = np.where(df['dstport'] == '443', 1, 0)
df['dstport_is_4070'] = np.where(df['dstport'] == '4070', 1, 0)
df['dstport_is_other'] = np.where(((df['dstport'] != '80') & (df['dstport'] != '443') & (df['dstport'] != '4070')),1,0)
df['srcport_is_80'] = np.where(df['srcport'] == '80', 1, 0)
df['srcport_is_443'] = np.where(df['srcport'] == '443', 1, 0)
df['srcport_is_4070'] = np.where(df['srcport'] == '4070', 1, 0)
df['srcport_is_other'] = np.where(((df['srcport'] != '80') & (df['srcport'] != '443') & (df['srcport'] != '4070')),1,0)


## Final Run

If happy with the output of the above step, we can run the same
function, but this time changing the function that is passed in
to the exec function, causing the dynamically created strings
to be executed and augmenting the df with the new synthetic features.

`Note that the use of the exec feature would make this implementation insecure for any uses other than in an interactive session`

In [10]:
build_feature_attribute_columns(func=exec,
                                    features=features,
                                    vals=vals)
df.head()

Unnamed: 0,srcport,dstport,totalframesize,minframesize,maxframesize,framettl,country_name,manufacturer,device_type,word_idx1,...,word_idx18,word_idx19,dstport_is_80,dstport_is_443,dstport_is_4070,dstport_is_other,srcport_is_80,srcport_is_443,srcport_is_4070,srcport_is_other
0,53,36741,142,142,142,57.0,United States,ASUSTek COMPUTER INC.,Samsung Phone,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
1,53,53,327344,112,124,56.906792,United States,ASUSTek COMPUTER INC.,Other,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
2,123,31488,540,90,90,50.0,Australia,ASUSTek COMPUTER INC.,Other,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
3,123,30001,180,90,90,50.0,Australia,ASUSTek COMPUTER INC.,Ring Doorbell,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
4,123,33302,90,90,90,49.0,Australia,ASUSTek COMPUTER INC.,Amplifi mesh,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1


In [11]:
drop_columns = ['srcport', 'dstport', 'country_name']
df.drop(drop_columns, axis='columns', inplace=True)

In [12]:
df.head()

Unnamed: 0,totalframesize,minframesize,maxframesize,framettl,manufacturer,device_type,word_idx1,word_idx2,word_idx3,word_idx4,...,word_idx18,word_idx19,dstport_is_80,dstport_is_443,dstport_is_4070,dstport_is_other,srcport_is_80,srcport_is_443,srcport_is_4070,srcport_is_other
0,142,142,142,57.0,ASUSTek COMPUTER INC.,Samsung Phone,0.0,0.5,0.5,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
1,327344,112,124,56.906792,ASUSTek COMPUTER INC.,Other,0.0,0.5,0.5,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
2,540,90,90,50.0,ASUSTek COMPUTER INC.,Other,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1
3,180,90,90,50.0,ASUSTek COMPUTER INC.,Ring Doorbell,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1
4,90,90,90,49.0,ASUSTek COMPUTER INC.,Amplifi mesh,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1


## Categorical data mapping

For the deep learning algorithm to work, we need to get rid of all categorical data.  For the Manufacturer's we will create a 1:1 mapping of the manufacturer name as per the wireshark OUI lookup dataset and the relative
position of that in our ordered array of unique entries

In [13]:
mapping = {k: v for v, k in enumerate((x for x in df['manufacturer'].unique() if len(x)>0),1)}
print(mapping)

{'ASUSTek COMPUTER INC.': 1, 'Yamaha Corporation': 2, 'Ubiquiti Networks Inc.': 3, 'Sonos, Inc.': 4, 'Samsung Electro-Mechanics(Thailand)': 5, 'Sony Corporation': 6, 'Apple, Inc.': 7, 'Liteon Technology Corporation': 8, 'Microsoft': 9, 'Unknown': 10, 'Intel Corporate': 11, 'Hewlett Packard': 12, 'Microchip Technology Inc.': 13, 'ICP Electronics Inc.': 14}


In [14]:
df['manufacturer'] = df['manufacturer'].map(mapping)

In [15]:
df.head()

Unnamed: 0,totalframesize,minframesize,maxframesize,framettl,manufacturer,device_type,word_idx1,word_idx2,word_idx3,word_idx4,...,word_idx18,word_idx19,dstport_is_80,dstport_is_443,dstport_is_4070,dstport_is_other,srcport_is_80,srcport_is_443,srcport_is_4070,srcport_is_other
0,142,142,142,57.0,1.0,Samsung Phone,0.0,0.5,0.5,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
1,327344,112,124,56.906792,1.0,Other,0.0,0.5,0.5,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
2,540,90,90,50.0,1.0,Other,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1
3,180,90,90,50.0,1.0,Ring Doorbell,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1
4,90,90,90,49.0,1.0,Amplifi mesh,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1


## Min-max scaling of values

The following code performs min-max scaling of our numerical features:

\begin{align}
\mathcal{Z} & = \frac{x\tiny{i} \small- min(X)}{max(X) - min(X)} 
\end{align}

In [16]:
features = ['totalframesize', 'minframesize', 'maxframesize', 'framettl']
for feature in features:
    df[feature] = ((df[feature] - df[feature].min()) / 
                   (df[feature].max() - df[feature].min()))


In [17]:
df.head()

Unnamed: 0,totalframesize,minframesize,maxframesize,framettl,manufacturer,device_type,word_idx1,word_idx2,word_idx3,word_idx4,...,word_idx18,word_idx19,dstport_is_80,dstport_is_443,dstport_is_4070,dstport_is_other,srcport_is_80,srcport_is_443,srcport_is_4070,srcport_is_other
0,1.144632e-07,0.056396,0.002341,0.220472,1.0,Samsung Phone,0.0,0.5,0.5,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
1,0.0004568535,0.035763,0.001827,0.220105,1.0,Other,0.0,0.5,0.5,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
2,6.700288e-07,0.020633,0.000856,0.192913,1.0,Other,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1
3,1.675072e-07,0.020633,0.000856,0.192913,1.0,Ring Doorbell,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1
4,4.18768e-08,0.020633,0.000856,0.188976,1.0,Amplifi mesh,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1


## Take a random sample or not

The following code allows us to check our model on a small subset
of the data.  In this case, with frac set to 0.1 we are randomly sampling 10% of the data

In [18]:
# Comment one of the lines below accordingly
# df_sample = df.sample(frac=0.1)
df_sample = df
df_sample.describe()

Unnamed: 0,totalframesize,minframesize,maxframesize,framettl,manufacturer,word_idx1,word_idx2,word_idx3,word_idx4,word_idx5,...,word_idx18,word_idx19,dstport_is_80,dstport_is_443,dstport_is_4070,dstport_is_other,srcport_is_80,srcport_is_443,srcport_is_4070,srcport_is_other
count,94668.0,94668.0,94668.0,94668.0,94667.0,94668.0,94668.0,94668.0,94668.0,94668.0,...,94668.0,94668.0,94668.0,94668.0,94668.0,94668.0,94668.0,94668.0,94668.0,94668.0
mean,8.554994e-05,0.003628,0.022215,0.37508,2.182144,0.0,0.342909,0.341346,0.14928,0.093115,...,6.3e-05,4.2e-05,0.25788,0.144146,0.080545,0.517429,0.257838,0.144072,0.080545,0.517546
std,0.005584256,0.014295,0.040047,0.218561,1.030119,0.0,0.232096,0.232716,0.356366,0.290595,...,0.007961,0.0065,0.43747,0.351239,0.272136,0.499699,0.437447,0.351164,0.272136,0.499695
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.811959e-07,0.0,0.00254,0.248031,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.234668e-06,0.004127,0.012017,0.248031,3.0,0.0,0.5,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
75%,2.409312e-06,0.004127,0.018867,0.5,3.0,0.0,0.5,0.5,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,14.0,0.0,0.5,0.5,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Save  df_sample DataFrame for future use

In order to allow us to commence our training without having to repeat these steps, we will now save the df_sample dataframe using the pandas built-in to_json method.

In [19]:
import os
from pathlib import Path
cwd = Path.cwd()

file_path = os.path.join(cwd, 'saved_datasets', 'fitt_dataset_nodns.json' )
df_sample.to_json(file_path)
print("[+] Stored pandas dataframe to disk")

[+] Stored pandas dataframe to disk


In [20]:
df_sample.head()

Unnamed: 0,totalframesize,minframesize,maxframesize,framettl,manufacturer,device_type,word_idx1,word_idx2,word_idx3,word_idx4,...,word_idx18,word_idx19,dstport_is_80,dstport_is_443,dstport_is_4070,dstport_is_other,srcport_is_80,srcport_is_443,srcport_is_4070,srcport_is_other
0,1.144632e-07,0.056396,0.002341,0.220472,1.0,Samsung Phone,0.0,0.5,0.5,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
1,0.0004568535,0.035763,0.001827,0.220105,1.0,Other,0.0,0.5,0.5,0.0,...,0.0,0.0,0,0,0,1,0,0,0,1
2,6.700288e-07,0.020633,0.000856,0.192913,1.0,Other,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1
3,1.675072e-07,0.020633,0.000856,0.192913,1.0,Ring Doorbell,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1
4,4.18768e-08,0.020633,0.000856,0.188976,1.0,Amplifi mesh,0.0,0.0,0.0,1.0,...,0.0,0.0,0,0,0,1,0,0,0,1
