In [1]:
from pathlib import Path
import os
import re
import apsw
import pandas as pd
import time
import pprint


In [2]:
cwd = Path.cwd()
db_path = os.path.join(cwd.parent, 'Data', 'NetCollector.sqlite')


In [3]:
conn = apsw.Connection(db_path)
cur = conn.cursor()

sql = """
-- We still have the smell on some endtime entries being less than
-- some starttime entries

select s.srcport,
       s.dstport,
       s.lensum,
       s.pktcount,
       abs(s.endtime - s.starttime) as durn, -- SMELL --
       d.manufacturer,
       d.device_type
from sessions s
         inner join v_session_mnf_devicetype d on s.sessionid = d.sessionid
where d.device_type is not null

"""

In [4]:
arr = []
for row in cur.execute(sql):
    arr.append(row)


In [6]:
cols = ['SourcePort',
        'DestnPort',
        'SessionPackets',
        'SessionPktLen',
        'SessionPktArrivalTime',
        'Manufacturer',
        'DeviceType']
df = pd.DataFrame(data=arr, columns=cols)

In [7]:
from sklearn import preprocessing

In [8]:
df

Unnamed: 0,SourcePort,DestnPort,SessionPackets,SessionPktLen,SessionPktArrivalTime,Manufacturer,DeviceType
0,55630,63960,36384156,126998,419.030715,Ubiquiti Networks Inc.,Other
1,63960,55630,9039457,125631,419.074508,"Apple, Inc.",Other
2,49322,7550,306906857,228847,419.074864,Ubiquiti Networks Inc.,UVC-G3-Flex Camera
3,49323,7550,84313180,83489,419.072925,Ubiquiti Networks Inc.,UVC-G3-Flex Camera
4,443,58235,18288,127,434.025740,Ubiquiti Networks Inc.,Other
...,...,...,...,...,...,...,...
9683,53,61777,151,1,0.000000,Ubiquiti Networks Inc.,Other
9684,55463,443,1999,5,0.203312,Technicolor CH USA Inc.,Other
9685,443,55463,1840,6,0.175634,Ubiquiti Networks Inc.,Other
9686,54459,53,87,1,0.000000,Intel Corporate,Other


**Outstanding**

The following needs to be done:

1. Normalise the values in place (all non-categorical is between 0 and 1)
2. Create category maps for Source Port, Destination Port and Manufacturer
3. One-hot encode DNS queries (there are like 250 of them, so we may need to only do the top 100, rest is other)
4. Create test, training and validation sets




In [9]:
df.describe()

Unnamed: 0,SessionPackets,SessionPktLen,SessionPktArrivalTime
count,9688.0,9688.0,9688.0
mean,290412.2,349.180429,48.017694
std,11063290.0,8618.014923,123.989873
min,60.0,1.0,0.0
25%,242.0,2.0,0.031813
50%,1178.0,6.0,14.333285
75%,3627.25,13.0,30.437062
max,740233800.0,496770.0,2088.069581


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9688 entries, 0 to 9687
Data columns (total 7 columns):
SourcePort               9688 non-null object
DestnPort                9688 non-null object
SessionPackets           9688 non-null int64
SessionPktLen            9688 non-null int64
SessionPktArrivalTime    9688 non-null float64
Manufacturer             9688 non-null object
DeviceType               9688 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 529.9+ KB


In [11]:
import numpy as np
df['is_iot'] = np.where(df['DeviceType'] == 'Other', 0, 1)


In [12]:
df.head()

Unnamed: 0,SourcePort,DestnPort,SessionPackets,SessionPktLen,SessionPktArrivalTime,Manufacturer,DeviceType,is_iot
0,55630,63960,36384156,126998,419.030715,Ubiquiti Networks Inc.,Other,0
1,63960,55630,9039457,125631,419.074508,"Apple, Inc.",Other,0
2,49322,7550,306906857,228847,419.074864,Ubiquiti Networks Inc.,UVC-G3-Flex Camera,1
3,49323,7550,84313180,83489,419.072925,Ubiquiti Networks Inc.,UVC-G3-Flex Camera,1
4,443,58235,18288,127,434.02574,Ubiquiti Networks Inc.,Other,0


In [13]:
df.groupby(df['SessionPackets'] > df['SessionPackets'].mean() + 2*df['SessionPackets'].std() ).mean()

Unnamed: 0_level_0,SessionPackets,SessionPktLen,SessionPktArrivalTime,is_iot
SessionPackets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,44996.44,158.915848,47.780708,0.14525
True,158550800.0,123044.466667,200.842118,0.466667


In [14]:
for entry in sorted(df['Manufacturer'].unique()):
    print(entry)

Amazon Technologies Inc.
Apple, Inc.
AzureWave Technology Inc.
Beijing LT Honway Technology Co.,Ltd
Google, Inc.
Hewlett Packard
Intel Corporate
Murata Manufacturing Co., Ltd.
Raspberry Pi Foundation
Realtek Semiconductor Corp.
Rivet Networks
Samsung Electro-Mechanics(Thailand)
Technicolor CH USA Inc.
Topwell International Holdinds Limited
Ubiquiti Networks Inc.


In [20]:
df.groupby(df['Manufacturer'] == 'Raspberry Pi Foundation').mean()

Unnamed: 0_level_0,SessionPackets,SessionPktLen,SessionPktArrivalTime,is_iot
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,290561.972942,349.35898,47.767782,0.145306
True,306.0,3.4,531.997779,1.0


In [17]:
df.groupby(df['SessionPktArrivalTime'] > df['SessionPktArrivalTime'].mean() + 2*df['SessionPktArrivalTime'].std() ).mean()

Unnamed: 0_level_0,SessionPackets,SessionPktLen,SessionPktArrivalTime,is_iot
SessionPktArrivalTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,91567.01,123.960769,26.410619,0.146871
True,4520100.0,5139.887356,507.627501,0.121839


**Analysis**

The above seems to show the following:

1. Likelihood of something being an IoT device is correlated with the Session Packets
2. If more than 1e8 packets then it is guaranteed to be IoT (probable CCTV cameras
3. Not too much correlation between SessionPktLen and IoT
4. More of a chance of it being an IoT device if it does not have DNS packets
5. SessionPktArrivalTime tends to be a weak indicator for IoT



In [22]:
import tensorflow as tf
print(tf.__version__)

2.0.0-rc1


In [24]:
import sklearn


In [25]:
from sklearn.model_selection import train_test_split

In [26]:
df.head()


Unnamed: 0,SourcePort,DestnPort,SessionPackets,SessionPktLen,SessionPktArrivalTime,Manufacturer,DeviceType,is_iot
0,55630,63960,36384156,126998,419.030715,Ubiquiti Networks Inc.,Other,0
1,63960,55630,9039457,125631,419.074508,"Apple, Inc.",Other,0
2,49322,7550,306906857,228847,419.074864,Ubiquiti Networks Inc.,UVC-G3-Flex Camera,1
3,49323,7550,84313180,83489,419.072925,Ubiquiti Networks Inc.,UVC-G3-Flex Camera,1
4,443,58235,18288,127,434.02574,Ubiquiti Networks Inc.,Other,0
