In [2]:
from __future__ import division
import numpy as np
import os, sys
import matplotlib
#matplotlib.use('Agg')
%matplotlib nbagg
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict


In [14]:
def getCDF(data):
    xdata = np.sort(data)
    ydata = [i/len(xdata) for i in range(len(xdata))]
    return xdata, ydata

In [12]:
df = pd.read_csv("../data/dns_data_cs/dns_all_20170124_20170130_sample.csv", sep="\t", error_bad_lines=False,
                names=['time', 'srcip', 'dstip', 'dnsquery']).fillna('')

In [15]:
df.head()


Unnamed: 0,time,srcip,dstip,dnsquery
0,0.0,140.180.220.42,128.112.129.32,pu.win.princeton.edu
1,0.001957,140.180.220.42,128.112.129.32,pu.win.princeton.edu
2,0.00292,10.9.79.26,65.111.161.119,localhost.local
3,0.003532,10.8.233.128,10.8.0.5,a.dolimg.com
4,0.005694,10.8.244.89,10.8.0.5,ce.lijit.com


## STATS

In [16]:
print "Total time of dns record [1.7GB]:", (df.iloc[-1]['time'] - df.iloc[0]['time']), "s"
print "Total number of queries: ", len(df)
print "Queries without source IP (are probably IPv6 so IP was not extracted): ", len(df[df['srcip']==''])

print "Unique source IPs (devices or homes):", len(df['srcip'].unique())
print "Unique destination IPs (DNS servers):", len(df['dstip'].unique())

print "Unique queries:", len(df['dnsquery'].unique())

Total time of dns record [1.7GB]: 50945.441681 s
Total number of queries:  27537421
Queries without source IP (are probably IPv6 so IP was not extracted):  148
Unique source IPs (devices or homes): 17743
Unique destination IPs (DNS servers): 1792
Unique queries: 666723


## KNOWN IOT DEVICES AND SPECIFIC DOMAINS

In [18]:
DNS_moderate_lookups = {
    'nest_therm': '.nest.com',
    'nest_cam': '.dropcam.com',
    'smartthings': '.smartthings.com',
    'pixstar': '.pix-star.com',
    'echo': 'device-metrics-us.amazon.com',
    'sharx': 'sharxsecurity.com',
    'august': '.august.com',
    'hue': '.meethue.com',
    #'apple': 'time-ios.apple.com',
    'apple': 'gs-loc.ls-apple.com',
    'ps': '.playstation.net',
    'xbox': 'xboxlive.com',
    'nintendo': 'nintendo.net',
}

In [19]:
iot_list = defaultdict(int)
for domain in DNS_moderate_lookups.values():
    if len(iot_list) == 0:
        iot_list = df['dnsquery'].str.contains(domain)
    else:
        iot_list = iot_list | df['dnsquery'].str.contains(domain)

In [24]:
df_iot = df[iot_list].copy()

In [25]:
len(df_iot)

28605

### Add corresponding device identified

In [27]:
inverse_DNS_lookups = {v: k for k, v in DNS_moderate_lookups.items()}
df_iot['device'] = ''

for domain in DNS_moderate_lookups.values():
    df_iot.loc[ df_iot['dnsquery'].str.contains(domain), 'device'] = inverse_DNS_lookups[domain]

In [28]:
df_iot.head()

Unnamed: 0,time,srcip,dstip,dnsquery,device
277,0.326137,10.8.252.6,10.8.0.5,device-metrics-us.amazon.com,echo
978,1.37333,10.8.117.252,10.8.0.6,us-prof.np.community.playstation.net,ps
3319,4.8464,10.8.75.213,10.8.0.5,device-metrics-us.amazon.com,echo
3491,5.10355,10.8.28.249,10.8.0.5,xsts.auth.xboxlive.com,xbox
3503,5.12828,10.8.28.249,10.8.0.6,xsts.auth.xboxlive.com,xbox


### Device vs unique source IPs

In [32]:
df_iot_count = df_iot.groupby(['srcip', 'device'])['dnsquery'].count().reset_index()
df_iot_count.head()

Unnamed: 0,srcip,device,dnsquery
0,10.8.0.179,apple,1
1,10.8.0.184,apple,2
2,10.8.0.22,apple,1
3,10.8.0.238,xbox,1
4,10.8.0.57,apple,3


In [39]:
gp = df_iot_count.groupby('device')

df_iot_dev_stats = pd.DataFrame( { 'count_srcip': gp['srcip'].count(),
                                  'mean_dnsquery_count': gp['dnsquery'].apply(lambda x: x.mean()),
                                  'std_dnsquery_count': gp['dnsquery'].apply(lambda x: x.std()) })

df_iot_dev_stats

Unnamed: 0_level_0,count_srcip,mean_dnsquery_count,std_dnsquery_count
device,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
apple,2231,4.076647,7.413595
august,3,2.333333,2.309401
echo,531,15.80226,34.900566
hue,7,2.714286,3.302236
nest_cam,14,4.357143,3.855209
nest_therm,32,10.84375,19.597744
nintendo,39,23.461538,27.091314
ps,62,69.709677,98.517933
smartthings,5,4.4,2.701851
xbox,732,7.412568,18.132111


# IDENTIFY IOT DEVICES

## NUM UNIQUE QUERIES PER DEVICE

In [42]:
gp1 = df.groupby(['srcip', 'dnsquery'])

In [54]:
df1 = gp1['time'].count().reset_index()

df1.head(20)

Unnamed: 0,srcip,dnsquery,time
0,,,148
1,10.24.117.145,_ldap._tcp.dc._msdcs.WORKGROUP.princeton.edu,1
2,10.24.117.145,_ldap._tcp.dc._msdcs.princeton.edu,2
3,10.24.117.145,ajax.cdnjs.com,2
4,10.24.117.145,http-test1.hola.org,2
5,10.24.117.145,isatap.princeton.edu,4
6,10.24.117.145,win8.ipv6.microsoft.com,2
7,10.24.117.145,wpad.princeton.edu,1
8,10.24.117.145,www.google.com,2
9,10.24.125.176,,6


In [129]:
len(df1)

5062521

In [61]:
# filter away empty/unknown queries or those not containing a period
#df2 = df1[~( (df1['dnsquery']=='') | (df1['dnsquery'].str.contains("<Unknown extended label>")) )]
df2 = df1[ (df1['dnsquery'].str.contains(".")) & (~df1['dnsquery'].str.contains("<"))]

In [130]:
len(df2)

5061798

In [131]:
# group query by sourceip and count num of individual queries per ip

gp2 = df2.groupby('srcip')
df3 = gp2['dnsquery'].count()

In [67]:
print "Number of source IPs =", len(df3)

Number of source IPs = 17633


In [134]:
len(df3)

17633

In [135]:
len( df3[df3<30] )

3012

In [133]:
x,y = getCDF(df3.values)

fig1, ax1 = plt.subplots()
ax1.plot(x,y)
ax1.set_xscale("log", nonposx='clip')
ax1.grid(1)
ax1.set_xlabel('log number of unique queries')
ax1.set_ylabel('CDF')
fig1.show()

<IPython.core.display.Javascript object>

In [87]:
df3.sort()

  if __name__ == '__main__':


In [137]:
df_30 = df3[df3<=30]
print len(df_30)
print len(df3)
df_30

3075
17633


srcip
10.24.117.145     8
10.24.175.34      1
10.24.193.171     1
10.24.231.132     1
10.25.115.108     1
10.25.128.192     2
10.25.181.202     1
10.254.1.56       1
10.254.4.156      1
10.254.4.235      1
10.254.4.83       1
10.254.4.84       1
10.254.4.86       1
10.254.4.87       1
10.254.4.88       1
10.254.5.138      1
10.254.5.163      1
10.254.5.164      1
10.254.5.165      1
10.254.5.166      1
10.254.5.177      1
10.254.5.204      1
10.254.5.47       1
10.254.6.122      1
10.254.6.134      1
10.254.6.136      1
10.254.6.160      1
10.254.6.161      1
10.254.6.162      1
10.254.6.163      1
                 ..
66.180.183.34     1
66.180.183.36     1
66.180.183.37     1
66.180.183.38     1
66.180.183.39     1
66.180.183.43     1
66.180.183.45     1
66.180.183.46     1
66.180.183.6      1
66.180.183.64     1
66.180.183.65     1
66.180.183.80     1
66.180.183.82     1
66.180.183.87     1
66.180.183.89     1
66.180.183.90     1
66.180.183.93     1
66.180.183.94     1
66.180.183.95 

In [108]:
df2.loc[gp2.indices['128.112.212.96']]

Unnamed: 0,srcip,dnsquery,time
4838733,128.112.134.236,ims314.Princeton.EDU,2


In [109]:
df2.loc[gp2.indices['10.9.211.18']]

Unnamed: 0,srcip,dnsquery,time
3623605,10.9.211.167,ipv4_1-cxl0-c109.1.mia003.ix.nflxvideo.net,8
3623606,10.9.211.167,ipv4_1-cxl0-c111.1.mia003.ix.nflxvideo.net,10
3623607,10.9.211.167,ipv4_1-cxl0-c115.1.mia003.ix.nflxvideo.net,4
3623608,10.9.211.167,ipv4_1-cxl0-c116.1.mia003.ix.nflxvideo.net,4
3623609,10.9.211.167,ipv4_1-cxl0-c117.1.mia003.ix.nflxvideo.net,18
3623610,10.9.211.167,ipv4_1-cxl0-c118.1.mia003.ix.nflxvideo.net,6
3623611,10.9.211.167,ipv4_1-cxl0-c119.1.mia003.ix.nflxvideo.net,8
3623612,10.9.211.167,ipv4_1-cxl0-c120.1.mia003.ix.nflxvideo.net,8
3623613,10.9.211.167,ipv4_1-cxl0-c123.1.mia003.ix.nflxvideo.net,10
3623614,10.9.211.167,ipv4_1-cxl0-c124.1.mia003.ix.nflxvideo.net,8


### Check IPs against df_iot

In [127]:
df_known_iot = df3.loc[df_iot_count.srcip].reset_index()
df_known_iot

Unnamed: 0,srcip,dnsquery
0,10.8.0.179,1326
1,10.8.0.184,318
2,10.8.0.22,59
3,10.8.0.238,147
4,10.8.0.57,151
5,10.8.0.85,448
6,10.8.1.10,526
7,10.8.1.115,158
8,10.8.1.135,625
9,10.8.1.157,348


In [128]:
df_iot_count.merge(df_known_iot, on='srcip')

Unnamed: 0,srcip,device,dnsquery_x,dnsquery_y
0,10.8.0.179,apple,1,1326
1,10.8.0.184,apple,2,318
2,10.8.0.22,apple,1,59
3,10.8.0.238,xbox,1,147
4,10.8.0.57,apple,3,151
5,10.8.0.85,apple,10,448
6,10.8.1.10,apple,3,526
7,10.8.1.115,echo,5,158
8,10.8.1.135,echo,1,625
9,10.8.1.157,apple,8,348
