In [1]:
from __future__ import division
import numpy as np
import os, sys
import matplotlib
#matplotlib.use('Agg')
%matplotlib nbagg
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
from collections import defaultdict, Counter


def getCDF(data):
    xdata = np.sort(data)
    ydata = [i/len(xdata) for i in range(len(xdata))]
    return xdata, ydata

# STEPS
- load pcap -> tshark extract
- load tshark csv
- filter to a record only
- key srcip
- apply feature functions
 - number of dst
 - number of A queries
 - number of unique A queries
 - diff all: avg, 50%, min, max
 - most popular query 1,2,3
 - most popular query diff 1,2,3: avg
 - least popular query
 - FREQUENCY BASED FEATURES (need to finish resampling code #todo)

In [2]:
# load from data/ and extract to processed/
infile='data/split_6hour_00000_20170206235900.pcap'
outfile='processed/split_6hour_00000_20170206235900.csv'

cmd_extract = 'tshark -r '+infile+' -E separator="|" -T fields -e frame.time_epoch \
-e frame.time_relative -e ip.src -e ip.dst -e dns.qry.name -e dns.qry.type -Y "dns.flags.response eq 0" \
> '+outfile

# ONLY DO THIS FOR NEW PCAP FILES
# subprocess.check_output(cmd_extract, shell=True)

In [3]:
# load csv from processed
df = pd.read_csv("processed/split_hour_00000_20170206235900.csv", sep="|", error_bad_lines=False,
                names=['time', 'time_relative', 'srcip', 'dstip', 'dnsqry', 'dnstype'],
                #dtype={'time': pd.np.float, 'time_relative': pd.np.float64, 'dnstype': pd.np.int},
                ).dropna()

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df['time'] = df['time'].astype(float)
df['time_relative'] = df['time_relative'].astype(float)

### Stats by dns.qry.type
- convert qry type hex -> int
- A record == 1
- for 2 or more qry type take the first one after splitting

In [5]:
dnstype_count = df.groupby('dnstype')['dnsqry'].count()
dnstype_count.sort_values(inplace=True, ascending=False)

print "Percentage of A Records = ", dnstype_count.iloc[0]/dnstype_count.sum()*100

dnstype_count.head(15)

Percentage of A Records =  77.8158606734


dnstype
0x00000001               1505933
0x0000001c                196059
0x0000000c                153732
0x0000000f                 37187
0x00000021                 33905
0x00000010                  3572
0x00000006                  3331
0x0000002b                   500
0x000000ff                   319
0x000000f9                   185
0x00000001,0x0000001c        147
0x00000002                   110
0x0000b71f                    36
0x0000a21f                    14
0x0000a11f                    11
Name: dnsqry, dtype: int64

In [6]:
# convert dnstype to int (take only first type in case of 2)
df['dnstype'] = df['dnstype'].apply(lambda x: int( x.split(',')[0], 16))

dnstype_count = df.groupby('dnstype')['dnsqry'].count()
dnstype_count.sort_values(inplace=True, ascending=False)

print "New percentage of A Records = ", dnstype_count.iloc[0]/dnstype_count.sum()*100

New percentage of A Records =  77.8234565834


In [7]:
df.head()

Unnamed: 0,time,time_relative,srcip,dstip,dnsqry,dnstype
0,1486444000.0,0.000567,10.9.151.75,10.8.0.5,safebrowsing.google.com,1
1,1486444000.0,0.000744,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1
2,1486444000.0,0.000853,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1
3,1486444000.0,0.003419,10.9.38.65,10.8.0.5,tpc.googlesyndication.com,1
4,1486444000.0,0.003675,10.6.63.107,128.112.129.209,98.63.6.10.in-addr.arpa,12


### filter to A records only

In [11]:
df_a = (df[df.dnstype == 1]).copy()

## Add 1LD and 2LD as column

In [13]:
def reduce_domain(domain, level):
    tmp = domain.split('.')
    out = '.'.join( tmp[-level:] )
    return out
    
reduce_domain('www.youtube.com', 2)

'youtube.com'

In [14]:
df_a['1LD'] = df_a['dnsqry'].apply(lambda x: reduce_domain(x, 1))
df_a['2LD'] = df_a['dnsqry'].apply(lambda x: reduce_domain(x, 2))

df_a.head()

Unnamed: 0,time,time_relative,srcip,dstip,dnsqry,dnstype,1LD,2LD
0,1486444000.0,0.000567,10.9.151.75,10.8.0.5,safebrowsing.google.com,1,com,google.com
1,1486444000.0,0.000744,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1,dnsbl,spamhaus.dnsbl
2,1486444000.0,0.000853,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1,dnsbl,spamhaus.dnsbl
3,1486444000.0,0.003419,10.9.38.65,10.8.0.5,tpc.googlesyndication.com,1,com,googlesyndication.com
6,1486444000.0,0.003945,10.9.38.65,10.8.0.5,www.google.com,1,com,google.com


# FILTER OUT princeton.edu

In [34]:
df_b = df_a[ df_a['2LD'] != 'princeton.edu' ]
#df_c = df_b[ df_b['srcip'].str.contains('140.180.128') ]
df_b

Unnamed: 0,time,time_relative,srcip,dstip,dnsqry,dnstype,1LD,2LD
0,1.486444e+09,0.000567,10.9.151.75,10.8.0.5,safebrowsing.google.com,1,com,google.com
1,1.486444e+09,0.000744,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1,dnsbl,spamhaus.dnsbl
2,1.486444e+09,0.000853,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1,dnsbl,spamhaus.dnsbl
3,1.486444e+09,0.003419,10.9.38.65,10.8.0.5,tpc.googlesyndication.com,1,com,googlesyndication.com
6,1.486444e+09,0.003945,10.9.38.65,10.8.0.5,www.google.com,1,com,google.com
7,1.486444e+09,0.004137,10.8.155.147,10.8.0.5,www.google-analytics.com,1,com,google-analytics.com
8,1.486444e+09,0.006005,10.9.38.65,10.8.0.5,s.adroll.com,1,com,adroll.com
11,1.486444e+09,0.013177,10.8.239.134,10.8.0.5,safebrowsing.google.com,1,com,google.com
12,1.486444e+09,0.013527,10.8.12.117,10.8.0.5,choices-vpc.truste.com,1,com,truste.com
13,1.486444e+09,0.024620,10.9.168.1,10.8.0.5,images-na.ssl-images-amazon.com,1,com,ssl-images-amazon.com


## group by srcip as KEY and extract features

In [24]:
df_sample = df_b#.head(5000).copy()

In [25]:
grouped = df_sample.groupby('srcip')

In [26]:
# grouped.agg({'C' : np.sum, 'D' : lambda x: np.std(x, ddof=1)})
data = defaultdict(list)

for srcip, group in grouped:
    # key
    data['srcip'].append( srcip )
    
    # DSTIP
    # number of unique dstip
    data['dstip.count'].append( len( set( group['dstip'] ) ) )
    
    # most common dstip
    data['dstip.most_popular'].append( Counter( group['dstip'] ).most_common(1)[0][0] )
    
    # DNS QRY
    # count of total domains
    data['dnsqry.count'].append( group['dnsqry'].count() )
    
    # count of unique domains
    data['dnsqry.unique.count'].append( len( set( group['dnsqry'] )) )
    
    # count of unique 1LDs
    data['1LD.unique.count'].append( len( set( group['1LD'] )) )
    # count of unique 2LDs
    data['2LD.unique.count'].append( len( set( group['2LD'] )) )
    
    # TIME DIFF ALL - DESCRIBE
    td = group['time'].diff()  #fold(1)
    data['time_diff.mean'].append( td.mean() )
    data['time_diff.std'].append( td.std() )
    data['time_diff.min'].append( td.min() ) 
    data['time_diff.max'].append( td.max() )
    data['time_diff.median'].append( td.median() )
    
    # TOP 3 most popular domains and their time diffs
    top3 = Counter( group['dnsqry'] ).most_common(3)
    for num in range(3):
        column_name = 'dnsqry.most_pop'+ str(num)
        
        if len(top3)>num:
            pop_domains = top3[num]
            domain = pop_domains[0]  #domain
            occurance = pop_domains[1]   #number of lookups
            data[column_name + '.domain'].append( domain )   
            data[column_name + '.count'].append( occurance )
            # time diff
            td = group[ group['dnsqry']==domain ]['time'].diff()
            data[column_name + '.time_diff.mean'].append( td.mean() )
            data[column_name + '.time_diff.std'].append( td.std() )
            data[column_name + '.time_diff.min'].append( td.min() )
            data[column_name + '.time_diff.max'].append( td.max() )
            data[column_name + '.time_diff.median'].append( td.median() )
            
        else:
            data[column_name + '.domain'].append( np.nan )   
            data[column_name + '.count'].append( np.nan )
            # time diff
            data[column_name + '.time_diff.mean'].append( np.nan )
            data[column_name + '.time_diff.std'].append( np.nan )
            data[column_name + '.time_diff.min'].append( np.nan )
            data[column_name + '.time_diff.max'].append( np.nan )
            data[column_name + '.time_diff.median'].append( np.nan )
            
    # Least popular domain
    bottom = Counter( group['dnsqry'] ).most_common()[-1]
    data['dnsqry.least_pop.domain'].append( bottom[0] )    #domain
    data['dnsqry.least_pop.count'].append( bottom[1] )     #occurances

    # FREQUENCY BASED FEATURES
    #TODO

In [27]:
# check num entries before converting to dataframe
for k,v in data.items():
    print len(v), "\t", k

9263 	dstip.most_popular
9263 	dnsqry.most_pop0.time_diff.median
9263 	dnsqry.count
9263 	dnsqry.most_pop0.domain
9263 	time_diff.median
9263 	dnsqry.most_pop0.time_diff.max
9263 	dstip.count
9263 	1LD.unique.count
9263 	dnsqry.least_pop.count
9263 	dnsqry.most_pop2.time_diff.min
9263 	dnsqry.most_pop1.time_diff.std
9263 	dnsqry.most_pop1.time_diff.median
9263 	2LD.unique.count
9263 	dnsqry.most_pop2.time_diff.median
9263 	dnsqry.most_pop2.count
9263 	dnsqry.most_pop0.count
9263 	dnsqry.most_pop1.time_diff.min
9263 	time_diff.std
9263 	dnsqry.most_pop0.time_diff.mean
9263 	dnsqry.most_pop1.time_diff.mean
9263 	dnsqry.least_pop.domain
9263 	dnsqry.most_pop2.time_diff.std
9263 	dnsqry.most_pop1.time_diff.max
9263 	srcip
9263 	dnsqry.most_pop1.domain
9263 	dnsqry.unique.count
9263 	time_diff.max
9263 	dnsqry.most_pop1.count
9263 	dnsqry.most_pop2.time_diff.max
9263 	time_diff.mean
9263 	dnsqry.most_pop2.domain
9263 	time_diff.min
9263 	dnsqry.most_pop0.time_diff.min
9263 	dnsqry.most_pop2

In [28]:
df_features = pd.DataFrame(data)

In [29]:
df_features

Unnamed: 0,1LD.unique.count,2LD.unique.count,dnsqry.count,dnsqry.least_pop.count,dnsqry.least_pop.domain,dnsqry.most_pop0.count,dnsqry.most_pop0.domain,dnsqry.most_pop0.time_diff.max,dnsqry.most_pop0.time_diff.mean,dnsqry.most_pop0.time_diff.median,...,dnsqry.most_pop2.time_diff.std,dnsqry.unique.count,dstip.count,dstip.most_popular,srcip,time_diff.max,time_diff.mean,time_diff.median,time_diff.min,time_diff.std
0,3,17,82,1,www.google.com,12,e1863.dspb.akamaiedge.net,826.509368,279.426812,283.408938,...,458.755275,23,1,128.112.129.32,10.6.48.10,226.720494,42.408970,27.007246,0.017376,50.710004
1,2,2,7,1,vortex-win.data.microsoft.com,5,settings-win.data.microsoft.com,964.020238,731.008915,964.006575,...,,3,1,128.112.128.1,10.6.48.100,964.020238,543.515157,650.527262,0.000284,475.421893
2,1,1,1,1,bogus-mname.Princeton.EDU,1,bogus-mname.Princeton.EDU,,,,...,,1,1,128.112.128.1,10.6.48.103,,,,,
3,1,1,1,1,bogus-mname.Princeton.EDU,1,bogus-mname.Princeton.EDU,,,,...,,1,1,128.112.128.1,10.6.48.104,,,,,
4,2,2,2,1,dns.msftncsi.com,1,bogus-mname.Princeton.EDU,,,,...,,2,1,128.112.128.1,10.6.48.105,1580.184178,1580.184178,1580.184178,1580.184178,
5,1,1,1,1,bogus-mname.Princeton.EDU,1,bogus-mname.Princeton.EDU,,,,...,,1,1,128.112.128.1,10.6.48.106,,,,,
6,1,1,1,1,bogus-mname.Princeton.EDU,1,bogus-mname.Princeton.EDU,,,,...,,1,1,128.112.128.1,10.6.48.107,,,,,
7,1,1,1,1,bogus-mname.Princeton.EDU,1,bogus-mname.Princeton.EDU,,,,...,,1,1,128.112.128.1,10.6.48.109,,,,,
8,1,1,1,1,bogus-mname.Princeton.EDU,1,bogus-mname.Princeton.EDU,,,,...,,1,1,128.112.128.1,10.6.48.11,,,,,
9,1,1,1,1,ldap2.Princeton.EDU,1,ldap2.Princeton.EDU,,,,...,,1,1,128.112.129.209,10.6.48.110,,,,,


In [30]:
df_features.to_csv( 'processed/features_key_srcip.csv', sep="|", header=True)

# FILTERING

In [35]:
df_filtered = df_features[(df_features['2LD.unique.count']<=10) & (df_features['dnsqry.count']>10)]

In [36]:
df_filtered.set_index('srcip')

Unnamed: 0_level_0,1LD.unique.count,2LD.unique.count,dnsqry.count,dnsqry.least_pop.count,dnsqry.least_pop.domain,dnsqry.most_pop0.count,dnsqry.most_pop0.domain,dnsqry.most_pop0.time_diff.max,dnsqry.most_pop0.time_diff.mean,dnsqry.most_pop0.time_diff.median,...,dnsqry.most_pop2.time_diff.min,dnsqry.most_pop2.time_diff.std,dnsqry.unique.count,dstip.count,dstip.most_popular,time_diff.max,time_diff.mean,time_diff.median,time_diff.min,time_diff.std
srcip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.6.48.112,2,3,13,1,go.microsoft.com,6,statsfe2.update.microsoft.com,541.406385,541.193684,541.114677,...,,,4,1,128.112.128.1,541.365839,240.545927,140.653147,2.837583e+01,199.201854
10.6.48.12,2,2,2329,2,sdp104w.Princeton.EDU,17,pdom06.pu.win.Princeton.EDU,300.463862,216.455659,241.480809,...,29.999331,92.900299,223,1,128.112.129.7,9.165232,1.544635,0.998108,2.861023e-06,1.738372
10.6.48.122,2,2,11,1,r20swj13mr.microsoft.com,7,statsfe2.update.microsoft.com,541.456863,451.062572,541.231790,...,,,5,1,128.112.128.1,541.270871,311.014199,357.851537,2.653757e+01,223.241558
10.6.48.13,3,8,106,1,bogus-mname.Princeton.EDU,28,clients1.google.com,180.060638,126.698233,120.024491,...,30.013770,167.972535,15,2,128.112.128.1,150.035113,34.008451,28.500631,3.357172e-03,34.954120
10.6.48.14,2,2,132,1,bogus-mname.Princeton.EDU,131,endpoint.ingress.rapid7.com,40.077696,27.467939,20.093167,...,,,2,1,128.112.128.1,40.077696,27.258260,20.093108,2.789777e+00,12.174265
10.6.48.147,1,1,318,1,bogus-mname.Princeton.EDU,5,apcats-87prospect-bdf.Princeton.EDU,905.728533,677.348888,901.673478,...,0.027087,454.427315,80,2,128.112.128.1,82.758642,11.113241,6.213902,2.009869e-04,14.023410
10.6.48.148,3,5,25,1,bogus-mname.Princeton.EDU,6,statsfe2.update.microsoft.com,601.741539,601.680134,601.663142,...,900.053469,0.013093,7,2,128.112.128.1,396.812208,133.667694,97.626576,2.448916e-02,130.840253
10.6.48.157,2,2,11,1,ieonlinews.microsoft.com,8,statsfe2.update.microsoft.com,540.955172,463.661216,540.916075,...,,,4,1,128.112.128.1,540.955172,324.562851,350.669952,3.056625e+00,219.162559
10.6.48.160,1,1,12,12,linux-update.oracle.com,12,linux-update.oracle.com,45.112617,19.120410,15.001348,...,,,1,2,128.112.129.7,45.112617,19.120410,15.001348,7.097721e-04,20.221434
10.6.48.161,2,2,13,1,ftp.ecsi.net,12,linux-update.oracle.com,45.088661,20.285702,15.040573,...,,,2,2,128.112.129.7,780.083216,83.602161,21.521682,1.168013e-03,219.726417


# TODO (meeting Apr 20)
- autocorr coefficient per domain per IP
- add feature: find best autocorr coeff domain and its period
- assume dormnet is outside campus only capture and wireless is inside campus recursive capture