In [3]:
from __future__ import division
import numpy as np
import os, sys
import matplotlib
#matplotlib.use('Agg')
%matplotlib nbagg
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
from collections import defaultdict, Counter


def getCDF(data):
    """Return the sorted samples and their cumulative fractions.

    data -- any sequence np.sort accepts

    Returns (xdata, ydata): xdata is `data` sorted ascending as a numpy
    array; ydata is a plain list whose i-th entry is i / n, so it starts at
    0 and ends at (n - 1) / n.
    """
    xdata = np.sort(data)
    n = float(len(xdata))
    ydata = [rank / n for rank in range(len(xdata))]
    return xdata, ydata

# STEPS
- load pcap -> tshark extract
- load tshark csv
- filter to A records only
- key srcip
- apply feature functions
 - number of dst
 - number of A queries
 - number of unique A queries
 - time diff across all queries: mean, std, min, max, median (50%)
 - most popular query 1,2,3
 - most popular query diff 1,2,3: avg
 - least popular query
 - FREQUENCY BASED FEATURES (need to finish resampling code #todo)

In [88]:
# load from data/ and extract to processed/
# infile is the raw capture; outfile receives one "|"-separated line per packet
# that passes the display filter below
infile='data/split_6hour_00000_20170206235900.pcap'
outfile='processed/split_6hour_00000_20170206235900.csv'

# Shell command: tshark reads infile (-r), keeps packets matching the display
# filter "dns.flags.response eq 0" (-Y), prints the listed -e fields separated
# by "|", and the shell redirects that output into outfile.
# The -e field order must stay in sync with the `names=` list used when the
# CSV is loaded.
cmd_extract = 'tshark -r '+infile+' -E separator="|" -T fields -e frame.time_epoch \
-e frame.time_relative -e ip.src -e ip.dst -e dns.qry.name -e dns.qry.type -Y "dns.flags.response eq 0" \
> '+outfile

# ONLY DO THIS FOR NEW PCAP FILES
# (left commented out so that re-running the notebook does not redo the extraction)
# subprocess.check_output(cmd_extract, shell=True)

In [89]:
# load csv from processed
# Column names are supplied explicitly, in the order the fields were extracted.
# error_bad_lines=False skips malformed lines instead of raising; rows with any
# missing field are then removed by dropna().
# NOTE(review): this reads "split_hour_..." while the extraction cell writes
# "split_6hour_..." -- confirm which file is intended.
df = pd.read_csv("processed/split_hour_00000_20170206235900.csv", sep="|", error_bad_lines=False,
                names=['time', 'time_relative', 'srcip', 'dstip', 'dnsqry', 'dnstype'],
                #dtype={'time': pd.np.float, 'time_relative': pd.np.float64, 'dnstype': pd.np.int},
                ).dropna()

In [90]:
# Force both timestamp columns to float (no dtype is passed to read_csv), and
# lower-case the query names so the same domain in different letter case is
# treated as one name
df['time'] = df['time'].astype(float)
df['time_relative'] = df['time_relative'].astype(float)
df['dnsqry'] = df['dnsqry'].str.lower()

### Stats by dns.qry.type
- convert qry type hex -> int
- A record == 1
- when a query lists two or more qry types (comma-separated), keep only the first one after splitting

In [91]:
dnstype_count = df.groupby('dnstype')['dnsqry'].count()
dnstype_count.sort_values(inplace=True, ascending=False)

print "Percentage of A Records = ", dnstype_count.iloc[0]/dnstype_count.sum()*100

dnstype_count.head(15)

Percentage of A Records =  77.8158606734


dnstype
0x00000001               1505933
0x0000001c                196059
0x0000000c                153732
0x0000000f                 37187
0x00000021                 33905
0x00000010                  3572
0x00000006                  3331
0x0000002b                   500
0x000000ff                   319
0x000000f9                   185
0x00000001,0x0000001c        147
0x00000002                   110
0x0000b71f                    36
0x0000a21f                    14
0x0000a11f                    11
Name: dnsqry, dtype: int64

In [92]:
# convert dnstype to int (take only first type in case of 2)
df['dnstype'] = df['dnstype'].apply(lambda x: int( x.split(',')[0], 16))

dnstype_count = df.groupby('dnstype')['dnsqry'].count()
dnstype_count.sort_values(inplace=True, ascending=False)

print "New percentage of A Records = ", dnstype_count.iloc[0]/dnstype_count.sum()*100

New percentage of A Records =  77.8234565834


In [93]:
# Preview the first rows; dnstype should now show as an integer (1 == A record)
df.head()

Unnamed: 0,time,time_relative,srcip,dstip,dnsqry,dnstype
0,1486444000.0,0.000567,10.9.151.75,10.8.0.5,safebrowsing.google.com,1
1,1486444000.0,0.000744,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1
2,1486444000.0,0.000853,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1
3,1486444000.0,0.003419,10.9.38.65,10.8.0.5,tpc.googlesyndication.com,1
4,1486444000.0,0.003675,10.6.63.107,128.112.129.209,98.63.6.10.in-addr.arpa,12


### filter to A records only

In [94]:
# Keep only A-record queries (dnstype 1). The copy makes df_a an independent
# frame, so adding columns to it later does not touch df.
is_a_record = df['dnstype'] == 1
df_a = df.loc[is_a_record].copy()

## Add 1LD and 2LD as column

In [95]:
def reduce_domain(domain, level):
    """Return the last `level` dot-separated labels of `domain`.

    domain -- query name, e.g. 'www.youtube.com'
    level  -- number of trailing labels to keep (1 -> 'com', 2 -> 'youtube.com')

    If `level` exceeds the number of labels the whole name is returned.
    A non-positive `level` yields '' (no labels).
    """
    if level <= 0:
        # labels[-0:] is labels[0:], i.e. the WHOLE name, so zero must be
        # handled explicitly instead of being left to the slice
        return ''
    labels = domain.split('.')
    return '.'.join(labels[-level:])

reduce_domain('www.youtube.com', 2)

'youtube.com'

In [96]:
# Add one column per domain level: '1LD' holds the last label of the query
# name, '2LD' the last two labels
for column, level in (('1LD', 1), ('2LD', 2)):
    # level=level binds the current value into the lambda
    df_a[column] = df_a['dnsqry'].apply(lambda name, level=level: reduce_domain(name, level))

df_a.head()

Unnamed: 0,time,time_relative,srcip,dstip,dnsqry,dnstype,1LD,2LD
0,1486444000.0,0.000567,10.9.151.75,10.8.0.5,safebrowsing.google.com,1,com,google.com
1,1486444000.0,0.000744,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1,dnsbl,spamhaus.dnsbl
2,1486444000.0,0.000853,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1,dnsbl,spamhaus.dnsbl
3,1486444000.0,0.003419,10.9.38.65,10.8.0.5,tpc.googlesyndication.com,1,com,googlesyndication.com
6,1486444000.0,0.003945,10.9.38.65,10.8.0.5,www.google.com,1,com,google.com


# FILTER OUT princeton.edu

In [97]:
# Drop every query whose 2LD is princeton.edu
not_princeton = df_a['2LD'] != 'princeton.edu'
df_b = df_a[not_princeton]
#df_c = df_b[ df_b['srcip'].str.contains('140.180.128') ]
df_b

Unnamed: 0,time,time_relative,srcip,dstip,dnsqry,dnstype,1LD,2LD
0,1.486444e+09,0.000567,10.9.151.75,10.8.0.5,safebrowsing.google.com,1,com,google.com
1,1.486444e+09,0.000744,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1,dnsbl,spamhaus.dnsbl
2,1.486444e+09,0.000853,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1,dnsbl,spamhaus.dnsbl
3,1.486444e+09,0.003419,10.9.38.65,10.8.0.5,tpc.googlesyndication.com,1,com,googlesyndication.com
6,1.486444e+09,0.003945,10.9.38.65,10.8.0.5,www.google.com,1,com,google.com
7,1.486444e+09,0.004137,10.8.155.147,10.8.0.5,www.google-analytics.com,1,com,google-analytics.com
8,1.486444e+09,0.006005,10.9.38.65,10.8.0.5,s.adroll.com,1,com,adroll.com
11,1.486444e+09,0.013177,10.8.239.134,10.8.0.5,safebrowsing.google.com,1,com,google.com
12,1.486444e+09,0.013527,10.8.12.117,10.8.0.5,choices-vpc.truste.com,1,com,truste.com
13,1.486444e+09,0.024620,10.9.168.1,10.8.0.5,images-na.ssl-images-amazon.com,1,com,ssl-images-amazon.com


## group by srcip as KEY and extract features

In [98]:
# Working set for feature extraction: currently all of df_b. The commented-out
# suffix would instead take an independent copy of the first 5000 rows.
df_sample = df_b#.head(5000).copy()

In [99]:
# Group the working set by source IP; the feature loop emits one row per group
grouped = df_sample.groupby('srcip')

In [100]:
# Build one feature row per source IP.
# `data` maps column name -> list of values; every column must receive exactly
# one entry per group so that the lists stay aligned when turned into a
# DataFrame.
# (example of the agg() alternative, kept for reference:)
# grouped.agg({'C' : np.sum, 'D' : lambda x: np.std(x, ddof=1)})
data = defaultdict(list)

# Statistics recorded for every series of consecutive time differences, in the
# order their columns are created
_TIME_DIFF_STATS = ('mean', 'std', 'min', 'max', 'median')


def _append_time_diff_stats(columns, prefix, td=None):
    """Append one value per statistic in _TIME_DIFF_STATS to `columns`.

    columns -- dict of column name -> list that is being filled in
    prefix  -- column name prefix; '.<statistic>' is appended to it
    td      -- pandas Series of consecutive time differences, or None to
               record NaN for every statistic (placeholder row entry)
    """
    for stat in _TIME_DIFF_STATS:
        value = np.nan if td is None else getattr(td, stat)()
        columns[prefix + '.' + stat].append(value)


for srcip, group in grouped:
    # key
    data['srcip'].append(srcip)

    # DSTIP
    # number of unique dstip
    data['dstip.count'].append(len(set(group['dstip'])))

    # most common dstip
    data['dstip.most_popular'].append(Counter(group['dstip']).most_common(1)[0][0])

    # DNS QRY
    # count of total domains
    data['dnsqry.count'].append(group['dnsqry'].count())

    # count of unique domains
    data['dnsqry.unique.count'].append(len(set(group['dnsqry'])))

    # count of unique 1LDs
    data['1LD.unique.count'].append(len(set(group['1LD'])))
    # count of unique 2LDs
    data['2LD.unique.count'].append(len(set(group['2LD'])))

    # TIME DIFF ALL - DESCRIBE
    # gaps between consecutive queries of this source, in row order
    _append_time_diff_stats(data, 'time_diff', group['time'].diff())

    # Tally the query names once; the tally serves both the top-3 and the
    # least-popular features below
    domain_counts = Counter(group['dnsqry'])

    # TOP 3 most popular domains and their time diffs
    top3 = domain_counts.most_common(3)
    for num in range(3):
        column_name = 'dnsqry.most_pop' + str(num)

        if len(top3) > num:
            domain, occurrence = top3[num]   # (domain, number of lookups)
            data[column_name + '.domain'].append(domain)
            data[column_name + '.count'].append(occurrence)
            # time diff between consecutive lookups of this one domain
            td = group[group['dnsqry'] == domain]['time'].diff()
            _append_time_diff_stats(data, column_name + '.time_diff', td)
        else:
            # fewer than num+1 distinct domains: pad with NaN so every column
            # still gets one entry for this source
            data[column_name + '.domain'].append(np.nan)
            data[column_name + '.count'].append(np.nan)
            _append_time_diff_stats(data, column_name + '.time_diff')

    # Least popular domain (last entry of the full frequency ranking)
    bottom = domain_counts.most_common()[-1]
    data['dnsqry.least_pop.domain'].append(bottom[0])    # domain
    data['dnsqry.least_pop.count'].append(bottom[1])     # occurrences

    # FREQUENCY BASED FEATURES
    #TODO

In [2]:
# check num entries before converting to dataframe
for k,v in data.items():
    print len(v), "\t", k

NameError: name 'data' is not defined

In [102]:
# Turn the column-name -> list mapping into a table: one row per srcip, one
# column per feature
df_features = pd.DataFrame(data)

In [103]:
# Display the feature table for inspection
df_features

Unnamed: 0,1LD.unique.count,2LD.unique.count,dnsqry.count,dnsqry.least_pop.count,dnsqry.least_pop.domain,dnsqry.most_pop0.count,dnsqry.most_pop0.domain,dnsqry.most_pop0.time_diff.max,dnsqry.most_pop0.time_diff.mean,dnsqry.most_pop0.time_diff.median,...,dnsqry.most_pop2.time_diff.std,dnsqry.unique.count,dstip.count,dstip.most_popular,srcip,time_diff.max,time_diff.mean,time_diff.median,time_diff.min,time_diff.std
0,3,17,82,1,www.google.com,12,e1863.dspb.akamaiedge.net,826.509368,279.426812,283.408938,...,458.755275,23,1,128.112.129.32,10.6.48.10,226.720494,42.408970,27.007246,0.017376,50.710004
1,1,1,6,1,vortex-win.data.microsoft.com,5,settings-win.data.microsoft.com,964.020238,731.008915,964.006575,...,,2,1,128.112.128.1,10.6.48.100,964.020238,584.807132,963.999243,0.000284,519.369585
2,1,1,1,1,dns.msftncsi.com,1,dns.msftncsi.com,,,,...,,1,1,128.112.128.1,10.6.48.105,,,,,
3,1,2,12,1,go.microsoft.com,6,statsfe2.update.microsoft.com,541.406385,541.193684,541.114677,...,,3,1,128.112.128.1,10.6.48.112,541.365839,262.413738,180.582701,76.617629,193.950951
4,1,2,2,1,ctldl.windowsupdate.com,1,crl.microsoft.com,,,,...,,2,1,128.112.128.1,10.6.48.116,0.009410,0.009410,0.009410,0.009410,
5,1,1,10,1,r20swj13mr.microsoft.com,7,statsfe2.update.microsoft.com,541.456863,451.062572,541.231790,...,,4,1,128.112.128.1,10.6.48.122,541.270871,345.571333,499.353389,26.537574,237.646267
6,2,7,105,1,go.microsoft.com,28,clients1.google.com,180.060638,126.698233,120.024491,...,167.972535,14,2,128.112.128.1,10.6.48.13,150.035113,34.335456,29.255084,0.003357,35.207985
7,1,2,3,1,crl.microsoft.com,2,tools.google.com,1017.267074,1017.267074,1017.267074,...,,2,1,128.112.128.1,10.6.48.131,907.008197,508.633537,508.633537,110.258877,563.386847
8,1,1,8,8,statsfe2.update.microsoft.com,8,statsfe2.update.microsoft.com,540.972311,463.656291,540.915727,...,,1,1,128.112.128.1,10.6.48.138,540.972311,463.656291,540.915727,41.827323,186.662131
9,1,1,131,131,endpoint.ingress.rapid7.com,131,endpoint.ingress.rapid7.com,40.077696,27.467939,20.093167,...,,1,1,128.112.128.1,10.6.48.14,40.077696,27.467939,20.093167,10.030509,12.012770


In [104]:
# Persist the per-srcip feature table as a "|"-separated file with a header row
df_features.to_csv( 'processed/features_key_srcip.csv', sep="|", header=True)

# FILTERING

In [105]:
# Keep sources that query at most 10 distinct 2LDs but issue more than 10
# queries in total
few_2lds = df_features['2LD.unique.count'] <= 10
many_queries = df_features['dnsqry.count'] > 10
df_filtered = df_features[few_2lds & many_queries]

In [106]:
# Display only: set_index returns a new frame keyed by srcip and the result is
# not assigned, so df_filtered itself is left unchanged
df_filtered.set_index('srcip')

Unnamed: 0_level_0,1LD.unique.count,2LD.unique.count,dnsqry.count,dnsqry.least_pop.count,dnsqry.least_pop.domain,dnsqry.most_pop0.count,dnsqry.most_pop0.domain,dnsqry.most_pop0.time_diff.max,dnsqry.most_pop0.time_diff.mean,dnsqry.most_pop0.time_diff.median,...,dnsqry.most_pop2.time_diff.min,dnsqry.most_pop2.time_diff.std,dnsqry.unique.count,dstip.count,dstip.most_popular,time_diff.max,time_diff.mean,time_diff.median,time_diff.min,time_diff.std
srcip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.6.48.112,1,2,12,1,go.microsoft.com,6,statsfe2.update.microsoft.com,541.406385,541.193684,541.114677,...,,,3,1,128.112.128.1,541.365839,262.413738,180.582701,7.661763e+01,193.950951
10.6.48.13,2,7,105,1,go.microsoft.com,28,clients1.google.com,180.060638,126.698233,120.024491,...,30.013770,167.972535,14,2,128.112.128.1,150.035113,34.335456,29.255084,3.357172e-03,35.207985
10.6.48.14,1,1,131,131,endpoint.ingress.rapid7.com,131,endpoint.ingress.rapid7.com,40.077696,27.467939,20.093167,...,,,1,1,128.112.128.1,40.077696,27.467939,20.093167,1.003051e+01,12.012770
10.6.48.148,2,4,24,3,dns.msftncsi.com,6,statsfe2.update.microsoft.com,601.741539,601.680134,601.663142,...,900.053469,0.013093,6,2,128.112.128.1,396.812208,139.479332,131.192425,2.448916e-02,131.158580
10.6.48.160,1,1,12,12,linux-update.oracle.com,12,linux-update.oracle.com,45.112617,19.120410,15.001348,...,,,1,2,128.112.129.7,45.112617,19.120410,15.001348,7.097721e-04,20.221434
10.6.48.161,2,2,13,1,ftp.ecsi.net,12,linux-update.oracle.com,45.088661,20.285702,15.040573,...,,,2,2,128.112.129.7,780.083216,83.602161,21.521682,1.168013e-03,219.726417
10.6.48.164,1,2,28,1,settings-win.data.microsoft.com,25,imap.gmail.com,203.389838,145.602746,162.410408,...,,,3,2,128.112.128.1,203.389838,129.424663,132.635867,2.506089e-02,53.526039
10.6.48.172,1,1,29,5,settings-win.data.microsoft.com,24,vortex-win.data.microsoft.com,264.683658,145.135925,132.005385,...,,,2,1,128.112.128.1,264.683658,122.051818,132.005055,2.366830e+01,55.021026
10.6.48.176,1,2,12,3,settings-win.data.microsoft.com,5,web1.compellent.com,601.393626,601.324487,601.308841,...,32.002036,1268.790304,3,1,128.112.128.1,601.286641,245.188991,237.424542,3.200204e+01,146.141117
10.6.48.179,1,2,12,1,sls.update.microsoft.com,7,statsfe2.update.microsoft.com,556.636144,453.438827,540.977577,...,,,3,2,128.112.128.1,540.982407,247.330269,190.363977,2.532530e-02,241.940925


## Correlation problem

In [4]:
# Toy 0/1 activity series used to compare autocorrelation shapes
# mixed periodic + random activity
X = [0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0]
# purely periodic
Y = [0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
# random activity
Z = [0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0]

# Full autocorrelation of X: 2*len(X)-1 lags, with the zero-lag term in the
# middle of the array
c = np.correlate(X, X, mode='full')
# Keep the zero and positive lags only. Floor division is required here:
# with `from __future__ import division` (and on Python 3) len(c)/2 is a
# float, and numpy does not accept a float as a slice index.
corr_arr = c[len(c)//2:]

# single-argument print(...) behaves the same as the print statement on Python 2
print(corr_arr)

[6 0 3 0 0 0 0 0 0 0 0 0 0 0 2 0 4 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 2 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0]




In [5]:
from numpy.fft import fft, ifft

def periodic_corr(x, y):
    """Circular (periodic) correlation of two sequences via the FFT.

    x and y must be real sequences with the same length.

    The transform of x is multiplied by the complex conjugate of the
    transform of y, the product is transformed back, and only the real part
    of the result is returned.
    """
    spectrum_x = fft(x)
    spectrum_y_conj = fft(y).conj()
    cross_spectrum = spectrum_x * spectrum_y_conj
    return ifft(cross_spectrum).real

In [8]:
# Circular autocorrelation of Z. Entries on the order of 1e-15 in the output
# are floating-point round-off from the FFT, i.e. effectively zero.
periodic_corr(Z, Z)

array([  6.00000000e+00,  -2.32013955e-15,   4.64027909e-15,
         3.62521804e-16,  -1.23257413e-15,   2.00000000e+00,
        -1.59509594e-15,   1.00000000e+00,   1.00000000e+00,
         1.00000000e+00,   1.00000000e+00,  -6.50559285e-17,
        -1.52161932e-16,   1.00000000e+00,   1.00000000e+00,
         1.00000000e+00,   1.00000000e+00,  -4.89404435e-16,
         1.00000000e+00,   1.00000000e+00,   5.80034886e-16,
         1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
         1.59509594e-15,  -1.35207915e-15,   1.00000000e+00,
         1.00000000e+00,   1.00000000e+00,   6.52539247e-16,
         1.00000000e+00,   1.00000000e+00,  -3.88025507e-16,
         1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
         1.00000000e+00,  -1.16006977e-15,   6.34413157e-16,
         1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
         1.00000000e+00,  -2.90017443e-16,   2.00000000e+00,
        -1.16006977e-15,   1.08011698e-15,   3.34235381e-15,
        -7.97547969e-16]

In [108]:
# Autocorrelation of Y, zero and positive lags only
c = np.correlate(Y, Y, mode='full')
# Floor division: with `from __future__ import division` len(c)/2 is a float,
# and numpy does not accept a float as a slice index
corr_arr = c[len(c)//2:]

# single-argument print(...) behaves the same as the print statement on Python 2
print(corr_arr)

[3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0]


  from ipykernel import kernelapp as app


In [109]:
# Autocorrelation of Z, zero and positive lags only
c = np.correlate(Z, Z, mode='full')
# Floor division: with `from __future__ import division` len(c)/2 is a float,
# and numpy does not accept a float as a slice index
corr_arr = c[len(c)//2:]

# single-argument print(...) behaves the same as the print statement on Python 2
print(corr_arr)

[6 0 0 0 0 2 0 1 1 1 1 0 0 1 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0]


  from ipykernel import kernelapp as app


In [85]:
# Query timestamps and names of a single source IP
d = df_b[df_b['srcip']=='10.6.48.13'][['time', 'dnsqry']]

# Epoch seconds -> pandas timestamps
# NOTE(review): d is derived from a slice of df_b; adding a column to it may
# raise SettingWithCopyWarning -- consider taking an explicit .copy() above.
d['pd_time'] = pd.to_datetime(d['time'],unit='s')

# Index by timestamp so the frame can be resampled
d = d.set_index('pd_time')

# Bin into 1-second buckets and put 0 into the empty ones.
# NOTE(review): in the output shown, the first bucket holds the raw epoch
# `time` value and the rest are 0, so this is not a per-second query count.
# A count per bucket is presumably what the autocorrelation needs -- confirm.
d.resample('s').fillna(0)

Unnamed: 0_level_0,time
pd_time,Unnamed: 1_level_1
2017-02-07 04:59:08,1.486444e+09
2017-02-07 04:59:09,0.000000e+00
2017-02-07 04:59:10,0.000000e+00
2017-02-07 04:59:11,0.000000e+00
2017-02-07 04:59:12,0.000000e+00
2017-02-07 04:59:13,0.000000e+00
2017-02-07 04:59:14,0.000000e+00
2017-02-07 04:59:15,0.000000e+00
2017-02-07 04:59:16,0.000000e+00
2017-02-07 04:59:17,0.000000e+00


# TODO
- add hostdb fields by joining
- autocorr coefficient per domain per IP
- add feature: find best autocorr coeff domain and its period
- assume dormnet is outside campus only capture and wireless is inside campus recursive capture