In [16]:
from __future__ import division
import numpy as np
import os, sys
import matplotlib
#matplotlib.use('Agg')
%matplotlib nbagg
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
from collections import defaultdict, Counter


def getCDF(data):
    """Return the points of an empirical CDF for `data`.

    Returns a tuple (xdata, ydata): xdata is `data` sorted ascending by
    np.sort, and ydata is a plain list holding rank / n for every rank in
    0 .. n-1, so the heights start at 0 and end at (n - 1) / n.
    """
    ordered = np.sort(data)
    total = len(ordered)
    heights = []
    for rank in range(total):
        heights.append(rank / total)
    return ordered, heights

# STEPS
- load pcap -> tshark extract
- load tshark csv
- filter to A records only
- key by srcip
- apply feature functions
  - number of unique dst IPs
  - number of A queries
  - number of unique A queries
  - time diff between consecutive queries (all): avg, std, median (50%), min, max
  - most popular query 1,2,3
  - most popular query time diff 1,2,3: avg, std, median, min, max
  - least popular query
  - FREQUENCY BASED FEATURES (need to finish resampling code #todo)

In [2]:
# Input capture (under data/) and the CSV that tshark writes (under processed/).
infile = 'data/split_6hour_00000_20170206235900.pcap'
outfile = 'processed/split_6hour_00000_20170206235900.csv'

# Shell command line: keep only packets matching "dns.flags.response eq 0",
# print the selected fields separated by "|" and redirect them into outfile.
cmd_extract = (
    'tshark -r ' + infile
    + ' -E separator="|" -T fields'
    + ' -e frame.time_epoch -e frame.time_relative'
    + ' -e ip.src -e ip.dst'
    + ' -e dns.qry.name -e dns.qry.type'
    + ' -Y "dns.flags.response eq 0"'
    + ' > ' + outfile
)

# ONLY DO THIS FOR NEW PCAP FILES
# subprocess.check_output(cmd_extract, shell=True)

In [3]:
# Read the "|"-separated tshark export from processed/. Lines that cannot be
# parsed are skipped (error_bad_lines=False) and rows with any missing field
# are dropped.
# NOTE(review): this reads split_hour_... while the extraction cell writes
# split_6hour_... -- confirm which file is intended.
df = pd.read_csv(
    filepath_or_buffer="processed/split_hour_00000_20170206235900.csv",
    sep="|",
    names=['time', 'time_relative', 'srcip', 'dstip', 'dnsqry', 'dnstype'],
    error_bad_lines=False,
).dropna()

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Cast both timestamp columns to float.
for time_column in ('time', 'time_relative'):
    df[time_column] = df[time_column].astype(float)

### Stats by dns.qry.type
- convert qry type hex -> int
- A record == 1
- when a query lists 2 or more types (comma-separated), split on the comma and keep the first one

In [5]:
dnstype_count = df.groupby('dnstype')['dnsqry'].count()
dnstype_count.sort_values(inplace=True, ascending=False)

print "Percentage of A Records = ", dnstype_count.iloc[0]/dnstype_count.sum()*100

dnstype_count.head(15)

Percentage of A Records =  77.8158606734


dnstype
0x00000001               1505933
0x0000001c                196059
0x0000000c                153732
0x0000000f                 37187
0x00000021                 33905
0x00000010                  3572
0x00000006                  3331
0x0000002b                   500
0x000000ff                   319
0x000000f9                   185
0x00000001,0x0000001c        147
0x00000002                   110
0x0000b71f                    36
0x0000a21f                    14
0x0000a11f                    11
Name: dnsqry, dtype: int64

In [6]:
# convert dnstype to int (take only first type in case of 2)
df['dnstype'] = df['dnstype'].apply(lambda x: int( x.split(',')[0], 16))

dnstype_count = df.groupby('dnstype')['dnsqry'].count()
dnstype_count.sort_values(inplace=True, ascending=False)

print "New percentage of A Records = ", dnstype_count.iloc[0]/dnstype_count.sum()*100

New percentage of A Records =  77.8234565834


In [34]:
# Preview the first five rows of the query table.
df.head(n=5)

Unnamed: 0,time,time_relative,srcip,dstip,dnsqry,dnstype
0,1486444000.0,0.000567,10.9.151.75,10.8.0.5,safebrowsing.google.com,1
1,1486444000.0,0.000744,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1
2,1486444000.0,0.000853,140.180.223.155,128.112.129.209,51.255.61.103.sbl-xbl.spamhaus.dnsbl,1
3,1486444000.0,0.003419,10.9.38.65,10.8.0.5,tpc.googlesyndication.com,1
4,1486444000.0,0.003675,10.6.63.107,128.112.129.209,98.63.6.10.in-addr.arpa,12


### filter to A records only

In [75]:
# Keep only the rows whose query type is 1 (A record).
df_a = df.loc[df['dnstype'] == 1]

## group by srcip as KEY and extract features

In [76]:
# Work on every A-record row; for a quick experiment use df_a.head(5000).copy() instead.
df_sample = df_a

In [77]:
# One group of query rows per source IP.
grouped = df_sample.groupby(by='srcip')

In [78]:
# Build one feature row per srcip. Every feature is appended to its own list
# in `data`, so all lists end up with one entry per group.
def _append_time_diff_stats(data, prefix, times=None):
    """Append statistics of consecutive differences of `times` to `data`.

    data   -- dict of lists being filled, one list per feature column
    prefix -- column prefix; the keys written are prefix + '.mean', '.std',
              '.min', '.max' and '.median', in that order
    times  -- Series of timestamps, or None to record NaN for every statistic
    """
    diffs = None if times is None else times.diff()
    for stat in ('mean', 'std', 'min', 'max', 'median'):
        value = np.nan if diffs is None else getattr(diffs, stat)()
        data[prefix + '.' + stat].append(value)


data = defaultdict(list)

for srcip, group in grouped:
    # key
    data['srcip'].append(srcip)

    # DSTIP: number of unique destinations and the most common one
    dstip_counts = Counter(group['dstip'])
    data['dstip.count'].append(len(dstip_counts))
    data['dstip.most_popular'].append(dstip_counts.most_common(1)[0][0])

    # DNS QRY: total (non-null) lookups and number of distinct domains.
    # The Counter is built once and reused for the top-3 / least-popular
    # features below.
    qry_counts = Counter(group['dnsqry'])
    data['dnsqry.count'].append(group['dnsqry'].count())
    data['dnsqry.unique.count'].append(len(qry_counts))

    # TIME DIFF ALL: gaps between consecutive rows of this srcip.
    # NOTE(review): assumes rows are in capture-time order -- confirm.
    _append_time_diff_stats(data, 'time_diff', group['time'])

    # TOP 3 most popular domains and their time diffs; slots beyond the
    # number of distinct domains are filled with NaN so all columns keep
    # the same length.
    top3 = qry_counts.most_common(3)
    for num in range(3):
        column_name = 'dnsqry.most_pop' + str(num)

        if num < len(top3):
            domain, lookups = top3[num]
            times = group.loc[group['dnsqry'] == domain, 'time']
        else:
            domain, lookups, times = np.nan, np.nan, None

        data[column_name + '.domain'].append(domain)
        data[column_name + '.count'].append(lookups)
        _append_time_diff_stats(data, column_name + '.time_diff', times)

    # Least popular domain: last entry of the full most_common() ranking
    bottom = qry_counts.most_common()[-1]
    data['dnsqry.least_pop.domain'].append(bottom[0])    # domain
    data['dnsqry.least_pop.count'].append(bottom[1])     # occurrences

    # FREQUENCY BASED FEATURES
    #TODO

In [79]:
# check num entries before converting to dataframe
for k,v in data.items():
    print len(v), "\t", k

9375 	dstip.most_popular
9375 	dnsqry.most_pop0.time_diff.median
9375 	dnsqry.count
9375 	dnsqry.most_pop0.count
9375 	dnsqry.most_pop0.domain
9375 	time_diff.median
9375 	dnsqry.most_pop0.time_diff.max
9375 	dstip.count
9375 	dnsqry.least_pop.count
9375 	dnsqry.most_pop2.time_diff.min
9375 	dnsqry.most_pop1.time_diff.std
9375 	dnsqry.most_pop1.time_diff.median
9375 	dnsqry.most_pop2.time_diff.median
9375 	dnsqry.most_pop2.count
9375 	dnsqry.most_pop1.time_diff.min
9375 	time_diff.std
9375 	dnsqry.most_pop0.time_diff.mean
9375 	dnsqry.most_pop1.time_diff.mean
9375 	dnsqry.least_pop.domain
9375 	dnsqry.most_pop2.time_diff.std
9375 	dnsqry.most_pop1.time_diff.max
9375 	srcip
9375 	dnsqry.most_pop1.domain
9375 	dnsqry.unique.count
9375 	time_diff.max
9375 	dnsqry.most_pop1.count
9375 	dnsqry.most_pop2.time_diff.max
9375 	time_diff.mean
9375 	dnsqry.most_pop2.domain
9375 	time_diff.min
9375 	dnsqry.most_pop0.time_diff.min
9375 	dnsqry.most_pop2.time_diff.mean
9375 	dnsqry.most_pop0.time_di

In [80]:
# One row per srcip, one column per collected feature list.
df_features = pd.DataFrame(data=data)

In [81]:
# Preview only the first rows instead of rendering the whole feature table.
df_features.head(10)

Unnamed: 0,dnsqry.count,dnsqry.least_pop.count,dnsqry.least_pop.domain,dnsqry.most_pop0.count,dnsqry.most_pop0.domain,dnsqry.most_pop0.time_diff.max,dnsqry.most_pop0.time_diff.mean,dnsqry.most_pop0.time_diff.median,dnsqry.most_pop0.time_diff.min,dnsqry.most_pop0.time_diff.std,...,dnsqry.most_pop2.time_diff.std,dnsqry.unique.count,dstip.count,dstip.most_popular,srcip,time_diff.max,time_diff.mean,time_diff.median,time_diff.min,time_diff.std
0,88,1,www.google.com,12,e1863.dspb.akamaiedge.net,826.509368,279.426812,283.408938,2.607125e+01,226.417748,...,458.755275,27,1,128.112.129.32,10.6.48.10,226.720494,40.071250,21.833098,0.011699,49.146440
1,18,1,bogus-mname.Princeton.EDU,5,settings-win.data.microsoft.com,964.020238,731.008915,964.006575,3.200227e+01,466.004430,...,0.003664,8,1,128.112.128.1,10.6.48.100,864.917819,197.018033,60.301768,0.000284,250.537189
2,14,1,ssepo223w.princeton.edu,5,pdom06.pu.win.princeton.edu,1032.686643,675.018132,707.920267,2.515453e+02,357.411109,...,,8,1,128.112.128.1,10.6.48.103,941.199799,256.671835,14.409143,0.002368,359.942754
3,11,1,ssepo223w.princeton.edu,4,PDOM05.pu.win.princeton.edu,1378.745248,900.013809,900.015696,4.212805e+02,478.732383,...,,8,1,128.112.128.1,10.6.48.104,1377.868398,347.064279,71.666025,0.024225,479.866606
4,15,1,ssepo223w.princeton.edu,7,PDOM05.pu.win.princeton.edu,900.011345,450.006455,449.957294,1.008377e-01,324.419749,...,,8,1,128.112.128.1,10.6.48.105,900.011345,243.372014,14.431761,0.100838,323.225496
5,13,1,ssepo223w.princeton.edu,5,PDOM08.pu.win.princeton.edu,900.008569,527.011740,603.968997,1.003962e-01,448.658156,...,,9,1,128.112.128.1,10.6.48.106,900.008569,182.498424,14.416867,0.011361,337.212335
6,11,1,ssepo223w.princeton.edu,4,pdom06.pu.win.princeton.edu,900.009073,900.008557,900.008672,9.000079e+02,0.000583,...,,8,1,128.112.128.1,10.6.48.107,900.009073,341.498452,19.746308,1.132197,430.485603
7,14,1,ssepo223w.princeton.edu,6,PDOM07.pu.win.princeton.edu,900.009360,562.796289,900.008312,7.338810e-02,463.497015,...,,9,1,128.112.128.1,10.6.48.109,900.009360,263.900570,14.425064,0.073388,381.987052
8,9,1,ssepo223w.princeton.edu,5,PDOM07.pu.win.princeton.edu,1200.730113,629.953196,599.572026,1.199386e+02,453.270231,...,,5,1,128.112.128.1,10.6.48.11,1200.730113,438.678599,299.948846,7.212200,457.435714
9,1,1,ldap2.Princeton.EDU,1,ldap2.Princeton.EDU,,,,,,...,,1,1,128.112.129.209,10.6.48.110,,,,,


In [82]:
# Persist the per-srcip feature table as a "|"-separated CSV with a header row.
df_features.to_csv(
    path_or_buf='processed/features_key_srcip.csv',
    sep="|",
    header=True,
)