# Feature Extraction per batch of T sec

In [3]:
import pandas as pd
import csv

from analyzer_clean import Flow, _FLOW_FIELDS

## Analyze each 10s batch
- every 10 s a batch of flows is read
- convert each batch into pandas df for feature extraction

### Definitions:
- entry is each line in csv (packet from netflow)
- conns is unique (src_ip, dst_ip)
- flow is unique flow tuple (src_ip, src_port, dst_ip, dst_port, ip_protocol)
- src_port is randomly assigned
- change in distribution of dst_port may indicate port scanning
- change in distribution of src_ip or dst_ip may indicate DoS

In [4]:
DATA = []
FLOW_LIST = []
T = 10

with open("data.csv", "r") as csvfile:
    # pass input data stream as open("data.csv", "r") to csv.reader for testing
    # read and process line by line don't read into list
    fin = csv.reader(csvfile)
    for e in fin:
        flow = Flow.from_csv(e)
        FLOW_LIST.append(flow)
        if int(flow.ts.strftime('%s')) % T == 0:
            DATA.append(FLOW_LIST)
            FLOW_LIST = []

## Dataframe of features extracted
- volumetric features per batch
    - number of entries (packets)
    - number of bytes (up/dw)
    - number of ports and ips
    - num packets per state (['closed', 'established', 'reset', 'new', 'closing', 'connecting'])
    
- distribution features:
    - count, mean, median, quartiles 25 and 75, 90 percentile
    - group by (src_ip, dst_ip, dst_port, conn, flow)

In [26]:
from collections import defaultdict

def describe_distribution(num_param1_dist_by_param2):
    description = {}
    description["max"] = num_param1_dist_by_param2.max()
    description["mean"] = num_param1_dist_by_param2.mean()
    description["perc25"] = num_param1_dist_by_param2.quantile(.25)
    description["perc50"] = num_param1_dist_by_param2.median()
    description["perc75"] = num_param1_dist_by_param2.quantile(.75)
    description["perc90"] = num_param1_dist_by_param2.quantile(.9)
    return description

In [27]:
def get_volumetric_features(df, stats):
    stats['num_entries'] = len(df)
    
    # num => len(); count => unique.len()
    stats['count_dst_port'] = len( df['dst_port'].unique() )
    stats['count_src_port'] = len( df['src_port'].unique() )
    stats['count_dst_ip'] = len( df['dst_ip'].unique() )
    stats['count_src_ip'] = len( df['src_ip'].unique() )
    
    # number of packets (entries) in each state
    df_state_count = df.groupby('state')['dst_tx'].count()
    for state in ['closed', 'established', 'reset', 'new', 'closing', 'connecting']:
        if state in df_state_count:
            stats['num_state_'+state] = df_state_count[state]
    
    return stats

In [28]:
def get_distribution_features(df, stats, key):
    if key=='conn':
        pass
    elif key=='flow':
        pass
    elif key=='dst_ip':
        pass
    elif key=='src_ip':
        pass
    elif key=='dst_port':
        pass
    else:
        print("key??", key)
        
    return stats

In [31]:
# this computation is performed in batches every 10s
all_stats = defaultdict(list)
flow_batch = 0  ##counter for flow batch in DATA

for flow_list in DATA:
    df = pd.DataFrame(flow_list)
    stats = {}
    stats['flow_batch'] = flow_batch
    flow_batch += 1
    # VOLUMETRIC FEATURES ##stats = get_volumetric_features(df, stats)
    # (1) num entries
    stats['num_entries'] = len(df)
    # num => len(); count => unique.len()
    stats['count_dst_port'] = len( df['dst_port'].unique() )
    stats['count_src_port'] = len( df['src_port'].unique() )
    stats['count_dst_ip'] = len( df['dst_ip'].unique() )
    stats['count_src_ip'] = len( df['src_ip'].unique() )
    # number of packets (entries) in each state
    df_state_count = df.groupby('state')['dst_tx'].count()
    for state in ['closed', 'established', 'reset', 'new', 'closing', 'connecting']:
        if state in df_state_count:
            stats['num_state_'+state] = df_state_count[state]
        else:
            stats['num_state_'+state] = 0
    
    # DISTRIBUTION FEATURES
    # (2) by conn => unique (dst_ip, src_ip)
    ##stats = get_distribution_features(df, stats, 'conn')
    gp1 = df.groupby(['dst_ip', 'src_ip'])
    stats['count_conns'] = len(gp1)
    # dst port per conn
    _dist = gp1['dst_port'].count()
    for k,v in describe_distribution(_dist).items():
        stats['dst_port_per_conn_'+k] = v
        
    # (3) by flow => (dst_ip, dst_port, src_ip, src_port, protocol)
    ##stats = get_distribution_features(df, stats, 'flow')
    gp2 = df.groupby(['dst_ip', 'dst_port', 'src_ip', 'src_port', 'ip_protocol'])
    stats['count_flows'] = len(gp2)
    # mean flows per conn
    stats['num_flows_per_conn_mean'] = stats['count_flows']/stats['count_conns']    
    ## entries per flow
    #num_conns_per_flow_dist = gp2['bytes_up'].count()
    #num_flow_per_conn_dist = gp[['src_port', 'dst_port', 'ip_protocol']].count()
    # ...
    
    # (4) by dst_ip
    gp3 = df.groupby(['dst_ip'])
    # dst_port by dst_ip
    _dist = gp3['dst_port'].count()
    for k,v in describe_distribution(_dist).items():
        stats['dst_port_per_dst_ip_'+k] = v
    # src_ip by dst_ip
    _dist = gp3['src_ip'].count()
    for k,v in describe_distribution(_dist).items():
        stats['src_ip_per_dst_ip_'+k] = v
        
    # (5) by src_ip
    gp4 = df.groupby(['src_ip'])
    # dst_port by src_ip
    _dist = gp4['dst_port'].count()
    for k,v in describe_distribution(_dist).items():
        stats['dst_port_per_src_ip_'+k] = v
    # dst_ip by src_ip
    _dist = gp4['dst_ip'].count()
    for k,v in describe_distribution(_dist).items():
        stats['dst_ip_per_src_ip_'+k] = v
    
    # (6) by dst_port
    gp5 = df.groupby(['dst_port'])
    # dst_ip by dst_port
    _dist = gp5['dst_ip'].count()
    for k,v in describe_distribution(_dist).items():
        stats['dst_ip_per_dst_port_'+k] = v
    # src_ip by dst_port
    _dist = gp5['src_ip'].count()
    for k,v in describe_distribution(_dist).items():
        stats['src_ip_per_dst_port_'+k] = v
        
    # put in all_stats
    for k,v in stats.items():
        all_stats[k].append(v)

In [33]:
# check dataframe length
for k,v in all_stats.items():
    print( k, len(v) )

print(flow_batch, len(DATA))

flow_batch 12904
num_entries 12904
count_dst_port 12904
count_src_port 12904
count_dst_ip 12904
count_src_ip 12904
num_state_closed 12904
num_state_established 12904
num_state_reset 12904
num_state_new 12904
num_state_closing 12904
num_state_connecting 12904
count_conns 12904
dst_port_per_conn_max 12904
dst_port_per_conn_mean 12904
dst_port_per_conn_perc25 12904
dst_port_per_conn_perc50 12904
dst_port_per_conn_perc75 12904
dst_port_per_conn_perc90 12904
count_flows 12904
num_flows_per_conn_mean 12904
dst_port_per_dst_ip_max 12904
dst_port_per_dst_ip_mean 12904
dst_port_per_dst_ip_perc25 12904
dst_port_per_dst_ip_perc50 12904
dst_port_per_dst_ip_perc75 12904
dst_port_per_dst_ip_perc90 12904
src_ip_per_dst_ip_max 12904
src_ip_per_dst_ip_mean 12904
src_ip_per_dst_ip_perc25 12904
src_ip_per_dst_ip_perc50 12904
src_ip_per_dst_ip_perc75 12904
src_ip_per_dst_ip_perc90 12904
dst_port_per_src_ip_max 12904
dst_port_per_src_ip_mean 12904
dst_port_per_src_ip_perc25 12904
dst_port_per_src_ip_perc50

### Plot stat vs flow_batch (time)

In [34]:
df_stats = pd.DataFrame( all_stats )

In [37]:
#df_stats.to_pickle("df_features_per_batch.pkl")

Unnamed: 0,flow_batch,num_entries,count_dst_port,count_src_port,count_dst_ip,count_src_ip,num_state_closed,num_state_established,num_state_reset,num_state_new,...,dst_ip_per_dst_port_perc25,dst_ip_per_dst_port_perc50,dst_ip_per_dst_port_perc75,dst_ip_per_dst_port_perc90,src_ip_per_dst_port_max,src_ip_per_dst_port_mean,src_ip_per_dst_port_perc25,src_ip_per_dst_port_perc50,src_ip_per_dst_port_perc75,src_ip_per_dst_port_perc90
0,0,484,41,458,112,71,19,429,13,22,...,1.0,1.0,10.0,34.0,116,11.804878,1.0,1.0,10.0,34.0
1,1,1,1,1,1,1,0,1,0,0,...,1.0,1.0,1.0,1.0,1,1.000000,1.0,1.0,1.0,1.0
2,2,1,1,1,1,1,0,1,0,0,...,1.0,1.0,1.0,1.0,1,1.000000,1.0,1.0,1.0,1.0
3,3,1,1,1,1,1,0,1,0,0,...,1.0,1.0,1.0,1.0,1,1.000000,1.0,1.0,1.0,1.0
4,4,1,1,1,1,1,0,1,0,0,...,1.0,1.0,1.0,1.0,1,1.000000,1.0,1.0,1.0,1.0
5,5,1,1,1,1,1,0,1,0,0,...,1.0,1.0,1.0,1.0,1,1.000000,1.0,1.0,1.0,1.0
6,6,1,1,1,1,1,0,1,0,0,...,1.0,1.0,1.0,1.0,1,1.000000,1.0,1.0,1.0,1.0
7,7,1,1,1,1,1,0,1,0,0,...,1.0,1.0,1.0,1.0,1,1.000000,1.0,1.0,1.0,1.0
8,8,1,1,1,1,1,0,1,0,0,...,1.0,1.0,1.0,1.0,1,1.000000,1.0,1.0,1.0,1.0
9,9,1,1,1,1,1,0,1,0,0,...,1.0,1.0,1.0,1.0,1,1.000000,1.0,1.0,1.0,1.0


### Flow analysis
- Are there more than 1 packets for the same flow? Distribution?
- Number of flows that are closed (request and response)
- States per flow distribution
- distribution of volumes

In [5]:
# Number of entries per flow > 1
for flow_list in DATA:
    df = pd.DataFrame(flow_list)
    gp1 = df.groupby(['dst_ip','src_ip'])
    gp2 = df.groupby(['dst_ip', 'dst_port', 'src_ip', 'src_port', 'ip_protocol'])
    # conns per flow
    temp = gp2['src_tx'].count()
    temp2 = temp[temp>1]
    if len(temp2)>0:
        print(temp2)

dst_ip           dst_port  src_ip           src_port  ip_protocol
10.12.0.31       80        10.2.1.20        49718     tcp            2
10.12.87.1       8125      10.2.20.4        36659     udp            2
                                            47465     udp            2
                                            58772     udp            2
52.85.77.134     443       192.168.100.96   34140     tcp            2
74.201.65.31     443       10.2.1.39        60854     tcp            2
                                            60855     tcp            2
                                            60857     tcp            2
                                            60858     tcp            2
91.189.89.22     443       192.168.100.237  33815     tcp            2
192.168.100.250  53        192.168.100.11   39145     udp            2
199.91.71.85     443       192.168.100.29   40388     tcp            2
199.91.71.89     443       10.2.38.2        41229     tcp            2
Name: src_t

dst_ip          dst_port  src_ip          src_port  ip_protocol
192.168.100.96  80        192.168.100.96  45394     tcp            2
                                          45396     tcp            2
                                          45398     tcp            2
                                          45400     tcp            2
Name: src_tx, dtype: int64
dst_ip          dst_port  src_ip          src_port  ip_protocol
192.168.100.96  80        192.168.100.96  45402     tcp            2
                                          45404     tcp            2
Name: src_tx, dtype: int64
dst_ip          dst_port  src_ip          src_port  ip_protocol
192.168.100.96  80        192.168.100.96  45406     tcp            2
                                          45408     tcp            2
Name: src_tx, dtype: int64
dst_ip          dst_port  src_ip          src_port  ip_protocol
192.168.100.96  80        192.168.100.96  45410     tcp            2
                                          

dst_ip           dst_port  src_ip          src_port  ip_protocol
192.168.100.190  80        192.168.100.96  136       tcp            4
Name: src_tx, dtype: int64
dst_ip           dst_port  src_ip           src_port  ip_protocol
10.2.33.4        443       192.168.111.46   56244     tcp            2
10.12.87.1       2013      192.168.100.243  53301     tcp            2
74.201.65.31     443       10.2.1.39        37425     tcp            2
192.168.100.250  53        192.168.100.11   14214     udp            2
199.91.71.85     443       10.2.38.2        49753     tcp            2
239.255.255.250  1900      192.168.100.10   58345     udp            2
Name: src_tx, dtype: int64
dst_ip          dst_port  src_ip          src_port  ip_protocol
192.168.100.42  53        192.168.100.96  52240     tcp            4
                                          52243     tcp            4
Name: src_tx, dtype: int64
dst_ip          dst_port  src_ip          src_port  ip_protocol
192.168.100.42  80        

dst_ip        dst_port  src_ip           src_port  ip_protocol
10.2.17.1     443       192.168.111.226  51324     tcp            2
10.12.0.22    443       192.168.111.202  55884     tcp            2
10.12.0.31    80        10.2.20.5        51066     tcp            2
                                         51112     tcp            2
10.12.87.1    443       192.168.111.178  35066     tcp            2
                                         35086     tcp            2
                                         35088     tcp            2
                                         35090     tcp            2
                                         35092     tcp            2
                                         35094     tcp            2
74.201.65.31  443       10.2.1.39        39810     tcp            2
199.91.71.89  443       10.2.38.2        45531     tcp            2
216.58.216.4  443       192.168.100.138  39214     tcp            2
Name: src_tx, dtype: int64
dst_ip           dst_port 

dst_ip          dst_port  src_ip          src_port  ip_protocol
192.168.100.16  80        192.168.100.96  234       tcp            3
Name: src_tx, dtype: int64
dst_ip           dst_port  src_ip           src_port  ip_protocol
10.12.0.31       80        10.2.1.20        50275     tcp            2
                           10.2.20.3        60196     tcp            2
                                            60199     tcp            2
74.125.28.189    443       192.168.100.131  54225     tcp            2
74.201.65.30     443       10.2.1.39        37843     tcp            2
192.168.100.250  53        192.168.100.11   41358     udp            2
216.58.216.5     443       192.168.100.116  40054     tcp            2
239.255.255.250  1900      192.168.100.131  53806     udp            2
Name: src_tx, dtype: int64
dst_ip      dst_port  src_ip     src_port  ip_protocol
10.12.0.31  80        10.2.20.4  60474     tcp            2
10.12.68.7  6801      10.2.33.4  36928     tcp            2
Name

dst_ip          dst_port  src_ip           src_port  ip_protocol
10.12.87.1      443       192.168.111.178  35518     tcp            2
                                           35520     tcp            2
                                           35522     tcp            2
                                           35530     tcp            2
                                           35532     tcp            2
17.253.27.204   80        192.168.111.158  50128     tcp            2
216.58.217.206  443       192.168.100.181  57235     udp            2
Name: src_tx, dtype: int64
dst_ip           dst_port  src_ip           src_port  ip_protocol
10.12.21.2       6800      10.2.20.8        54316     tcp            2
10.12.87.1       443       192.168.100.40   50316     tcp            2
                                            50328     tcp            2
74.201.65.31     443       10.2.1.39        41488     tcp            2
                                            41489     tcp           

dst_ip           dst_port  src_ip           src_port  ip_protocol
10.12.0.31       80        10.2.20.1        60258     tcp            2
                           10.2.20.6        39819     tcp            2
                                            39823     tcp            2
                           192.168.100.242  60273     tcp            2
10.12.21.2       6800      10.2.20.8        54364     tcp            2
                 6801      10.2.20.8        40490     tcp            2
10.12.21.3       6800      10.2.20.8        53059     tcp            2
                 6803      10.2.20.8        33219     tcp            2
10.12.68.6       6800      10.2.20.8        55444     tcp            2
                                            55550     tcp            2
10.12.68.7       6800      10.2.20.8        60574     tcp            2
10.12.231.2      6801      10.2.20.8        37671     tcp            2
                                            37774     tcp            2
           

dst_ip           dst_port  src_ip           src_port  ip_protocol
10.12.0.31       80        10.2.1.20        50003     tcp            2
                           10.2.20.1        60904     tcp            2
                           10.2.20.3        60854     tcp            2
74.201.65.31     443       10.2.1.39        43056     tcp            2
104.192.136.114  443       192.168.100.16   51075     tcp            2
199.91.71.85     443       10.2.38.2        50370     tcp            2
216.58.216.14    443       192.168.100.176  53054     tcp            2
Name: src_tx, dtype: int64
dst_ip          dst_port  src_ip           src_port  ip_protocol
10.12.0.31      80        10.2.20.6        40482     tcp            2
104.196.229.58  443       192.168.111.158  51587     tcp            2
Name: src_tx, dtype: int64
dst_ip           dst_port  src_ip           src_port  ip_protocol
10.12.0.21       5672      10.2.254.49      48318     tcp            2
10.12.0.31       80        10.2.1.20     

dst_ip           dst_port  src_ip           src_port  ip_protocol
10.2.17.1        443       192.168.111.226  52318     tcp            2
10.12.21.2       6801      10.2.20.8        40364     tcp            2
10.12.68.6       6800      10.2.20.8        55578     tcp            2
                 6801      10.2.20.8        36452     tcp            2
10.12.68.7       6801      10.2.20.8        38976     tcp            2
10.12.231.2      6803      10.2.20.8        45402     tcp            2
10.12.231.3      6801      10.2.20.8        53393     tcp            2
                                            53427     tcp            2
104.192.136.114  443       192.168.111.158  52297     tcp            2
192.168.100.250  53        192.168.100.11   50249     udp            2
199.91.71.89     1194      192.168.100.106  36367     udp            2
207.154.68.50    30233     192.168.100.183  2235      udp            2
216.58.216.14    443       192.168.100.40   44876     udp            2
Name: src_t