In [1]:
from __future__ import absolute_import, print_function

import math
import os
from collections import Counter
from collections import defaultdict

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Column names applied to the whitespace-delimited, labelled netflow file.
cols = (
    'date', 'flow_start', 'duration', 'protocol', 'src_addrIP',
    'direction', 'dst_addrIP', 'flags', 'tos', 'packets',
    'bytes', 'flows', 'label',
)
# NOTE(review): skiprows=1 drops the first line and header=0 then consumes the
# next line as a header (replaced by `names`), so two leading lines are not
# loaded as data -- confirm the file really starts with two non-data lines.
df = pd.read_csv(
    'capture20110818.pcap.netflow.labeled',
    header=0,
    names=cols,
    skiprows=1,
    # Raw string: '\s' inside a plain string literal is an invalid escape.
    sep=r'\s+',
)

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,date,flow_start,duration,protocol,src_addrIP,direction,dst_addrIP,flags,tos,packets,bytes,flows,label
0,2011-08-18,10:19:13.328,4.995,UDP,82.39.2.249:41915,->,147.32.84.59:43087,INT,0,617,40095,1,Background
1,2011-08-18,10:19:13.329,4.996,UDP,147.32.84.59:43087,->,82.39.2.249:41915,INT,0,1290,1909200,1,Background
2,2011-08-18,10:19:13.330,0.0,TCP,147.32.86.166:42020,->,147.32.192.34:993,A_,0,1,66,1,Background
3,2011-08-18,10:19:13.330,0.0,TCP,212.24.150.110:25443,->,147.32.86.166:33426,FPA_,0,2,169,1,Background
4,2011-08-18,10:19:13.333,4.185,TCP,115.184.37.24:49190,->,147.32.84.2:80,A_,0,25,1658,1,Background


In [4]:
# 'tos', 'flows' and 'direction' are not needed downstream, so leave them out.
df1 = df.drop(['tos', 'flows', 'direction'], axis=1)



In [32]:

# Background flows are not required in the analysis as they do not exhibit
# malicious activity; keep every row whose label is anything else.
df2 = df1.loc[df1['label'] != 'Background'].copy()

In [33]:
# Split each "address:port" string into separate address and port columns,
# first for the source and then for the destination.
# Only the first ':' acts as separator (n=1). expand=True returns the parts as
# columns, and reindex guarantees that both columns exist even when no row in
# the frame carries a port (the port is then missing for every row).
for side in ('src', 'dst'):
    parts = (
        df2[side + '_addrIP']
        .str.split(':', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    df2[side + '_addr'] = parts[0]
    df2[side + '_port'] = parts[1]

In [34]:
# Display the frame without the combined "address:port" columns.
# NOTE(review): the result is not assigned, so df2 itself keeps both columns.
df2.drop(['src_addrIP', 'dst_addrIP'], axis='columns')

Unnamed: 0,date,flow_start,duration,protocol,flags,packets,bytes,label,src_addr,src_port,dst_addr,dst_port
19,2011-08-18,10:19:13.347,4.985,TCP,PA_,91,86277,LEGITIMATE,147.32.80.13,80,147.32.85.88,56949
50,2011-08-18,10:19:13.392,0.000,TCP,A_,1,66,LEGITIMATE,147.32.86.110,48102,74.125.232.214,443
56,2011-08-18,10:19:13.411,4.921,TCP,A_,49,3234,LEGITIMATE,147.32.85.88,56949,147.32.80.13,80
72,2011-08-18,10:19:13.460,4.742,TCP,A_,118,7080,LEGITIMATE,147.32.84.59,2768,74.125.108.208,80
74,2011-08-18,10:19:13.486,0.000,TCP,A_,1,60,LEGITIMATE,147.32.84.59,56058,74.125.232.215,443
126,2011-08-18,10:19:13.689,4.434,TCP,A_,30,1980,LEGITIMATE,147.32.84.164,34588,195.24.232.164,80
192,2011-08-18,10:19:13.848,0.000,TCP,A_,1,66,LEGITIMATE,147.32.86.110,53538,74.125.232.215,443
195,2011-08-18,10:19:13.858,0.174,TCP,PA_,5,559,LEGITIMATE,147.32.84.59,37578,74.125.232.215,443
256,2011-08-18,10:19:14.067,0.009,TCP,FA_,2,120,LEGITIMATE,147.32.85.26,58763,209.85.148.147,80
267,2011-08-18,10:19:14.134,0.000,TCP,A_,1,66,LEGITIMATE,147.32.84.21,44146,74.125.232.216,443


In [35]:
# Build one timestamp column from the date and time-of-day strings.
# Vectorised string concatenation replaces the row-by-row apply/join.
df2['datetime'] = pd.to_datetime(df2['date'] + ' ' + df2['flow_start'])

# Store the labels as a categorical for easier viewing.
df2['label'] = df2['label'].astype('category')

In [36]:
# Display the frame without the original date / time-of-day columns.
# NOTE(review): the result is not assigned, so df2 itself keeps both columns.
df2.drop(['date', 'flow_start'], axis='columns')

Unnamed: 0,duration,protocol,src_addrIP,dst_addrIP,flags,packets,bytes,label,src_addr,src_port,dst_addr,dst_port,datetime
19,4.985,TCP,147.32.80.13:80,147.32.85.88:56949,PA_,91,86277,LEGITIMATE,147.32.80.13,80,147.32.85.88,56949,2011-08-18 10:19:13.347
50,0.000,TCP,147.32.86.110:48102,74.125.232.214:443,A_,1,66,LEGITIMATE,147.32.86.110,48102,74.125.232.214,443,2011-08-18 10:19:13.392
56,4.921,TCP,147.32.85.88:56949,147.32.80.13:80,A_,49,3234,LEGITIMATE,147.32.85.88,56949,147.32.80.13,80,2011-08-18 10:19:13.411
72,4.742,TCP,147.32.84.59:2768,74.125.108.208:80,A_,118,7080,LEGITIMATE,147.32.84.59,2768,74.125.108.208,80,2011-08-18 10:19:13.460
74,0.000,TCP,147.32.84.59:56058,74.125.232.215:443,A_,1,60,LEGITIMATE,147.32.84.59,56058,74.125.232.215,443,2011-08-18 10:19:13.486
126,4.434,TCP,147.32.84.164:34588,195.24.232.164:80,A_,30,1980,LEGITIMATE,147.32.84.164,34588,195.24.232.164,80,2011-08-18 10:19:13.689
192,0.000,TCP,147.32.86.110:53538,74.125.232.215:443,A_,1,66,LEGITIMATE,147.32.86.110,53538,74.125.232.215,443,2011-08-18 10:19:13.848
195,0.174,TCP,147.32.84.59:37578,74.125.232.215:443,PA_,5,559,LEGITIMATE,147.32.84.59,37578,74.125.232.215,443,2011-08-18 10:19:13.858
256,0.009,TCP,147.32.85.26:58763,209.85.148.147:80,FA_,2,120,LEGITIMATE,147.32.85.26,58763,209.85.148.147,80,2011-08-18 10:19:14.067
267,0.000,TCP,147.32.84.21:44146,74.125.232.216:443,A_,1,66,LEGITIMATE,147.32.84.21,44146,74.125.232.216,443,2011-08-18 10:19:14.134


In [37]:
# Preview the first five rows after the address split and timestamp parsing.
df2.head(n=5)

Unnamed: 0,date,flow_start,duration,protocol,src_addrIP,dst_addrIP,flags,packets,bytes,label,src_addr,src_port,dst_addr,dst_port,datetime
19,2011-08-18,10:19:13.347,4.985,TCP,147.32.80.13:80,147.32.85.88:56949,PA_,91,86277,LEGITIMATE,147.32.80.13,80,147.32.85.88,56949,2011-08-18 10:19:13.347
50,2011-08-18,10:19:13.392,0.0,TCP,147.32.86.110:48102,74.125.232.214:443,A_,1,66,LEGITIMATE,147.32.86.110,48102,74.125.232.214,443,2011-08-18 10:19:13.392
56,2011-08-18,10:19:13.411,4.921,TCP,147.32.85.88:56949,147.32.80.13:80,A_,49,3234,LEGITIMATE,147.32.85.88,56949,147.32.80.13,80,2011-08-18 10:19:13.411
72,2011-08-18,10:19:13.460,4.742,TCP,147.32.84.59:2768,74.125.108.208:80,A_,118,7080,LEGITIMATE,147.32.84.59,2768,74.125.108.208,80,2011-08-18 10:19:13.460
74,2011-08-18,10:19:13.486,0.0,TCP,147.32.84.59:56058,74.125.232.215:443,A_,1,60,LEGITIMATE,147.32.84.59,56058,74.125.232.215,443,2011-08-18 10:19:13.486


In [45]:
# Label the dataset as malicious or not, keeping only flows that originate from
# the chosen infected host or from one of the known-normal hosts.

# select an infected host
infected_host = '147.32.96.69'  # '147.32.84.205' (try both)

# isolate normal hosts; more hosts can be added to this list
normal_hosts = [
    '147.32.84.164',
    '147.32.84.170',
    '147.32.84.134',
    '147.32.87.11',
    '147.32.80.9',
    '147.32.87.36',
]

from_infected = df2['src_addr'] == infected_host
from_normal = df2['src_addr'].isin(normal_hosts)

# Filter with a boolean mask instead of filling a NaN placeholder column and
# dropping the leftovers in place: the flag column ends up with a real bool
# dtype and no boolean is ever written into a float column.
df2 = df2.loc[from_infected | from_normal].copy()

# Every remaining row is either infected or normal. A host that appears in
# both definitions is treated as normal, as in the original assignment order.
df2['malicious'] = ~df2['src_addr'].isin(normal_hosts)

In [46]:
# Preview the first five labelled rows.
df2.head(n=5)

Unnamed: 0,date,flow_start,duration,protocol,src_addrIP,dst_addrIP,flags,packets,bytes,label,src_addr,src_port,dst_addr,dst_port,datetime,malicious
126,2011-08-18,10:19:13.689,4.434,TCP,147.32.84.164:34588,195.24.232.164:80,A_,30,1980,LEGITIMATE,147.32.84.164,34588,195.24.232.164,80,2011-08-18 10:19:13.689,False
609,2011-08-18,10:19:15.323,0.868,TCP,147.32.84.164:50188,94.127.76.194:80,FA_,2,132,LEGITIMATE,147.32.84.164,50188,94.127.76.194,80,2011-08-18 10:19:15.323,False
726,2011-08-18,10:19:15.693,2.5,TCP,147.32.84.164:54784,74.125.232.215:443,PA_,5,2675,LEGITIMATE,147.32.84.164,54784,74.125.232.215,443,2011-08-18 10:19:15.693,False
1267,2011-08-18,10:19:18.143,0.0,TCP,147.32.84.164:56165,74.125.232.197:80,A_,1,66,LEGITIMATE,147.32.84.164,56165,74.125.232.197,80,2011-08-18 10:19:18.143,False
1301,2011-08-18,10:19:18.303,0.0,TCP,147.32.84.164:57965,209.85.149.138:80,A_,1,66,LEGITIMATE,147.32.84.164,57965,209.85.149.138,80,2011-08-18 10:19:18.303,False


In [47]:
# Group the flows by the malicious flag for the summary statistics below.
group_label = df2.groupby('malicious')

# Separate frames holding the malicious and the normal flows.
mal = df2.loc[df2['malicious'].eq(True)]
normal = df2.loc[df2['malicious'].eq(False)]

In [48]:
# Summary statistics of the flow duration for each value of the malicious flag.
duration_by_flag = group_label['duration']
duration_by_flag.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
malicious,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,39925.0,0.356154,1.045342,0.0,0.0,0.0,0.037,4.999
True,100394.0,0.741356,1.558236,0.0,0.0,0.0,0.0,4.999


In [49]:
# Number of flows per protocol for each value of the malicious flag.
protocol_by_flag = group_label['protocol']
protocol_by_flag.value_counts()

malicious  protocol
False      TCP          29433
           UDP           9779
           ICMP           713
True       ICMP        100394
Name: protocol, dtype: int64

In [50]:
# Show a bounded preview instead of dumping the whole frame into the output.
df2.head(10)

Unnamed: 0,date,flow_start,duration,protocol,src_addrIP,dst_addrIP,flags,packets,bytes,label,src_addr,src_port,dst_addr,dst_port,datetime,malicious
126,2011-08-18,10:19:13.689,4.434,TCP,147.32.84.164:34588,195.24.232.164:80,A_,30,1980,LEGITIMATE,147.32.84.164,34588,195.24.232.164,80,2011-08-18 10:19:13.689,False
609,2011-08-18,10:19:15.323,0.868,TCP,147.32.84.164:50188,94.127.76.194:80,FA_,2,132,LEGITIMATE,147.32.84.164,50188,94.127.76.194,80,2011-08-18 10:19:15.323,False
726,2011-08-18,10:19:15.693,2.500,TCP,147.32.84.164:54784,74.125.232.215:443,PA_,5,2675,LEGITIMATE,147.32.84.164,54784,74.125.232.215,443,2011-08-18 10:19:15.693,False
1267,2011-08-18,10:19:18.143,0.000,TCP,147.32.84.164:56165,74.125.232.197:80,A_,1,66,LEGITIMATE,147.32.84.164,56165,74.125.232.197,80,2011-08-18 10:19:18.143,False
1301,2011-08-18,10:19:18.303,0.000,TCP,147.32.84.164:57965,209.85.149.138:80,A_,1,66,LEGITIMATE,147.32.84.164,57965,209.85.149.138,80,2011-08-18 10:19:18.303,False
1554,2011-08-18,10:19:19.159,4.434,TCP,147.32.84.164:34588,195.24.232.164:80,A_,29,1914,LEGITIMATE,147.32.84.164,34588,195.24.232.164,80,2011-08-18 10:19:19.159,False
1723,2011-08-18,10:19:19.838,0.000,UDP,147.32.84.170:45807,147.32.80.9:53,INT,1,74,LEGITIMATE,147.32.84.170,45807,147.32.80.9,53,2011-08-18 10:19:19.838,False
1725,2011-08-18,10:19:19.839,0.000,UDP,147.32.84.170:35380,147.32.80.9:53,INT,1,74,LEGITIMATE,147.32.84.170,35380,147.32.80.9,53,2011-08-18 10:19:19.839,False
1728,2011-08-18,10:19:19.840,0.018,TCP,147.32.84.170:44383,209.85.148.105:80,FSA_,4,272,LEGITIMATE,147.32.84.170,44383,209.85.148.105,80,2011-08-18 10:19:19.840,False
1730,2011-08-18,10:19:19.848,0.009,TCP,147.32.84.170:44383,209.85.148.105:80,_FSA,2,140,LEGITIMATE,147.32.84.170,44383,209.85.148.105,80,2011-08-18 10:19:19.848,False


In [72]:
# Target vector (the malicious flag) and the frame of all remaining columns.
y = df2['malicious']
x = df2.drop(columns=['malicious'])

In [73]:
# Standard discretization method, as covered in class.
class Mapping:
    """Base class for per-row feature encoders.

    Subclasses are expected to set ``size`` and to populate ``mapped``, a
    lookup from row index to integer code; this base class defines neither
    ``mapped`` nor an ``__init__``.
    """

    # Overridden by subclasses in their __init__.
    size = None

    def value(self, index):
        """Return the code stored in ``self.mapped`` for row ``index``."""
        lookup = self.mapped
        return lookup[index]


class Numerical_comp(Mapping):
    """Discretize a numeric column into ``num_bins`` equal-frequency bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to discretize.
    feature : column label
        Name of the numeric column in ``df``.
    num_bins : int, default 10
        Number of bins; also stored as ``size``.

    Attributes
    ----------
    size : int
        Number of bins.
    mapped : dict
        Row index label -> bin code in ``0 .. num_bins - 1``.
    """
    def __init__(self, df, feature, num_bins=10):
        self.size = num_bins
        values = np.asarray(df[feature], dtype=float)
        if values.size == 0:
            # Nothing to bin; percentiles of an empty column are undefined.
            self.mapped = {}
            return

        # np.percentile expects percentages on a 0-100 scale, so the interior
        # cut points for num_bins bins are 100/num_bins, 200/num_bins, ...
        cut_points = np.arange(1, num_bins) * 100.0 / num_bins
        thresholds = np.percentile(values, cut_points)

        # side='right' gives the index of the first threshold strictly greater
        # than the value (the `value < threshold` rule). A value at or above
        # the last threshold gets the top bin, num_bins - 1, rather than 0.
        codes = np.searchsorted(thresholds, values, side='right')
        self.mapped = dict(zip(df.index, codes.tolist()))


class Categorical(Mapping):
    """Encode a categorical column as dense integer codes.

    Each distinct value of ``df[feature]`` gets a code equal to its rank among
    the sorted distinct values, so codes lie in ``0 .. size - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    feature : column label
        Name of the column in ``df``; its values must be mutually sortable.

    Attributes
    ----------
    size : int
        Number of distinct values.
    mapped : dict
        Row index label -> integer code.
    """
    def __init__(self, df, feature):
        column = df[feature]
        # Enumerate the DISTINCT values. Enumerating every row (duplicates
        # included) would produce codes as large as the row count while `size`
        # only counts distinct values.
        codes = dict(
            (value, code)
            for code, value in enumerate(sorted(column.unique()))
        )
        self.mapped = dict(
            zip(df.index, (codes[value] for value in column))
        )
        self.size = len(codes)


def encode(df, features):
    """Combine several discretized features into one integer code per row.

    Each ``(mapper, feature)`` pair is instantiated as ``mapper(df, feature)``.
    The resulting objects must expose ``size`` (number of codes) and
    ``value(index)`` (code for a row index label). The per-feature codes are
    combined as digits of a mixed-radix number, with the first feature as the
    most significant digit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are encoded.
    features : iterable of (mapper, feature) pairs
        Mapper classes/callables and the column each one encodes.

    Returns
    -------
    list of int
        One code per row of ``df``, in row order.
    """
    mappings = [mapper(df, feature) for mapper, feature in features]

    # The size of the joint code space is the same for every row, so compute
    # it once rather than per row.
    total_space = 1
    for mapping in mappings:
        total_space *= mapping.size

    encoded = []
    for index in df.index:
        weight = total_space
        code = 0
        for mapping in mappings:
            # Floor division keeps the weights (and the codes) exact integers;
            # true division would turn them into floats on Python 3.
            weight //= mapping.size
            code += mapping.value(index) * weight
        encoded.append(code)
    return encoded

In [89]:
def makedict(n_symbols=14, stride=5):
    """Build a nested lookup ``table[i][j] -> position`` for symbol pairs.

    Symbols run from 1 to ``n_symbols``; the pair ``(i, j)`` is mapped to
    ``stride * (i - 1) + (j - 1)``.

    Parameters
    ----------
    n_symbols : int, default 14
        Number of distinct symbols (1-based) on each axis.
    stride : int, default 5
        Multiplier applied to the first symbol's zero-based rank.

    Returns
    -------
    collections.defaultdict
        ``defaultdict(dict)`` keyed by first symbol, then by second symbol.

    Notes
    -----
    NOTE(review): with the defaults, ``stride`` is smaller than ``n_symbols``,
    so distinct pairs can share a position (e.g. (1, 6) and (2, 1) both map to
    5). Pass ``stride=n_symbols`` for a collision-free table -- confirm which
    alphabet size is intended.
    """
    table = defaultdict(dict)
    for first in range(1, n_symbols + 1):
        for second in range(1, n_symbols + 1):
            table[first][second] = stride * (first - 1) + (second - 1)
    return table

def sliding(df, window_size, n_symbols=5):
    """Compute sliding-window bigram frequency profiles for each column.

    For every column of ``df`` except the last one, a window of
    ``window_size`` consecutive symbols is moved over the column one step at a
    time. Each window becomes a row of length ``n_symbols ** 2`` holding the
    relative frequency of every ordered symbol pair among the window's
    ``window_size - 1`` consecutive pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of integer symbol codes in ``1 .. n_symbols``. The last column is
        skipped -- presumably it holds the label; verify against the caller.
    window_size : int
        Number of consecutive symbols per window (at least 2).
    n_symbols : int, default 5
        Alphabet size; the default gives the 25-wide rows used originally.

    Returns
    -------
    dict
        Column label -> ``numpy.ndarray`` of shape
        ``(len(df) - window_size + 1, n_symbols ** 2)``; zero rows when the
        column is shorter than the window.

    Raises
    ------
    ValueError
        If ``window_size`` is below 2 or a column holds a symbol outside
        ``1 .. n_symbols``.
    """
    if window_size < 2:
        raise ValueError('window_size must be at least 2 to contain a bigram')

    n_bigrams = window_size - 1
    width = n_symbols * n_symbols

    profiles = {}
    for col in df.columns[:-1]:
        symbols = np.asarray(df[col].values).astype(int)
        if symbols.size and (symbols.min() < 1 or symbols.max() > n_symbols):
            raise ValueError(
                'column %r holds symbols outside 1..%d' % (col, n_symbols)
            )

        # Position of every consecutive pair in the flattened
        # n_symbols x n_symbols table. It is computed inline so the stride
        # always matches the row width; makedict() with no arguments covers
        # symbols 1..14 with stride 5, which does not fit a 25-wide row.
        pair_positions = n_symbols * (symbols[:-1] - 1) + (symbols[1:] - 1)

        # A window starting at `start` contains the pairs
        # start .. start + n_bigrams - 1; count them and normalise.
        rows = [
            np.bincount(
                pair_positions[start:start + n_bigrams], minlength=width
            ) / float(n_bigrams)
            for start in range(len(symbols) - window_size + 1)
        ]
        profiles[col] = np.array(rows, dtype=float).reshape(-1, width)
    return profiles

In [90]:
import pandas as pd
import numpy as np
from collections import defaultdict
def bigrams(data, a=2):
    """Collect every length-``a`` slice from each sequence in ``data``.

    Parameters
    ----------
    data : iterable of sliceable sequences
        Each item must support ``len()`` and slicing (str, list, ...).
    a : int, default 2
        Length of the n-grams; 2 yields bigrams.

    Returns
    -------
    list
        All n-grams, sequence by sequence, in order of appearance. Sequences
        shorter than ``a`` contribute nothing.
    """
    grams = []
    for sequence in data:
        # A sequence of length n has n - a + 1 full-length windows; a bound
        # of n - 1 is only right for a == 2.
        for start in range(len(sequence) - a + 1):
            grams.append(sequence[start:start + a])
    return grams
# NOTE(review): iterating a DataFrame yields its column labels, so this call
# builds 2-character slices of the column names rather than bigrams over the
# values in the rows -- confirm that this is the intended input.
bigrams1 = bigrams(df2,2)