## Botnet Profiling
### functions

In [2]:
import pandas as pd
import numpy as np
from hmmlearn.hmm import GaussianHMM


# classes and functions used for the discretization of flow data
class M_num:
    '''
    Compute the mapping for numerical features
    '''

    # paramater to change number of percentiles
    p = 5

    # percentile vals contains the feature values corresponding to the different percentiles
    percentile_vals = []
    # s is the size of the feature (i.e. |M_i|)
    s = 0
    # feature that we're working with
    feat = ''
    # the full dataframe
    df = None

    def __init__(self, df, feat):
        percentiles = np.arange(.1, 1, 1 / self.p)
        self.percentile_vals = map(lambda p: np.percentile(df[feat], p), percentiles)
        self.s = len(percentiles)
        self.feat = feat
        self.df = df

    # return the size |M_i|
    def size(self):
        return self.s

    # return the value M_i
    def val(self, row):
        val = row[self.feat]
        prev = 0
        for i, cur in enumerate(self.percentile_vals):
            if val >= prev and val < cur:
                return i
            prev = cur
        return 0


class M_cat:
    '''
    Compute the mapping for categorical features
    '''

    # s is the size of the feature (i.e. |M_i|)
    s = 0
    # feature that we're working with
    feat = ''
    # the full dataframe
    df = None

    def __init__(self, df, feat):
        self.s = len(df[feat].unique())
        self.feat = feat
        self.df = df

    # return the size |M_i|
    def size(self):
        return self.s

    # return the value M_i
    def val(self, row):
        return row[self.feat]


# encode a single flow
def encode_flow(feats, row):
    code = 0
    space_size = 1
    for feat in feats:
        space_size *= feats[feat].size()
    for feat in feats:
        s = feats[feat].size()
        v = feats[feat].val(row)
        code += v * (space_size / s)
        space_size = space_size / s
    return code


# encode an entire series of flows
def encode_series(df, feats):
    encoded = []
    for index, row in df.iterrows():
        encoded.append(encode_flow(feats, row))
    return encoded

### Pre-process the data

In [3]:
# load data
names = ['date', 'time', 'duration', 'protocol', 'src', 'direction', 'dst', 'flags', 'tos', 'packets', 'bytes', 'flows', 'label']
df = pd.read_csv('data/capture20110818_51_.pcap.netflow.labeled',skiprows=1,header=0,sep='\s+',names=names)
# remove background data
df = df[df['label'] != 'Background']
# process the rest of the data
df['src_ip'], df['src_port'] = df['src'].str.split(':', 1).str
df['dst_ip'], df['dst_port'] = df['dst'].str.split(':', 1).str
df['src_ip_num'] = pd.Categorical(df['src_ip'], categories=df['src_ip'].unique()).codes
df['dst_ip_num'] = pd.Categorical(df['dst_ip'], categories=df['dst_ip'].unique()).codes
# handle categorical data
df['protocol_num'] = pd.Categorical(df['protocol'], categories=df['protocol'].unique()).codes

## Discretization of infected host data and more profiling

In [None]:
# the infected host flows that we will profile
chosen = df[df['src_ip'] == '147.32.84.204']
# rest of the hosts split between benign and malicious for testing purposes
normal =  ['147.32.84.170', '147.32.84.134', '147.32.84.164', '147.32.87.36', '147.32.80.9', '147.32.87.11']
infected = ['147.32.84.165','147.32.84.191','147.32.84.192','147.32.84.193','147.32.84.205','147.32.84.206','147.32.84.207','147.32.84.208','147.32.84.209']

# discretization based on the important features found in the previous task
feats = {'protocol_num': M_cat(chosen, 'protocol_num'), 'bytes': M_num(chosen, 'bytes')}
chosen_discrete = encode_series(chosen,feats)
sliding_window = 10
size = len(chosen_discrete) - sliding_window
# prepare matrix for slinding window
data = np.zeros((size,sliding_window),dtype=np.int32)
for i in range(size):
    data[i] = np.array([flow for flow in chosen_discrete[i:i+sliding_window]])

# use a Gaussian Hidden Markov Model with 4 states. These are from the infected host data
hmm = GaussianHMM(n_components=4)
hmm.fit(data)
# log-likelihood of training host
ll = hmm.decode(data)[0]

## model other host data

In [6]:

log_likelihood = {}

# log-likelihood data sequence of normal data, calculated from infected host model
for ip in normal:
    # get the flows of that host only
    host = df[df['src_ip'] == ip]
    # discretize flows using protocol and packet bytes only
    host_discrete = encode_series(host,feats)
    size = len(host_discrete) - sliding_window
    # if host has enough flows for creating a window
    if size > 0:
        # create sliding windows sequences
        data = np.zeros((size,sliding_window),dtype=np.int32)
        for i in range(size):
            data[i] = np.array([flow for flow in host_discrete[i:i+sliding_window]])
        # get the log-likelihood of the sequential data
        log_likelihood[ip] = hmm.decode(data)[0]
    else:
        log_likelihood[ip] = 0

# same process for infected data
for ip in infected:
    host = df[df['src_ip'] == ip]
    host_discrete = encode_series(host,feats)
    size = len(host_discrete) - sliding_window
    if size > 0:
        data = np.zeros((size,sliding_window),dtype=np.int32)
        for i in range(size):
            data[i] = np.array([flow for flow in host_discrete[i:i+sliding_window]])
        log_likelihood[ip] = hmm.decode(data)[0]
    else:
        log_likelihood[ip] = 0

## Evaluate and print TP/FP/TN/FN

In [7]:
# evaluate results using the log-likelihood distance of hosts from the one who trained the model
TP = 0
TN = 0
FP = 0
FN = 0
positives = []
negatives = []

# 
dist = {}
for ip in log_likelihood.keys():
    # absolute log-likelihood distance
    dist[ip] = abs(log_likelihood[ip] - ll)
    # threshold is half log-likelihood
    if dist[ip] > ll / 2:
        negatives.append(ip)
    else:
        positives.append(ip)

# evaluate all potentially malicious hosts
for i in positives:
    if i in infected:
        TP += 1
    else:
        FP += 1

# evaluate all potentially benign hosts
for i in negatives:
    if i in normal:
        TN += 1
    else:
        FN += 1

print('True Positives : {}'.format(TP))
print('False Positives : {}'.format(FP))
print('True Negatives : {}'.format(TN))
print('False Negatives : {}'.format(FN))

True Positives : 9
False Positives : 0
True Negatives : 6
False Negatives : 0
