In [1]:
import pdb
import time
import tqdm
import datetime
import numpy as np
import pandas as pd
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline

np.set_printoptions(precision=5)

  from numpy.core.umath_tests import inner1d


# 0. Data
 - [Scenario10](https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-51/)
     - This scenario has multiple infected (67) and normal (471) IPs (with almost equivalent netflows)
     - Many IPs only have a few netflows unfortunately (check figure below)
     - We again use the scenario, so that we may perform a comparative analysis between Task4 and Task5

In [2]:
def getData(filename):
    t0 = time.time()
    if (1):
        cols = ['date', 'time', 'duration', 'protocol', 'src', 'direction', 'dst', 'flags', 'tos', 'packets', 'bytes', 'flows', 'label']
        df   = pd.read_csv(filename, skiprows=1, header=0, sep='\s+',names=cols)
    else:
        cols = ['date', 'time', 'duration', 'protocol', 'src', 'direction', 'dst', 'flags', 'tos', 'packets', 'bytes', 'flows', 'label', 'labels']
        df   = pd.read_csv(filename, skiprows=1, header=0, sep='\s+',names=cols)
    print (' - Dataframe read. Pre-processing now... ({0}s)'.format(round(time.time() - t0,2)))
    return df

def processData(df):
    # Step1 - Clean out NaN values
    df                           = df.fillna(0)
    
    # Step2 - Remove background labels
    df                           = df[df['label'] != 'Background']
    print (' - Total len : ', len(df), ' (w/0 background and w/0 NaN)')
    
    if (not df.isnull().values.any()):
        
        # Step3 - Index of Columns
        df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
        df             = df.set_index(df['datetime'])
        
        # Step4 - Column String Formatting
        df['src_ip'], df['src_port'] = df['src'].str.split(':', 1).str
        df['dst_ip'], df['dst_port'] = df['dst'].str.split(':', 1).str
        
        # Step5 - Codify columns
        df['src_ip_num']             = pd.Categorical(df['src_ip'], categories=df['src_ip'].unique()).codes
        df['dst_ip_num']             = pd.Categorical(df['dst_ip'], categories=df['dst_ip'].unique()).codes
        df['protocol_num']           = pd.Categorical(df['protocol'], categories=df['protocol'].unique()).codes
        df['flags_num']              = pd.Categorical(df['flags'], categories=df['flags'].unique()).codes
        
        # Step6 - Casting of columns
        df['packets'] = df['packets'].astype(int)
        df['bytes']  = df['bytes'].astype(int)
        
        # Stp7 - Drop Columns
        df.drop('src', axis=1, inplace=True)
        df.drop('dst', axis=1, inplace=True)
        df.drop('direction', axis=1, inplace=True)
        # df.drop('tos', axis=1, inplace=True)
        df.drop('flows', axis=1, inplace=True)
        df.drop('flags', axis=1, inplace=True)
        df.drop('date', axis=1, inplace=True)
        df.drop('time', axis=1, inplace=True)
        df.drop('datetime', axis=1, inplace=True)
        
        tmp = df.columns.tolist()
        tmp.remove('label')
        df  = df[tmp + ['label']]
        
        df.sort_index(inplace=True)
        df.reset_index(inplace=True)
        
        print (' - label : ', Counter(df['label']))
        print (' - IPs   : ', len(df['src_ip'].unique()))
        for label_ in df['label'].unique():
            unique_vals = df[df['label'] == label_]['src_ip'].unique()
            print (' - Label : ', label_, ' || Unique IPs : ', len(unique_vals))
        
    else:
        print (' - NAN detected!')
        
    return df

if __name__ == "__main__":
    df_scene10_raw = getData('data/capture20110818.pcap.netflow.labeled')
    df_scene10     = processData(df_scene10_raw) 
    display(df_scene10.head(n=5))

 - Dataframe read. Pre-processing now... (9.63s)
 - Total len :  645358  (w/0 background and w/0 NaN)
 - label :  Counter({'Botnet': 323441, 'LEGITIMATE': 321917})
 - IPs   :  522
 - Label :  LEGITIMATE  || Unique IPs :  471
 - Label :  Botnet  || Unique IPs :  67


Unnamed: 0,datetime,duration,protocol,tos,packets,bytes,src_ip,src_port,dst_ip,dst_port,src_ip_num,dst_ip_num,protocol_num,flags_num,label
0,2011-08-18 10:19:13.347,4.985,TCP,0,91,86277,147.32.80.13,80,147.32.85.88,56949,0,0,0,0,LEGITIMATE
1,2011-08-18 10:19:13.392,0.0,TCP,0,1,66,147.32.86.110,48102,74.125.232.214,443,1,1,0,1,LEGITIMATE
2,2011-08-18 10:19:13.411,4.921,TCP,0,49,3234,147.32.85.88,56949,147.32.80.13,80,2,2,0,1,LEGITIMATE
3,2011-08-18 10:19:13.460,4.742,TCP,0,118,7080,147.32.84.59,2768,74.125.108.208,80,3,3,0,1,LEGITIMATE
4,2011-08-18 10:19:13.486,0.0,TCP,0,1,60,147.32.84.59,56058,74.125.232.215,443,3,4,0,1,LEGITIMATE


# Task
 - Classifier
     - construct a classifier for detecting anomalous behavior in individual NetFlows (Study paper 3)
     - Do not forget to study and deal with properties of your data such as class imbalance. 
     - Evaluate your method in two ways: 
         - on the packet level (as in paper3), 
         - on the host level (as in paper 4). 
     - Do you prefer using a sequential model or a classifier for detecting botnets? Explain

# Task - Classifier - Packet + Host
 - As in Task4, we split data here as config, train and validation.
 - We ensure that the validation datasets are the same as Task4, by fixing our random_seed=42 (_the answer to all of life's questions_)
 - We then proceed to classify the netflows as well as each host as either 'Botnet' or 'LEGITIMATE'
 - A host is classified as a botnet, if even one of its netflows is classified as a Botnet

In [3]:
def getSplitData(df, IP_TRAIN, verbose=0):
    
    # Step0 - Select top 380 hosts (as per paper)
    df_tmp = df_scene10.groupby('src_ip').aggregate({'protocol':'count'}).sort_values('protocol', ascending=False).head(n=380)
    
    # Step1 - Split dataset into configuration, train, validation
    df_config = df[df['label'] == 'LEGITIMATE'].sample(frac=0.3,  random_state=42)
    df_train  = df[df['src_ip'] == IP_TRAIN] 
    df_valid  = df[~df.index.isin(df_config.index)]
    df_valid  = df_valid[df_valid['src_ip'] != IP_TRAIN] 
    
    top_380_ips = df_valid.groupby('src_ip').aggregate({'protocol':'count'}).sort_values('protocol', ascending=False).head(n=380).index
    df_valid   = df_valid[df_valid['src_ip'].isin(top_380_ips)] 
    
    if verbose:
        print ('- Step 0 : Prepare datasets ...')
        print (' - [DATA] Total  : ', len(df))
        print (' - [DATA] Config : ', len(df_config))
        print (' - [DATA] Train  : ', len(df_train))
        print (' - [DATA] Valid  : ', len(df_valid))

    df_valid_botnet = df_valid[df_valid['label'] == 'Botnet']
    df_valid_legit  = df_valid[df_valid['label'] == 'LEGITIMATE']
    df_valid_botnet_ips = df_valid_botnet['src_ip'].unique()
    df_valid_legit_ips  = df_valid_legit['src_ip'].unique()
        
    if verbose:
        print (' - [DATA] Valid (Botnet flows) : ', len(df_valid_botnet))
        print (' - [DATA] Valid (Legit flows)  : ', len(df_valid_legit))
        print (' - [DATA] Valid (Botnet IPs)   : ', len(df_valid_botnet_ips) - 1)
        print (' - [DATA] Valid (Legit IPs)    : ', len(df_valid_legit_ips))
    
    return df_config, df_train, df_valid, df_valid_botnet, df_valid_legit, df_valid_botnet_ips, df_valid_legit_ips

def getTrainData(df_config, df_train, df_valid):
    
    df_classsifier_train = pd.concat([df_config, df_train])
    df_classsifier_train.drop(['src_ip', 'datetime','protocol','tos', 'dst_ip','src_ip_num','src_port', 'dst_port'], axis = 1, inplace=True)
    
    return df_classsifier_train

def classify(df_classsifier_train, df_valid):
    TN_hosts = 0
    FN_hosts = 0
    TP_hosts = 0
    FP_hosts = 0
    conf_mat_hosts = []
    conf_mat_flows = np.zeros((2,2))
    
    if (1):
        Y_train    = df_classsifier_train.label
        X_train    = df_classsifier_train
        X_train    = X_train.drop('label', axis = 1)

        smt = SMOTE(random_state = 42, ratio = float(0.5))
        X_train_smote, Y_train_smote = smt.fit_sample(X_train, Y_train)

        classifier = RandomForestClassifier(n_estimators=500)
        classifier.fit(X_train_smote, Y_train_smote)

    if (1):
        tmp_botnet_count = 0
        tmp_legit_count  = 0
        df_valid_ips = df_valid['src_ip'].unique()
        with tqdm.tqdm_notebook(total = len(df_valid_ips)) as pbar:
            for valid_ip in df_valid_ips:
                pbar.update(1)
                df_valid_tmp        = df_valid[df_valid['src_ip'] == valid_ip]
                df_valid_tmp_labels = df_valid_tmp['label'] 
                df_valid_tmp_label  = df_valid_tmp_labels.unique()
                if (len(df_valid_tmp_label) == 1):
                    df_valid_tmp.drop(['src_ip', 'datetime','protocol','tos', 'dst_ip','src_ip_num','src_port', 'dst_port', 'label'], axis = 1, inplace=True)
                    Y_predict_tmp  = classifier.predict(df_valid_tmp)
                    conf_mat       = confusion_matrix(y_pred=Y_predict_tmp,y_true=df_valid_tmp_labels, labels=['Botnet', 'LEGITIMATE'])
                    conf_mat_flows += conf_mat

                    if df_valid_tmp_label == 'Botnet':
                        tmp_botnet_count += 1
                        if 'Botnet' in  Y_predict_tmp:
                            TP_hosts += 1
                        else:
                            FN_hosts += 1
                    elif df_valid_tmp_label == 'LEGITIMATE':
                        tmp_legit_count += 1
                        if 'Botnet' in Y_predict_tmp:
                            FP_hosts += 1
                        else:
                            TN_hosts += 1
                            
    if (1):
        TP_flow, FP_flow, FN_flow, TN_flow = conf_mat_flows.ravel()
        precision_flow = float(TP_flow)/(TP_flow + FP_flow)
        recall_flow    = float(TP_flow)/(TP_flow + FN_flow)
        accuracy_flow  = float(TP_flow + TN_flow)/(TP_flow+FN_flow+TN_flow+FP_flow)
        f1_flow        = 2*precision_flow*recall_flow / (precision_flow + recall_flow)

        print (' - [Flow] TP : ', TP_flow)
        print (' - [Flow] FP : ', FP_flow)
        print (' - [Flow] TN : ', TN_flow)
        print (' - [Flow] FN : ', FN_flow)
        print (' - [Flow] Conf_mat  : ', conf_mat_flows)
        print (' - [Flow] Accuracy  : ', round(accuracy_flow,2))
        print (' - [Flow] Precision : ', round(precision_flow,2))
        print (' - [Flow] Recall    : ', round(recall_flow,2))
        print (' - [Flow] F1-score  : ', round(f1_flow,2))
        print ('')
        
    if (1):
        precision_hosts = TP_hosts / (TP_hosts + FP_hosts)
        recall_hosts    = TP_hosts / (TP_hosts + FN_hosts)
        accuracy_hosts  = (TP_hosts + TN_hosts) / (TP_hosts + TN_hosts + FP_hosts + FN_hosts) 
        f1_hosts        = 2*precision_hosts*recall_hosts / (precision_hosts + recall_hosts)
        print (' - [Host] TP : ', TP_hosts)
        print (' - [Host] FP : ', FP_hosts)
        print (' - [Host] TN : ', TN_hosts)
        print (' - [Host] FN : ', FN_hosts)
        print (' - [Host] Total     : ', TN_hosts + FP_hosts + FN_hosts + TP_hosts)
        conf_mat_hosts = np.array([[TN_hosts,FP_hosts],[FN_hosts,TP_hosts]])
        print (' - [Host] Conf Mat  : ', conf_mat_hosts)
        print (' - [Host] Accuracy  : ', round(accuracy_flow,2))
        print (' - [Host] Precision : ', round(precision_flow,2))
        print (' - [Host] Recall    : ', round(recall_flow,2))
        print (' - [Host] F1-score  : ', round(f1_hosts,2))

In [4]:
if __name__ == "__main__":
    # Step0 - Split dataset into configuration, train, validation
    if (1):
        IP_TRAIN = '147.32.84.165' #from paper since it has 19889 netflows
        df_config, df_train, df_valid, df_valid_botnet, df_valid_legit , df_valid_botnet_ips, df_valid_legit_ips = getSplitData(df_scene10, IP_TRAIN, verbose=1)
    
    if (1):
        print ('')
        print ('- Classify ... ')
        df_classsifier_train = getTrainData(df_config, df_train, df_valid)
        classify(df_classsifier_train, df_valid)

- Step 0 : Prepare datasets ...
 - [DATA] Total  :  645358
 - [DATA] Config :  96575
 - [DATA] Train  :  19889
 - [DATA] Valid  :  528717
 - [DATA] Valid (Botnet flows) :  303527
 - [DATA] Valid (Legit flows)  :  225190
 - [DATA] Valid (Botnet IPs)   :  44
 - [DATA] Valid (Legit IPs)    :  351

- Classify ... 


HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)



 - [Flow] TP :  198094.0
 - [Flow] FP :  105230.0
 - [Flow] TN :  224500.0
 - [Flow] FN :  19.0
 - [Flow] Conf_mat  :  [[1.98094e+05 1.05230e+05]
 [1.90000e+01 2.24500e+05]]
 - [Flow] Accuracy  :  0.8
 - [Flow] Precision :  0.65
 - [Flow] Recall    :  1.0
 - [Flow] F1-score  :  0.79

 - [Host] TP :  9
 - [Host] FP :  7
 - [Host] TN :  328
 - [Host] FN :  20
 - [Host] Total     :  364
 - [Host] Conf Mat  :  [[328   7]
 [ 20   9]]
 - [Host] Accuracy  :  0.8
 - [Host] Precision :  0.65
 - [Host] Recall    :  1.0
 - [Host] F1-score  :  0.4
