In [67]:
from loaddata import *
import pandas as pd
from collections import Counter
import numpy as np
import math
import time
from imblearn.over_sampling import SMOTE
from sklearn import neighbors
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from random import sample 


In [None]:
# Load the labelled NetFlow capture into a DataFrame and preview the first rows.
# NOTE(review): `load_data` is not imported by name; presumably it comes from
# `from loaddata import *` — confirm.
df = load_data('datasets/capture20110811.pcap.netflow.labeled')
df.head()

In [None]:
#Data processing
# Encode the two categorical columns as integer codes so they can be fed to the
# classifiers. Statement order matters: the columns are first cast to pandas
# categoricals, the `.cat` accessors are captured, and only then are the
# columns overwritten with their integer codes.
df['Flags']= df['Flags'].astype("category")
df['Prot']= df['Prot'].astype("category")

# Keep the categorical accessors around (they expose the code -> category
# mapping) before the columns below are replaced by plain integer codes.
flag_cat = df['Flags'].astype("category").cat
prot_cat = df['Prot'].astype("category").cat

# Overwrites df in place: after this point 'Flags' and 'Prot' hold integer codes.
df['Flags']= df['Flags'].cat.codes
df['Prot']= df['Prot'].cat.codes


# Per-flow feature matrix, label column (one-column DataFrame) and host identifiers.
features = df[['Durat', 'Prot', 'Flags', 'Tos', 'Packets','Bytes','Flows']]
labels = df[['Label']]
hosts = df[['src_ip', 'src_port', 'dst_ip','dst_port']]

In [None]:
#Convert to np_arrays
# x: per-flow feature matrix. y: per-flow labels, still 2-D (n, 1) because
# `labels` is a one-column DataFrame rather than a Series.
x= features.values
y = labels.values

In [None]:
# Class-balance check: map every distinct label value to its number of flows.
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

In [None]:
# Cache the per-flow features and labels on disk. header=None / index=None
# means the files contain raw values only: no header row and no index column.
pd.DataFrame(x).to_csv("flow_class_packet_x.csv", header=None, index=None)
pd.DataFrame(y).to_csv("flow_class_packet_y.csv", header=None, index=None)

In [None]:
#Data aggregation
# Build one feature vector and one binary label per source IP (host level).
src_ips = df['src_ip'].unique()
aggr_x = []
aggr_y = []

grouped = df.groupby(by='src_ip')
for src_ip, relevant_flows in grouped:
    # Fan-out features: number of distinct ports / peers this host touched.
    src_ports = len(relevant_flows['src_port'].unique())
    dst_ports = len(relevant_flows['dst_port'].unique())
    dst_addr = len( relevant_flows['dst_ip'].unique())
    # Volume features: totals over all flows originating from this host.
    netflows = relevant_flows['Flows'].sum()
    byts = relevant_flows['Bytes'].sum()
    packets = relevant_flows['Packets'].sum()
    feats = [src_ports, dst_ports,dst_addr, netflows, byts, packets]
    aggr_x.append(feats)
    #generate class
    # A host is labelled 1 as soon as at least one of its flows is 'Botnet'.
    # The check deliberately does not assign to `labels`: that name holds the
    # per-flow label DataFrame used by other cells and must not be clobbered
    # by a per-host temporary.
    lab = int((relevant_flows['Label'] == 'Botnet').any())
    aggr_y.append(lab)

In [None]:
# Turn the per-host lists into arrays and cache them on disk. As with the
# per-flow dump, header=None / index=None writes raw values only (no header
# row, no index column).
aggr_x = np.array(aggr_x)
aggr_y = np.array(aggr_y)
pd.DataFrame(aggr_x).to_csv("flow_class_packet_x_aggr.csv", header=None, index=None)
pd.DataFrame(aggr_y).to_csv("flow_class_packet_y_aggr.csv", header=None, index=None)

In [4]:
# Reload the cached arrays. The files were written with header=None, i.e. they
# have no header row, so header=None is required here as well; with the
# default header inference pandas would consume the first data row of each
# file as column names and silently drop one sample.
x = pd.read_csv("flow_class_packet_x.csv", header=None).values
y = pd.read_csv("flow_class_packet_y.csv", header=None).values
aggr_x = pd.read_csv("flow_class_packet_x_aggr.csv", header=None).values
aggr_y = pd.read_csv("flow_class_packet_y_aggr.csv", header=None).values
def clean_y(y):
    """Binarise a label array in place: 'Botnet' becomes 1, everything else 0.

    The input array is modified and the very same object is returned, so the
    caller's array is changed as well.
    """
    botnet_mask = (y == 'Botnet')
    other_mask = np.logical_not(botnet_mask)
    y[botnet_mask] = 1
    y[other_mask] = 0
    return y
# Binarise the per-flow labels (Botnet -> 1, everything else -> 0), then
# flatten both label arrays to 1-D integer vectors.
y = clean_y(y)
y=np.ravel(y).astype(int)
aggr_y=np.ravel(aggr_y).astype(int)

In [58]:
# NOTE(review): in the visible cells `norm_idx` is only ever assigned as a
# local inside `undersample`, so this cell relies on leftover kernel state and
# would raise NameError on Restart & Run All — confirm and remove.
norm_idx[0].shape

(6296754,)

In [95]:
# Number of botnet-labelled hosts in the undersampled host-level labels.
# NOTE(review): `aggr_y1` is assigned in a cell that appears further down, so
# running the notebook top to bottom on a fresh kernel fails here.
aggr_y1[aggr_y1 ==1].shape

(266,)

In [84]:
def undersample(x, y, a, random_state=None):
    """Randomly undersample the majority (label 0) class.

    Keeps every sample whose label is 1 and a random fraction ``a`` of the
    samples whose label is 0.

    Parameters
    ----------
    x : ndarray, 2-D
        Feature matrix, one row per sample.
    y : ndarray, 1-D
        Labels aligned with the rows of ``x``; 0 = normal, 1 = botnet.
    a : float
        Fraction of label-0 samples to keep (``ceil(n0 * a)`` samples).
    random_state : int or None, optional
        Seed for a private ``RandomState``. ``None`` (default) uses the global
        ``np.random`` state, as before.

    Returns
    -------
    (ndarray, ndarray)
        The selected rows of ``x`` and the matching entries of ``y``: the
        sampled label-0 rows first, then all label-1 rows.
    """
    norm_idx = np.where(y == 0)[0]
    botnet_idx = np.where(y == 1)[0]
    n = len(norm_idx)
    sample_size = math.ceil(n * a)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Draw WITHOUT replacement so the reduced set holds no duplicate rows;
    # np.random.choice defaults to replace=True. Replacement is only used in
    # the degenerate a > 1 case, where more rows are requested than exist.
    smpl_idx = rng.choice(norm_idx, sample_size, replace=sample_size > n)
    smpl_idx = np.append(smpl_idx, botnet_idx)
    return (x[smpl_idx, :], y[smpl_idx])
    
# Undersampled copies of both data sets: keep 1% of the label-0 samples plus
# every label-1 sample, at flow level and at host level.
x1, y1 = undersample(x, y, 0.01)
aggr_x1, aggr_y1 = undersample(aggr_x, aggr_y, 0.01)

In [118]:
#Run the ML algorithm using cross-validation
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate_classifier(x, y, clf, verbose=False, n_splits=10, sample_ratio=0.01):
    """Cross-validate ``clf`` with majority-class undersampling of each training fold.

    For every fold the training part is reduced with ``undersample`` (the test
    part is left untouched), the classifier is refit, and the confusion-matrix
    counts are accumulated over all folds. One ``*`` is printed per finished fold.

    Parameters
    ----------
    x : ndarray, 2-D
        Feature matrix.
    y : ndarray, 1-D
        Binary labels (0 / 1) aligned with ``x``.
    clf : estimator
        Object exposing ``fit(x, y)`` and ``predict(x)``; it is refit per fold.
    verbose : bool, optional
        When true, print the pooled counts and metrics.
    n_splits : int, optional
        Number of shuffled K-fold splits (default 10).
    sample_ratio : float, optional
        Fraction of label-0 training samples kept by ``undersample``
        (default 0.01).

    Returns
    -------
    list
        ``[tp_avg, fp_avg, fn_avg, tn_avg, acc, recall, specif, prec, fp_rate]``
        where the ``*_avg`` entries are per-fold averages of the counts and
        the metrics are computed from the counts pooled over all folds. A
        metric whose denominator is zero is reported as 0.0.
    """
    # Exact integer counts. The counters are not seeded with a small epsilon:
    # that would bias every count and metric, so division by zero is handled
    # explicitly by _safe_ratio instead.
    TP = FP = FN = TN = 0
    kf = KFold(n_splits=n_splits, shuffle=True)
    for train_index, test_index in kf.split(x):
        #Initialize training and test data
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], np.asarray(y[test_index])
        #Undersample the majority class of the training fold only
        (x_train, y_train) = undersample(x_train, y_train, sample_ratio)
        #Fits classifier to training data
        clf.fit(x_train, y_train)
        #Predict data for testing data
        y_predict = np.asarray(clf.predict(x_test))
        #Enumerate results
        print("*", end = '')
        actual_pos, actual_neg = (y_test == 1), (y_test == 0)
        pred_pos, pred_neg = (y_predict == 1), (y_predict == 0)
        TP += int(np.sum(actual_pos & pred_pos))
        FP += int(np.sum(actual_neg & pred_pos))
        FN += int(np.sum(actual_pos & pred_neg))
        TN += int(np.sum(actual_neg & pred_neg))
    print("")
    tp_avg = TP / n_splits
    fp_avg = FP / n_splits
    fn_avg = FN / n_splits
    tn_avg = TN / n_splits

    #Calculates the metrics
    acc = _safe_ratio(TP + TN, TP + FP + TN + FN) #Accuracy
    recall = _safe_ratio(TP, TP + FN) #Recall
    specif = _safe_ratio(TN, FP + TN) #Specificity
    prec = _safe_ratio(TP, TP + FP) #Precision
    fp_rate = _safe_ratio(FP, FP + TN) #False Positive Rate

    #Print metrics
    if(verbose):
        print ('TP: '+ str(TP))
        print ('FP: '+ str(FP))
        print ('FN: '+ str(FN))
        print ('TN: '+ str(TN))
        print ('FP Rate : '+ str(fp_rate))
        print('Accuracy:' + str(acc))
        print('Recall:' + str(recall))
        print('Specificity:' + str(specif))
        print('Precision:' + str(prec))

    return [tp_avg, fp_avg, fn_avg, tn_avg, acc, recall, specif, prec, fp_rate]

In [119]:
#Create some classifiers
# Each entry pairs an unfitted estimator with the display name printed while
# it is being evaluated.
# NOTE(review): `base_estimator` on BaggingClassifier is the older scikit-learn
# spelling (newer releases call it `estimator`) — confirm against the
# installed version.
classifiers = [
    (neighbors.KNeighborsClassifier(n_neighbors=3, weights='distance'),
     "Distance Weighed KNN"),
    (AdaBoostClassifier(n_estimators=200),
     "AdaBoost Classifier"),
    (RandomForestClassifier(n_estimators=200),
     "Random Forest Classifier"),
    (BaggingClassifier(base_estimator=svm.SVC(kernel='rbf', gamma='auto'),
                       max_samples=0.001, bootstrap=False,
                       n_estimators=1000, verbose=0),
     "Bagged SVM"),
    (ExtraTreesClassifier(n_estimators=200, max_depth=None, min_samples_split=2,
                          random_state=0, criterion='entropy'),
     "Extra Random Forest Classifier with entropy criterion"),
]
# Not referenced by any visible cell; kept so the notebook namespace is unchanged.
sm = SMOTE(random_state=42)

# One result row per classifier, in the order of `classifiers`.
res_pck = []
res_hst = []
#Iterate over all classifiers
for clf, name in classifiers:
    print(name)
    print("Host lvl")
    host_metrics = evaluate_classifier(aggr_x, aggr_y, clf)
    res_hst.append(host_metrics)
    print("Package lvl")
    packet_metrics = evaluate_classifier(x, y, clf)
    res_pck.append(packet_metrics)

Distance Weighed KNN
Host lvl
**********
Package lvl
**********
AdaBoost Classifier
Host lvl
**********
Package lvl
**********
Random Forest Classifier
Host lvl
**********
Package lvl
**********
Bagged SVM
Host lvl
**********
Package lvl
**********
Extra Random Forest Classifier with entropy criterion
Host lvl
**********
Package lvl
**********


In [121]:
# Persist the evaluation results: one row per classifier, columns in the order
# returned by evaluate_classifier. Written without header row or index column.
pd.DataFrame(res_pck).to_csv("results_pack_lvl.csv", header=None, index=None)
pd.DataFrame(res_hst).to_csv("results_host_lvl.csv", header=None, index=None)


In [116]:
# Sanity check: (rows, features) of the undersampled host-level feature matrix.
aggr_x1.shape

(4258, 6)