In [None]:
import numpy as np
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
import glob
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.feature_selection import chi2

In [2]:
df = pd.read_csv("../finallabelleddata_new.csv",sep=",")

In [3]:
df.columns

Index(['SrcIP DstIP SrcPort DstPort Protocol Duration StartTime EndTime PktRate TOS MeanLength TotalLength PktCount IPFlags TTL DataOff TCPFlags Window Label'], dtype='object')

In [4]:
def newestparseline(line):
    line = line.rstrip().split()
    datapoint = []
    datapoint.append(line[4]) #proto
    datapoint.append(line[2]) #sport
    datapoint.append(line[0]) #srcip
    datapoint.append(line[3]) #dport
    datapoint.append(line[1]) #dstip
    datapoint.append(line[9]) #TOS
    datapoint.append(line[13]) #IP flags
    datapoint.append(line[15]) #TCP Flags
    datapoint.append(float(line[8]) if float(line[8]) != float('inf') else 0) #Packet rate
    datapoint.append(float(line[10])) #Mean Length
    datapoint.append(float(line[5])) #duration
    datapoint.append(int(float(line[11]))) #Total Length
    datapoint.append(int(line[12])) #Packet count
    datapoint.append(int(float(line[14]))) #TTL
    if "background_flow" in line[-1]:
        label = 0
    elif "ddossim" in line[-1]:
        label = 1
    elif "goldeneye" in line[-1]:
        label = 2
    elif "hulk" in line[-1]:
        label = 3
    elif "rudy" in line[-1]:
        label = 4
    elif "slowbody2" in line[-1]:
        label = 5
    elif "slowheaders" in line[-1]:
        label = 6
    elif "slowloris" in line[-1]:
        label = 7
    elif "slowread" in line[-1]:
        label = 8
    
#     print(datapoint)
#     print(label)
    return (datapoint, label)
    
def parseflowdata():
    data = []
    y = []
    with open("../finallabelleddata_new.csv") as dat:
        next(dat)
        for line in dat:
#             if "hulk" in line or "background_flow" in line:
            datapoint, label = newestparseline(line)
            data.append(datapoint)
            y.append(label)
    return (data, np.array(y))

In [5]:
X, y = parseflowdata()

In [6]:
labelenc = LabelEncoder()
proto = labelenc.fit_transform([x[0] for x in X])
sport = labelenc.fit_transform([x[1] for x in X])
srcip = labelenc.fit_transform([x[2] for x in X])
dport = labelenc.fit_transform([x[3] for x in X])
dstip = labelenc.fit_transform([x[4] for x in X])
tos = labelenc.fit_transform([x[5] for x in X])
ipflags = labelenc.fit_transform([x[6] for x in X])
tcpflags = labelenc.fit_transform([x[7] for x in X])

encodedX = []
for i in range(len(X)):
    encodedX.append([srcip[i], dstip[i], dport[i], sport[i], 
                     proto[i], tos[i], ipflags[i], tcpflags[i]] + X[i][8:])
    
scaler = MinMaxScaler()
encodedX = scaler.fit_transform(encodedX)
enc = OneHotEncoder(categorical_features=[0,1,2,3,4,5,6,7])
encodedX = enc.fit_transform(encodedX)

In [33]:
encodedX.toarray()[0]

array([1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       4.41940282e-07, 4.49844665e-02, 3.38125915e-03, 3.61301740e-06,
       1.11019212e-05, 5.00000000e-01])

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(encodedX, y, test_size=0.20)
X_test, X_valid, y_test, y_valid = train_test_split(X_valid, y_valid, test_size=0.50)

In [None]:
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X_train[y_train == 0])
y_predict = clf.predict(X_test)
accuracy_score(y_test, y_predict)

In [8]:
clf = svm.SVC(class_weight={0: 1, 1: 10, 2: 15, 3: 10, 4: 8, 5: 10, 6: 7, 7: 12, 8: 8}, gamma=1000)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_valid)
accuracy_score(y_valid, y_predict)

0.9543021282151717

In [9]:
y_predict = clf.predict(X_test)
accuracy_score(y_test, y_predict)

0.9557369022360046

In [57]:
rf = IsolationForest(max_samples=1.0, contamination=0.1, n_jobs=4, max_features=1.0, bootstrap=True)
rf.fit(X_train)
y_predict = rf.predict(X_test)
accuracy_score(y_test, y_predict)

0.8278016035800858

In [20]:
tp = [y_test[i] == 1 and y_predict[i] == 1 for i in range(len(y_test))]
tp = sum(tp)*1.0
fp = [y_test[i] == 0 and y_predict[i] == 1 for i in range(len(y_test))]
fp = sum(fp)*1.0
fn = [y_test[i] == 1 and y_predict[i] == 0 for i in range(len(y_test))]
fn = sum(fn)*1.0
tn = [y_test[i] == 0 and y_predict[i] == 0 for i in range(len(y_test))]
tn = sum(tn)*1.0
#DDosSim
print(tp)
print(tn)
print(fp)
print(fn)
print("DDoSSim precision: {}".format(tp/(tp+fp)))
print("DDoSSim Recall: {}".format(tp/(tp+fn)))
print("DDoSSim Accuracy: {}".format((tp+tn)/(tp+tn+fp+fn)))

4073.0
23685.0
0.0
0.0
DDoSSim precision: 1.0
DDoSSim Recall: 1.0
DDoSSim Accuracy: 1.0


In [21]:
tp = [y_test[i] == 2 and y_predict[i] == 2 for i in range(len(y_test))]
tp = sum(tp)*1.0
fp = [y_test[i] == 0 and y_predict[i] == 2 for i in range(len(y_test))]
fp = sum(fp)*1.0
fn = [y_test[i] == 2 and y_predict[i] == 0 for i in range(len(y_test))]
fn = sum(fn)*1.0
tn = [y_test[i] == 0 and y_predict[i] == 0 for i in range(len(y_test))]
tn = sum(tn)*1.0
#Goldeneye
print(tp)
print(tn)
print(fp)
print(fn)
print("Goldeneye precision: {}".format(tp/(tp+fp)))
print("Goldeneye Recall: {}".format(tp/(tp+fn)))
print("Goldeneye Accuracy: {}".format((tp+tn)/(tp+tn+fp+fn)))

91.0
23685.0
216.0
4.0
Goldeneye precision: 0.2964169381107492
Goldeneye Recall: 0.9578947368421052
Goldeneye Accuracy: 0.9908318053008834


In [22]:
tp = [y_test[i] == 3 and y_predict[i] == 3 for i in range(len(y_test))]
tp = sum(tp)*1.0
fp = [y_test[i] == 0 and y_predict[i] == 3 for i in range(len(y_test))]
fp = sum(fp)*1.0
fn = [y_test[i] == 3 and y_predict[i] == 0 for i in range(len(y_test))]
fn = sum(fn)*1.0
tn = [y_test[i] == 0 and y_predict[i] == 0 for i in range(len(y_test))]
tn = sum(tn)*1.0
#Hulk
print(tp)
print(tn)
print(fp)
print(fn)
print("Hulk precision: {}".format(tp/(tp+fp)))
print("Hulk Recall: {}".format(tp/(tp+fn)))
print("Hulk Accuracy: {}".format((tp+tn)/(tp+tn+fp+fn)))

73.0
23685.0
72.0
11.0
Hulk precision: 0.503448275862069
Hulk Recall: 0.8690476190476191
Hulk Accuracy: 0.9965186024076171


In [23]:
tp = [y_test[i] == 4 and y_predict[i] == 4 for i in range(len(y_test))]
tp = sum(tp)*1.0
fp = [y_test[i] == 0 and y_predict[i] == 4 for i in range(len(y_test))]
fp = sum(fp)*1.0
fn = [y_test[i] == 4 and y_predict[i] == 0 for i in range(len(y_test))]
fn = sum(fn)*1.0
tn = [y_test[i] == 0 and y_predict[i] == 0 for i in range(len(y_test))]
tn = sum(tn)*1.0
#Rudy
print(tp)
print(tn)
print(fp)
print(fn)
print("Rudy precision: {}".format(tp/(tp+fp)))
print("Rudy Recall: {}".format(tp/(tp+fn)))
print("Rudy Accuracy: {}".format((tp+tn)/(tp+tn+fp+fn)))

168.0
23685.0
25.0
1.0
Rudy precision: 0.8704663212435233
Rudy Recall: 0.9940828402366864
Rudy Accuracy: 0.9989111771849742


In [24]:
tp = [y_test[i] == 5 and y_predict[i] == 5 for i in range(len(y_test))]
tp = sum(tp)*1.0
fp = [y_test[i] == 0 and y_predict[i] == 5 for i in range(len(y_test))]
fp = sum(fp)*1.0
fn = [y_test[i] == 5 and y_predict[i] == 0 for i in range(len(y_test))]
fn = sum(fn)*1.0
tn = [y_test[i] == 0 and y_predict[i] == 0 for i in range(len(y_test))]
tn = sum(tn)*1.0
#SlowBody
print(tp)
print(tn)
print(fp)
print(fn)
print("SlowBody precision: {}".format(tp/(tp+fp)))
print("SlowBody Recall: {}".format(tp/(tp+fn)))
print("SlowBody Accuracy: {}".format((tp+tn)/(tp+tn+fp+fn)))

54.0
23685.0
3.0
0.0
SlowBody precision: 0.9473684210526315
SlowBody Recall: 1.0
SlowBody Accuracy: 0.9998736416477129


In [25]:
tp = [y_test[i] == 6 and y_predict[i] == 6 for i in range(len(y_test))]
tp = sum(tp)*1.0
fp = [y_test[i] == 0 and y_predict[i] == 6 for i in range(len(y_test))]
fp = sum(fp)*1.0
fn = [y_test[i] == 6 and y_predict[i] == 0 for i in range(len(y_test))]
fn = sum(fn)*1.0
tn = [y_test[i] == 0 and y_predict[i] == 0 for i in range(len(y_test))]
tn = sum(tn)*1.0
#SlowHeaders
print(tp)
print(tn)
print(fp)
print(fn)
print("SlowHeaders precision: {}".format(tp/(tp+fp)))
print("SlowHeaders Recall: {}".format(tp/(tp+fn)))
print("SlowHeaders Accuracy: {}".format((tp+tn)/(tp+tn+fp+fn)))

579.0
23685.0
247.0
32.0
SlowHeaders precision: 0.7009685230024213
SlowHeaders Recall: 0.9476268412438625
SlowHeaders Accuracy: 0.9886321965529886


In [26]:
tp = [y_test[i] == 7 and y_predict[i] == 7 for i in range(len(y_test))]
tp = sum(tp)*1.0
fp = [y_test[i] == 0 and y_predict[i] == 7 for i in range(len(y_test))]
fp = sum(fp)*1.0
fn = [y_test[i] == 7 and y_predict[i] == 0 for i in range(len(y_test))]
fn = sum(fn)*1.0
tn = [y_test[i] == 0 and y_predict[i] == 0 for i in range(len(y_test))]
tn = sum(tn)*1.0
#SlowLoris
print(tp)
print(tn)
print(fp)
print(fn)
print("SlowLoris precision: {}".format(tp/(tp+fp)))
print("SlowLoris Recall: {}".format(tp/(tp+fn)))
print("SlowLoris Accuracy: {}".format((tp+tn)/(tp+tn+fp+fn)))

127.0
23685.0
0.0
4.0
SlowLoris precision: 1.0
SlowLoris Recall: 0.9694656488549618
SlowLoris Accuracy: 0.9998320456835741


In [27]:
tp = [y_test[i] == 8 and y_predict[i] == 8 for i in range(len(y_test))]
tp = sum(tp)*1.0
fp = [y_test[i] == 0 and y_predict[i] == 8 for i in range(len(y_test))]
fp = sum(fp)*1.0
fn = [y_test[i] == 8 and y_predict[i] == 0 for i in range(len(y_test))]
fn = sum(fn)*1.0
tn = [y_test[i] == 0 and y_predict[i] == 0 for i in range(len(y_test))]
tn = sum(tn)*1.0
#SlowRead
print(tp)
print(tn)
print(fp)
print(fn)
print("SlowRead precision: {}".format(tp/(tp+fp)))
print("SlowRead Recall: {}".format(tp/(tp+fn)))
print("SlowRead Accuracy: {}".format((tp+tn)/(tp+tn+fp+fn)))

429.0
23685.0
270.0
2.0
SlowRead precision: 0.6137339055793991
SlowRead Recall: 0.9953596287703016
SlowRead Accuracy: 0.9888460592143032


In [30]:
chi2(X_train, y_train)

(array([1.02584926e-06, 2.51418213e-01, 1.02584926e-06, 2.51418213e-01,
        1.02584926e-06, 2.51418213e-01, 4.10341377e-06, 5.02836427e-01,
        1.34460975e+04, 3.75312881e+03, 3.35206475e+00, 4.52804202e+02,
        5.07392046e+04, 2.64458866e+04, 2.82445028e+01, 1.26781910e+03,
        6.89377008e+01, 1.54150839e+03, 1.28815630e+03, 3.21902148e+00,
        5.02185932e+00, 2.31074790e+03]),
 array([1.00000000e+000, 9.99990588e-001, 1.00000000e+000, 9.99990588e-001,
        1.00000000e+000, 9.99990588e-001, 1.00000000e+000, 9.99863732e-001,
        0.00000000e+000, 0.00000000e+000, 9.10362810e-001, 9.26988222e-093,
        0.00000000e+000, 0.00000000e+000, 4.29922727e-004, 2.12112977e-268,
        7.99488802e-012, 0.00000000e+000, 8.53299790e-273, 9.19870220e-001,
        7.55237242e-001, 0.00000000e+000]))