In [31]:
import numpy as np
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
import glob
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

In [32]:
df = pd.read_csv("./capture20110818-2.binetflow.2format",sep=",")

In [33]:
df.columns

Index(['SrcAddr', 'DstAddr', 'Proto', 'Sport', 'Dport', 'State', 'sTos',
       'dTos', 'SrcWin', 'DstWin', 'sHops', 'dHops', 'StartTime', 'LastTime',
       'sTtl', 'dTtl', 'TcpRtt', 'SynAck', 'AckDat', 'SrcPkts', 'DstPkts',
       'SrcBytes', 'DstBytes', 'SAppBytes', 'DAppBytes', 'Dur', 'TotPkts',
       'TotBytes', 'TotAppByte', 'Rate', 'SrcRate', 'DstRate', 'Label'],
      dtype='object')

In [34]:
df["dHops"]

0          1.0
1          1.0
2          1.0
3          1.0
4          1.0
5          1.0
6          1.0
7          1.0
8          1.0
9          1.0
10         1.0
11         1.0
12         1.0
13         1.0
14         1.0
15         1.0
16         2.0
17         1.0
18         1.0
19         1.0
20         1.0
21         1.0
22         1.0
23         1.0
24        11.0
25         1.0
26         1.0
27         1.0
28        13.0
29         1.0
          ... 
107221     1.0
107222     1.0
107223     1.0
107224     NaN
107225     NaN
107226     NaN
107227     NaN
107228    12.0
107229     NaN
107230    13.0
107231     NaN
107232    15.0
107233     NaN
107234     NaN
107235     NaN
107236     NaN
107237     NaN
107238     NaN
107239    11.0
107240     NaN
107241     NaN
107242     NaN
107243     1.0
107244     1.0
107245     1.0
107246     1.0
107247     NaN
107248     1.0
107249     1.0
107250     1.0
Name: dHops, Length: 107251, dtype: float64

In [35]:
df = pd.read_csv("capture20110818-2.binetflow",sep=",")

In [36]:
df.columns

Index(['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'Dir', 'DstAddr',
       'Dport', 'State', 'sTos', 'dTos', 'TotPkts', 'TotBytes', 'SrcBytes',
       'Label'],
      dtype='object')

In [37]:
def newestparseline(line):
    line = line.rstrip().split("|")
    datapoint = []
    datapoint.append(line[2]) #proto
    datapoint.append(line[6]) #sport
    datapoint.append(line[8]) #srcip
    datapoint.append(line[11]) #dport
    datapoint.append(line[15]) #dstip
    datapoint.append(line[4]) #TOS
    datapoint.append(int(line[1])) #PACKETS
    datapoint.append(int(line[0])) #bytes
    datapoint.append(float(line[86])) #duration

def newparseline(line):
    line = line.rstrip().split(",")
#     print(line)
    datapoint = []
    datapoint.append(line[0])
    datapoint.append(line[1])
    datapoint.append(line[2])
    datapoint.append(line[3])
    datapoint.append(line[4])
    datapoint.append(line[5])
    datapoint.append(line[6])
    datapoint.append(line[7])
    datapoint.append(int(line[8]) if line[8] != "" else 0)
    datapoint.append(int(line[9]) if line[9] != "" else 0)
    datapoint.append(int(line[10]) if line[10] != "" else 0)
    datapoint.append(int(line[11]) if line[11] != "" else 0)
    datapoint.append(int(line[14]) if line[14] != "" else 0)
    datapoint.append(int(line[15]) if line[15] != "" else 0)
    datapoint.append(float(line[16]))
    datapoint.append(float(line[17]))
    datapoint.append(float(line[18]))
    datapoint.append(int(line[19]))
    datapoint.append(int(line[20]))
    datapoint.append(int(line[21]))
    datapoint.append(int(line[22]))
    datapoint.append(int(line[23]))
    datapoint.append(int(line[24]))
    datapoint.append(float(line[25]))
    datapoint.append(int(line[26]))
    datapoint.append(int(line[27]))
    datapoint.append(int(line[28]))
    datapoint.append(float(line[29]))
    datapoint.append(float(line[30]))
    datapoint.append(float(line[31]))
    if "From-Botnet" in line[32]:
        label = -1
    else:
        label = 1
        
    return (datapoint, label)
    
def parseline(line):
    line = line.rstrip().split(",")[1:]
    datapoint = []
    datapoint.append(line[1])
    datapoint.append(line[2])
    datapoint.append(line[3])
    datapoint.append(line[4])
    datapoint.append(line[5])
    datapoint.append(line[6])
    datapoint.append(line[7])
    datapoint.append(line[8])
    datapoint.append(line[9])
    datapoint.append(int(line[10]))
    datapoint.append(int(line[11]))
    datapoint.append(float(line[0]))
    datapoint.append(int(line[12]))
    if "From-Botnet" in line[13]:
        label = -1
    else:
        label = 1
        
    return (datapoint, label)

def parseflowdata():
    data = []
    y = []
    #for file in glob.glob("./*.binetflow"):
#     with open("capture20110818-2.binetflow.2format") as dat:
    with open("combineddatafinal") as dat:
        next(dat)
        for line in dat:
#             datapoint, label = newparseline(line)
            datapoint, label = newestparseline(line)
            data.append(datapoint)
            y.append(label)
    return (data, np.array(y))

In [38]:
X, y = parseflowdata()

TypeError: 'NoneType' object is not iterable

In [25]:
X[:10]

[['tcp',
  '76.76.172.248',
  '63577',
  '   ->',
  '147.32.84.229',
  '13363',
  'SR_SA',
  '0',
  '0',
  3,
  184,
  2.983247,
  122],
 ['tcp',
  '76.76.172.248',
  '63580',
  '   ->',
  '147.32.84.229',
  '443',
  'SR_SA',
  '0',
  '0',
  3,
  184,
  2.906029,
  122],
 ['tcp',
  '76.76.172.248',
  '63582',
  '   ->',
  '147.32.84.229',
  '80',
  'SR_SA',
  '0',
  '0',
  3,
  184,
  3.030517,
  122],
 ['tcp',
  '76.76.172.248',
  '63577',
  '   ->',
  '147.32.84.229',
  '13363',
  'SR_SA',
  '0',
  '0',
  3,
  184,
  6.016227,
  122],
 ['tcp',
  '76.76.172.248',
  '63580',
  '   ->',
  '147.32.84.229',
  '443',
  'SR_SA',
  '0',
  '0',
  3,
  184,
  6.124715,
  122],
 ['tcp',
  '76.76.172.248',
  '63582',
  '   ->',
  '147.32.84.229',
  '80',
  'SR_SA',
  '0',
  '0',
  3,
  184,
  6.015555,
  122],
 ['tcp',
  '147.32.3.51',
  '4403',
  '   ->',
  '147.32.84.46',
  '10010',
  'S_RA',
  '0',
  '0',
  4,
  244,
  1.077735,
  124],
 ['tcp',
  '147.32.85.124',
  '50408',
  '   ->',
  '147

In [26]:
labelenc = LabelEncoder()
# srcip = labelenc.fit_transform([x[0] for x in X])
# dstip = labelenc.fit_transform([x[1] for x in X])
# proto = labelenc.fit_transform([x[2] for x in X])
# sport = labelenc.fit_transform([x[3] for x in X])
# dport = labelenc.fit_transform([x[4] for x in X])
# state = labelenc.fit_transform([x[5] for x in X])
# stos = labelenc.fit_transform([x[6] for x in X])
# dtos = labelenc.fit_transform([x[7] for x in X])
# direc = labelenc.fit_transform([x[8] for x in X])
proto = labelenc.fit_transform([x[0] for x in X])
srcip = labelenc.fit_transform([x[1] for x in X])
sport = labelenc.fit_transform([x[2] for x in X])
direc = labelenc.fit_transform([x[3] for x in X])
dstip = labelenc.fit_transform([x[4] for x in X])
dport = labelenc.fit_transform([x[5] for x in X])
state = labelenc.fit_transform([x[6] for x in X])
stos = labelenc.fit_transform([x[7] for x in X])
dtos = labelenc.fit_transform([x[8] for x in X])

encodedX = []
print(proto)
for i in range(len(X)):
    #encodedX.append([X[i][0], proto[i], srcip[i], sport[i], direc[i], dstip[i], dport[i], state[i], stos[i], dtos[i], X[i][10], X[i][11], X[i][12]])
    encodedX.append([srcip[i], dstip[i], dport[i], sport[i], #direc[i], 
                     proto[i], stos[i]] + #, state[i]] + #dtos[i]] + 
                    X[i][9:12])
#     encodedX.append([X[i][0], proto[i], direc[i], X[i][10], X[i][11], X[i][12]])
#     encodedX.append([srcip[i], dstip[i], proto[i], sport[i], dport[i], state[i], stos[i], dtos[i]] + X[i][8:])

minmaxscaler = MinMaxScaler()
encodedX = minmaxscaler.fit_transform(encodedX)
enc = OneHotEncoder(categorical_features=[0,1])
encodedX = enc.fit_transform(encodedX)

[10 10 10 ... 11 11 11]


In [27]:
X_train, X_test, y_train, y_test = train_test_split(encodedX, y, test_size=0.10)

In [28]:
# X_train = X_train[y_train == 1]

In [24]:
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X_train)
y_predict = clf.predict(X_test)
accuracy_score(y_test, y_predict)

0.8614581391012494

In [29]:
clf = svm.SVC()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
accuracy_score(y_test, y_predict)

0.9261607309341786

In [57]:
rf = IsolationForest(max_samples=1.0, contamination=0.1, n_jobs=4, max_features=1.0, bootstrap=True)
rf.fit(X_train)
y_predict = rf.predict(X_test)
accuracy_score(y_test, y_predict)

0.8278016035800858

In [58]:
tp = [y_test[i] == 1 and y_predict[i] == 1 for i in range(len(y_test))]
tp = sum(tp)*1.0
fp = [y_test[i] == -1 and y_predict[i] == 1 for i in range(len(y_test))]
fp = sum(fp)*1.0
fn = [y_test[i] == 1 and y_predict[i] == -1 for i in range(len(y_test))]
fn = sum(fn)*1.0
tn = [y_test[i] == -1 and y_predict[i] == -1 for i in range(len(y_test))]
tn = sum(tn)*1.0

In [59]:
print(tp)
print(tn)
print(fp)
print(fn)

8865.0
14.0
766.0
1081.0


In [20]:
z = [i for i in range(len(y_test)) if y_test[i] == 1]
z

[8,
 13,
 47,
 69,
 80,
 84,
 96,
 130,
 136,
 141,
 144,
 146,
 147,
 157,
 179,
 200,
 206,
 215,
 227,
 228,
 232,
 261,
 276,
 278,
 280,
 285,
 287,
 295,
 299,
 325,
 362,
 370,
 372,
 374,
 402,
 411,
 421,
 428,
 429,
 432,
 490,
 523,
 530,
 550,
 579,
 592,
 599,
 634,
 635,
 650,
 659,
 671,
 676,
 686,
 694,
 707,
 720,
 729,
 736,
 739,
 749,
 768,
 788,
 808,
 809,
 812,
 831,
 834,
 845,
 867,
 871,
 895,
 937,
 966,
 982,
 983,
 987,
 1018,
 1019,
 1057,
 1075,
 1089,
 1093,
 1101,
 1123,
 1147,
 1148,
 1152,
 1171,
 1193,
 1199,
 1201,
 1203,
 1204,
 1208,
 1211,
 1217,
 1218,
 1227,
 1254,
 1282,
 1297,
 1306,
 1308,
 1380,
 1394,
 1402,
 1432,
 1451,
 1475,
 1477,
 1481,
 1482,
 1486,
 1492,
 1535,
 1547,
 1548,
 1565,
 1578,
 1582,
 1594,
 1600,
 1630,
 1643,
 1662,
 1671,
 1706,
 1707,
 1715,
 1723,
 1725,
 1731,
 1734,
 1747,
 1751,
 1757,
 1759,
 1761,
 1762,
 1766,
 1768,
 1774,
 1785,
 1818,
 1833,
 1844,
 1855,
 1856,
 1868,
 1878,
 1888,
 1915,
 1932,
 1963,


In [31]:
len([y_predict[i] for i in range(y_predict.shape[0]) if y_predict[i] != 1])

9889

In [None]:
##### sum([y_test[i] == -1 for i in range(len(y_test))])