In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the CSV into `raw`, wrap it as the working frame `data`, and preview the first rows.
csv_path = "data/log2.csv"
raw = pd.read_csv(csv_path)
data = pd.DataFrame(raw)
data.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


In [3]:
# Work on a copy so the cleaning steps below leave `data` untouched.
df = data.copy()

Missing Values

In [4]:
# Total number of missing cells across all columns (per-column counts summed again).
df.isna().sum().sum()

0

Single Values

In [5]:
# A column is "single valued" when every row equals the value found in the first row.
first_row = df.iloc[0]
is_constant = df.eq(first_row).all()
single_vals = df.columns[is_constant].tolist()

print(f"There are {len(single_vals)} columns with single values: {single_vals}")
print(f"Original data dimension: {df.shape}")

There are 0 columns with single values: []
Original data dimension: (65532, 12)


Duplicates

In [6]:
# Count fully duplicated rows before removing them. The count is printed explicitly:
# a bare expression that is not the last line of a cell is evaluated and then discarded,
# so the original cell never showed how many rows were dropped.
n_duplicates = int(df.duplicated().sum())

# Rebind instead of mutating in place; the original index labels are kept (no reset).
df = df.drop_duplicates()

print(f"Removed {n_duplicates} duplicate rows; remaining shape: {df.shape}")

In [7]:
# Class balance of the target column `Action`.
df['Action'].value_counts()

allow         37439
drop          11635
deny           8042
reset-both       54
Name: Action, dtype: int64

Data Types

In [8]:
# Inspect the dtype of every column to spot non-numeric ones.
df.dtypes

Source Port              int64
Destination Port         int64
NAT Source Port          int64
NAT Destination Port     int64
Action                  object
Bytes                    int64
Bytes Sent               int64
Bytes Received           int64
Packets                  int64
Elapsed Time (sec)       int64
pkts_sent                int64
pkts_received            int64
dtype: object

Label-Encode Target Variable / Train-Test Split

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode the string target as integer codes, keep every other column as a feature,
# then hold out 20% of the rows for testing (fixed seed for repeatability).
df1 = df.copy()
le = LabelEncoder()
y = le.fit_transform(df1['Action'])
X = df1.drop(columns=['Action'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

In [10]:
# Integer code of each label is its position in `classes_`:
# allow = 0, deny = 1, drop = 2, reset-both = 3
le.classes_

array(['allow', 'deny', 'drop', 'reset-both'], dtype=object)

Random Forest Benchmark

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with default hyperparameters (seeded) as the reference score.
clf = RandomForestClassifier(random_state=9).fit(X_train, y_train)
preds = clf.predict(X_test)
print(accuracy_score(y_test, preds))

0.9977260801119469


SVM

In [12]:
from sklearn import multiclass
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Degree-4 polynomial-kernel SVC with a one-vs-one shaped decision function.
svc = SVC(
    C=1,
    kernel='poly',
    degree=4,
    decision_function_shape='ovo',
    random_state=10,
)

In [13]:
# Fit the polynomial SVC on the training split and score it on the held-out split.
preds = svc.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, preds)

0.6606611859366801

In [35]:
# param_grid = {
#     'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
#     'C' : [.01, .1, 1, 10, 100]
# }

In [None]:
# #don't run
# svc = SVC(degree=4, decision_function_shape='ovo', random_state=9)

# from sklearn.model_selection import GridSearchCV
# model_gs = GridSearchCV(
#     estimator = svc,
#     param_grid = param_grid,
#     scoring = 'accuracy',
#     n_jobs =-1,
#     cv=5,
#     refit=True
# )

# model_gs.fit(X, y)

In [37]:
# # also don't run
# best = 0

# for kernel in ['linear','poly','rbf','sigmoid']:
#     for C in [0.001,0.01, 0.1,1,10,100]:
#         if kernel=='poly':
#             svm=SVC(C=C, kernel=kernel, degree=4, decision_function_shape='ovo', random_state=9)
#         else:
#             svm = SVC(C=C, kernel=kernel, decision_function_shape='ovo', random_state=9)
#         svm.fit(X_train,y_train)
#         score=svm.score(X_test,y_test)
#         if score > best:
#             best = score
#             best_parameters = {'C':C, 'kernel': kernel}

# print(f'Best score: {best}')
# print(f'Best parameters: {best_parameters}')

In [15]:
# Linear-kernel SVC with C=0.1: fit on the training split, score on the held-out split.
svm = SVC(
    C=.1,
    kernel='linear',
    decision_function_shape='ovo',
    random_state=9,
).fit(X_train, y_train)
preds = svm.predict(X_test)
accuracy_score(y_test, preds)

0.9929158649641421

SGD (VW)

In [None]:
def to_vw_format(row):
    """Render one labelled row as a Vowpal Wabbit text-format example.

    Parameters
    ----------
    row : pandas.Series
        Must contain a ``y`` entry holding the class label; every other entry
        is emitted as a ``name:value`` feature. Index labels must be strings.

    Returns
    -------
    str
        ``"<label> | <name>:<value> ..."``, where the label is ``int(row["y"])``
        and each feature name has spaces replaced by ``_`` and parentheses removed.
    """
    res = f"{int(row['y'])} |"
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for idx, value in row.drop(["y"]).items():
        # Strip characters from the column name so each feature stays a single token.
        feature_name = idx.replace(" ", "_").replace("(", "").replace(")", "")
        res += f" {feature_name}:{value}"
    return res

In [None]:
import vowpalwabbit

# Attach the target as column `y`, shifted by +1 so the 0-based encoder codes become 1-based labels.
train_vw = X_train.assign(y=y_train + 1)
test_vw = X_test.assign(y=y_test + 1)

print(train_vw.apply(to_vw_format, axis=1))

28268    1 | Source_Port:45350 Destination_Port:443 NAT...
1630     1 | Source_Port:47152 Destination_Port:443 NAT...
13943    3 | Source_Port:64034 Destination_Port:445 NAT...
39290    1 | Source_Port:56498 Destination_Port:80 NAT_...
13694    3 | Source_Port:64235 Destination_Port:445 NAT...
                               ...                        
48603    1 | Source_Port:54676 Destination_Port:443 NAT...
25970    3 | Source_Port:49383 Destination_Port:445 NAT...
503      1 | Source_Port:51946 Destination_Port:80 NAT_...
63979    2 | Source_Port:5223 Destination_Port:1210 NAT...
24014    3 | Source_Port:61630 Destination_Port:445 NAT...
Length: 45736, dtype: object


In [None]:
# NOTE(review): labels in `y` run 1..4 (four encoder classes + 1), so `--oaa 4` looks sufficient — confirm.
vw = vowpalwabbit.Workspace(
    "--oaa 5 --quiet --learning_rate 100 -q :: --cubic ::: --loss_function hinge --random_seed 10"
)

# Online learning: each training example is shown to the model once, in frame order.
for vw_line in train_vw.apply(to_vw_format, axis=1):
    vw.learn(vw_line)

# Predict a class for every held-out example.
predictions = [vw.predict(vw_line) for vw_line in test_vw.apply(to_vw_format, axis=1)]

# Fraction of held-out rows whose predicted class equals the true label.
accuracy = len(test_vw[test_vw.y == predictions]) / len(test_vw)
print(f"Model accuracy {accuracy}")

Model accuracy 0.9920412803918139


In [None]:
# Sanity check: number of predictions produced vs. number of training rows.
# NOTE(review): predictions are made on test_vw, so len(test_vw) may be the intended comparison — confirm.
print(len(predictions), len(train_vw))

11434 45736


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Grid search over VW loss function, L2 regularisation and learning rate.
# Each configuration is trained on train_vw and scored on test_vw; one record per
# configuration is collected and the `results` frame is built once at the end
# (DataFrame.append was removed in pandas 2.0 and row-wise growth is quadratic).
# NOTE(review): hyperparameters are ranked on the test split, so the best score is optimistic.

# The VW text lines do not depend on the hyperparameters, so format them once.
train_lines = train_vw.apply(to_vw_format, axis=1).tolist()
test_lines = test_vw.apply(to_vw_format, axis=1).tolist()

counter = 0
records = []

for loss in ['hinge', 'logistic']:
    for reg in ['0.0', '0.001', '0.01', '0.10', '1.0', '10', '100', '1000']:
        for learning_rate in ['.001', '.01', '.1', '1.0', '5.0', '10', '100']:
            # Build the argument string directly; no eval() is needed to call the constructor.
            vw1 = vowpalwabbit.Workspace(
                f"--passes 1000 --cache --oaa 5 --quiet -l {learning_rate} --l2 {reg} "
                f"-q :: --cubic ::: --loss_function {loss} --random_seed 10"
            )

            # learn
            for sample in train_lines:
                vw1.learn(sample)

            # predict
            predictions = [vw1.predict(sample) for sample in test_lines]

            accuracy = len(test_vw[test_vw.y == predictions]) / len(test_vw)
            counter += 1

            print(counter, f"Accuracy: {accuracy}", f"Loss Function: {loss}", f"L2: {reg}", f"Learning Rate: {learning_rate}")
            records.append({
                'Model Accuracy': accuracy,
                'Loss Function': loss,
                'L2 Regularization': reg,
                'Learning Rate': learning_rate,
            })

results = pd.DataFrame(
    records,
    columns=['Model Accuracy', 'Loss Function', 'L2 Regularization', 'Learning Rate'],
)

1 Accuracy: 0.6606611859366801 Loss Function: hinge L2: 0.0 Learning Rate: .001
2 Accuracy: 0.9699142907119118 Loss Function: hinge L2: 0.0 Learning Rate: .01
3 Accuracy: 0.9846073115270246 Loss Function: hinge L2: 0.0 Learning Rate: .1
4 Accuracy: 0.9909043204477873 Loss Function: hinge L2: 0.0 Learning Rate: 1.0
5 Accuracy: 0.9916914465628827 Loss Function: hinge L2: 0.0 Learning Rate: 5.0
6 Accuracy: 0.9913416127339514 Loss Function: hinge L2: 0.0 Learning Rate: 10
7 Accuracy: 0.9917789050201155 Loss Function: hinge L2: 0.0 Learning Rate: 100
8 Accuracy: 0.6606611859366801 Loss Function: hinge L2: 0.001 Learning Rate: .001
9 Accuracy: 0.9668532447087633 Loss Function: hinge L2: 0.001 Learning Rate: .01
10 Accuracy: 0.9804967640370824 Loss Function: hinge L2: 0.001 Learning Rate: .1
11 Accuracy: 0.9835578100402309 Loss Function: hinge L2: 0.001 Learning Rate: 1.0
12 Accuracy: 0.9790099702641245 Loss Function: hinge L2: 0.001 Learning Rate: 5.0
13 Accuracy: 0.9781353856917964 Loss Fun

In [None]:
# Rank every grid-search run from highest to lowest accuracy.
# NOTE(review): the saved table lists a 'squared' loss and a learning rate of 1000, neither of
# which appears in the grid defined in this notebook — the output looks stale; re-run to confirm.
results.sort_values(by='Model Accuracy', ascending=False)

Unnamed: 0,Model Accuracy,Loss Function,L2 Regularization,Learning Rate
6,0.991866,squared,0.0,100
7,0.991866,squared,0.0,1000
70,0.991779,hinge,0.0,100
68,0.991691,hinge,0.0,5.0
71,0.991691,hinge,0.0,1000
...,...,...,...,...
94,0.0007,hinge,0.10,100
106,0.0007,hinge,10,.1
49,0.0007,squared,100,.01
120,0.0007,hinge,1000,.001
