In [1]:
import numpy as np
import pandas as pd


In [2]:
# Read the log CSV from disk; `raw` keeps the parser's output as-is.
raw = pd.read_csv("data/log2.csv")
# Re-wrap the parsed table as the notebook's `data` frame and preview it.
data = pd.DataFrame(raw)
data.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


In [3]:
# Work on an independent deep copy so that `data` stays as loaded.
df = data.copy(deep=True)

Missing Values

In [7]:
# Count missing entries per column, then collapse to one grand total.
missing_per_column = df.isna().sum()
missing_per_column.sum()

0

Single Values

In [8]:
# A column counts as "single-valued" when every row equals the first row's value.
reference_row = df.iloc[0]
matches_first_row = df == reference_row
constant_mask = matches_first_row.all(axis=0)
single_vals = df.columns[constant_mask].tolist()

print(f"There are {len(single_vals)} columns with single values: {single_vals}")
print(f"Original data dimension: {df.shape}")

There are 0 columns with single values: []
Original data dimension: (65532, 12)


Duplicates

In [9]:
# Count fully duplicated rows before removing them. A bare expression that is
# not the last line of a cell is never displayed, so report it explicitly.
n_duplicates = df.duplicated().sum()
print(f"Duplicated rows removed: {n_duplicates}")

# Reassign rather than using inplace=True so the step is an explicit
# transformation; the original row index labels are kept (no reset).
df = df.drop_duplicates()

In [10]:
# Class balance of the target column: number of rows per Action label.
action_column = df['Action']
action_column.value_counts()

allow         37439
drop          11635
deny           8042
reset-both       54
Name: Action, dtype: int64

Data Types

In [11]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

Source Port              int64
Destination Port         int64
NAT Source Port          int64
NAT Destination Port     int64
Action                  object
Bytes                    int64
Bytes Sent               int64
Bytes Received           int64
Packets                  int64
Elapsed Time (sec)       int64
pkts_sent                int64
pkts_received            int64
dtype: object

Label-Encode Target Variable / Split

In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Modelling copy, so the cleaned `df` is not touched by this step.
df1 = df.copy()

# Integer-encode the string target; the fitted encoder `le` keeps the label order.
le = LabelEncoder()
y = le.fit_transform(df1['Action'])

# Every remaining column is used as a feature.
X = df1.drop('Action', axis=1)

# 80/20 hold-out split with a fixed seed.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=9
)

In [13]:
# The integer code of each label is its position in classes_:
# allow = 0, deny = 1, drop = 2, reset-both = 3
le.classes_

array(['allow', 'deny', 'drop', 'reset-both'], dtype=object)

Random Forest Benchmark

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: a default random forest (seeded) trained on the training split
# and scored by accuracy on the held-out split.
clf = RandomForestClassifier(random_state=9).fit(X_train, y_train)
preds = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=preds))

0.9977260801119469


SVM

In [15]:
from sklearn import multiclass
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Initial SVM configuration: degree-4 polynomial kernel, C = 1,
# one-vs-one decision function, fixed seed.
svc = SVC(
    kernel='poly',
    degree=4,
    C=1,
    decision_function_shape='ovo',
    random_state=10,
)

In [16]:
# Fit the SVM on the training split and score it on the held-out split.
# NOTE(review): the features reach the SVM without any scaling step in this
# notebook — worth confirming whether standardizing them was intended.
preds = svc.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.6606611859366801

In [None]:
# Hyperparameter grid for the SVC search: every kernel is combined with every C.
# Dict entries are written key: value (the previous `key = value` form was a
# SyntaxError, so this cell could never run).
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [.01, .1, 1, 10, 100],
}

In [None]:
# don't run
# This cell only constructs the (unfitted) grid-search object; nothing here
# calls fit. The missing comma after the `scoring` argument (a SyntaxError)
# has been restored.
# NOTE(review): presumably marked "don't run" because the exhaustive search is
# slow — confirm.
from sklearn.model_selection import GridSearchCV
model_gs = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    refit=True,
)

In [57]:
from sklearn.base import clone

# Manual grid search: mean 5-fold cross-validated accuracy for each (kernel, C).
# Each candidate is an unfitted clone configured through set_params, so the
# `svc` object fitted earlier keeps the hyperparameters it was trained with
# (assigning svc.kernel / svc.C directly left it in a mismatched state).
# NOTE(review): cross-validation runs on the full X, y, which includes the rows
# held out as X_test — confirm this is intended for model selection.
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for c_value in [.01, .1, 1, 10, 100]:
        candidate = clone(svc).set_params(kernel=kernel, C=c_value)
        mean_accuracy = cross_val_score(candidate, X, y, scoring='accuracy', cv=5).mean()
        print(kernel, c_value, mean_accuracy)

linear 0.01 0.9914850268489976
linear 0.1 0.9916986624697721
