In [13]:
import numpy as np
import pandas as pd


In [14]:
raw = pd.read_csv("data/log2.csv")
data = pd.DataFrame(data = raw)
data.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


In [15]:
df = data.copy()

Missing Values

In [16]:
df.isna().sum().sum()

0

Single Values

In [17]:
single_vals = df.columns[df.eq(df.iloc[0]).all()].tolist()
print(f"There are {len(single_vals)} columns with single values: {single_vals}")
print(f"Original data dimension: {df.shape}")

There are 0 columns with single values: []
Original data dimension: (65532, 12)


Duplicates

In [19]:
df.duplicated().sum()
df.drop_duplicates(inplace=True)
df.shape

(57170, 12)

In [20]:
df['Action'].value_counts()

allow         37439
drop          11635
deny           8042
reset-both       54
Name: Action, dtype: int64

Data Types

In [24]:
df.dtypes

Source Port              int32
Destination Port         int32
NAT Source Port          int32
NAT Destination Port     int32
Action                  object
Bytes                    int32
Bytes Sent               int32
Bytes Received           int32
Packets                  int32
Elapsed Time (sec)       int32
pkts_sent                int32
pkts_received            int32
dtype: object

In [22]:
df['pkts_sent'].dtype

dtype('int64')

In [23]:
for i in df.columns:
    if df[i].dtype == 'int64':
        df[i] = df[i].astype(np.int32)

OHE Target Variable / Split

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
df1 = df.copy()
le = LabelEncoder()
y = df1['Action']
y = le.fit_transform(y)
X = df1.drop(columns=['Action'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 9)

In [10]:
# allow = 0, deny = 1, drop = 2, reset-both =3
le.classes_

array(['allow', 'deny', 'drop', 'reset-both'], dtype=object)

Random Forest Benchmark

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
clf = RandomForestClassifier(random_state=9)
clf.fit(X_train,y_train)
preds = clf.predict(X_test)
print(accuracy_score(y_test, preds))

0.9977260801119469


SVM

In [12]:
from sklearn import multiclass
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
svc = SVC(C=1, kernel='poly', degree=4, decision_function_shape='ovo', random_state=10)

In [13]:
svc.fit(X_train, y_train)
preds = svc.predict(X_test)
accuracy_score(y_test, preds)

0.6606611859366801

In [35]:
param_grid = {
    'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
    'C' : [.01, .1, 1, 10, 100]
}

In [None]:
#don't run
svc = SVC(degree=4, decision_function_shape='ovo', random_state=9)

from sklearn.model_selection import GridSearchCV
model_gs = GridSearchCV(
    estimator = svc,
    param_grid = param_grid,
    scoring = 'accuracy',
    n_jobs =-1,
    cv=5,
    refit=True
)

model_gs.fit(X, y)

In [37]:
# also don't run
best = 0

for kernel in ['linear','poly','rbf','sigmoid']:
    for C in [0.001,0.01, 0.1,1,10,100]:
        if kernel=='poly':
            svm=SVC(C=C, kernel=kernel, degree=4, decision_function_shape='ovo', random_state=9)
        else:
            svm = SVC(C=C, kernel=kernel, decision_function_shape='ovo', random_state=9)
        svm.fit(X_train,y_train)
        score=svm.score(X_test,y_test)
        if score > best:
            best = score
            best_parameters = {'C':C, 'kernel': kernel}

print(f'Best score: {best}')
print(f'Best parameters: {best_parameters}')

In [15]:
svm = SVC(C=.1, kernel='linear', decision_function_shape='ovo', random_state=9)
svm.fit(X_train, y_train)
preds = svm.predict(X_test)
accuracy_score(y_test, preds)

0.9929158649641421

In [16]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(X_train, y_train)

In [18]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': False}

In [19]:
rf2 = RandomForestClassifier(n_estimators=1000,min_samples_split=5,min_samples_leaf=2,max_features='sqrt',max_depth=100,bootstrap=False,random_state=9)
rf2.fit(X_train, y_train)
preds = rf2.predict(X_test)
accuracy_score(y_test,preds)

0.9987755815987406

In [20]:
param_grid={
    'bootstrap' : [False],
    'max_depth' : [100,110,120,130],
    'max_features' : ['sqrt'],
    'min_samples_split' : [4,5,6],
    'min_samples_leaf' : [1,2,3],
    'n_estimators' : [800,1000,1100,1300]
}

In [22]:
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1)

In [23]:
grid_search.fit(X_train, y_train)

In [25]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 110,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 1000}

In [24]:
preds = grid_search.predict(X_test)
accuracy_score(y_test, preds)

0.9987755815987406