In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load log2.csv into the working DataFrame used throughout the notebook.
fw_df = pd.read_csv("log2.csv")

In [3]:
# List the column names available in the loaded data.
fw_df.columns

Index(['Source Port', 'Destination Port', 'NAT Source Port',
       'NAT Destination Port', 'Action', 'Bytes', 'Bytes Sent',
       'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent',
       'pkts_received'],
      dtype='object')

- Must one-hot encode 'Source Port', 'Destination Port', 'NAT Source Port', and 'NAT Destination Port'.
- Cross-validation is not required.

In [4]:
# Preview the first five rows.
fw_df.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


In [5]:
# Target column: count how many rows fall into each Action class.
fw_df["Action"].value_counts()

allow         37640
deny          14987
drop          12851
reset-both       54
Name: Action, dtype: int64

In [6]:
# Summary statistics for the numeric columns.
fw_df.describe()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
count,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0
mean,49391.969343,10577.385812,19282.972761,2671.04993,97123.95,22385.8,74738.15,102.866,65.833577,41.39953,61.466505
std,15255.712537,18466.027039,21970.689669,9739.162278,5618439.0,3828139.0,2463208.0,5133.002,302.461762,3218.871288,2223.332271
min,0.0,0.0,0.0,0.0,60.0,60.0,0.0,1.0,0.0,1.0,0.0
25%,49183.0,80.0,0.0,0.0,66.0,66.0,0.0,1.0,0.0,1.0,0.0
50%,53776.5,445.0,8820.5,53.0,168.0,90.0,79.0,2.0,15.0,1.0,1.0
75%,58638.0,15000.0,38366.25,443.0,752.25,210.0,449.0,6.0,30.0,3.0,2.0
max,65534.0,65535.0,65535.0,65535.0,1269359000.0,948477200.0,320881800.0,1036116.0,10824.0,747520.0,327208.0


In [7]:
# Frequency of each distinct Source Port value, most common first.
fw_df["Source Port"].value_counts()

58638    840
27005    513
443      273
57470    222
49418    210
        ... 
32996      1
60574      1
60570      1
65368      1
54871      1
Name: Source Port, Length: 22724, dtype: int64

Unique value counts per column.

In [8]:
# Number of distinct values in each column.
fw_df.nunique()

Source Port             22724
Destination Port         3273
NAT Source Port         29152
NAT Destination Port     2533
Action                      4
Bytes                   10724
Bytes Sent               6683
Bytes Received           8814
Packets                  1116
Elapsed Time (sec)        915
pkts_sent                 749
pkts_received             922
dtype: int64

No missing data

In [9]:
# Count missing values per column.
fw_df.isna().sum()

Source Port             0
Destination Port        0
NAT Source Port         0
NAT Destination Port    0
Action                  0
Bytes                   0
Bytes Sent              0
Bytes Received          0
Packets                 0
Elapsed Time (sec)      0
pkts_sent               0
pkts_received           0
dtype: int64

## EDA

## Preprocessing

In [10]:
# Re-list the columns before preprocessing.
fw_df.columns

Index(['Source Port', 'Destination Port', 'NAT Source Port',
       'NAT Destination Port', 'Action', 'Bytes', 'Bytes Sent',
       'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent',
       'pkts_received'],
      dtype='object')

allow         37640
deny          14987
drop          12851
reset-both       54

Encode the target labels as integers

In [11]:
# Integer class id for each Action label.
action_codes = {
    'allow': 1,
    'deny': 2,
    'drop': 3,
    'reset-both': 4,
}
# Overwrite the Action column in place with the integer codes.
fw_df["Action"] = fw_df["Action"].replace(action_codes)

In [12]:
# Check the class counts again after the label-to-integer replacement.
fw_df["Action"].value_counts()

1    37640
2    14987
3    12851
4       54
Name: Action, dtype: int64

One-Hot-Encode

In [14]:
# Port columns are one-hot encoded; each distinct port value becomes its own
# indicator column, prefixed with the originating column name.
port_cols = [
    'Source Port',
    'Destination Port',
    'NAT Source Port',
    'NAT Destination Port',
]
# drop_first=True omits the first level of every encoded column.
df = pd.get_dummies(
    fw_df,
    columns=port_cols,
    prefix=port_cols,
    drop_first=True,
)

In [15]:
# Preview the one-hot encoded frame.
df.head()

Unnamed: 0,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received,Source Port_5,Source Port_20,...,NAT Destination Port_64932,NAT Destination Port_65054,NAT Destination Port_65100,NAT Destination Port_65128,NAT Destination Port_65253,NAT Destination Port_65264,NAT Destination Port_65387,NAT Destination Port_65427,NAT Destination Port_65534,NAT Destination Port_65535
0,1,177,94,83,2,30,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,4768,1600,3168,19,17,10,9,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,238,118,120,2,1199,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3327,1438,1889,15,17,8,7,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,25358,6778,18580,31,16,13,18,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Features: every column except the target, as an independent copy.
X = df.drop(columns="Action").copy()
# Target: the Action codes as a standalone NumPy array.
y = df["Action"].to_numpy().copy()

## Train Test Split

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=4,
)

In [32]:
# Display the training feature matrix.
X_train

Unnamed: 0,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received,Source Port_5,Source Port_20,Source Port_22,...,NAT Destination Port_64932,NAT Destination Port_65054,NAT Destination Port_65100,NAT Destination Port_65128,NAT Destination Port_65253,NAT Destination Port_65264,NAT Destination Port_65387,NAT Destination Port_65427,NAT Destination Port_65534,NAT Destination Port_65535
19504,734,102,632,2,30,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40372,62,62,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4520,164,78,86,2,30,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12095,7613,2230,5383,22,360,11,11,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39758,501,150,351,2,1200,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55169,60,60,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49861,62,62,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27063,70,70,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8366,66,66,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Display the training feature matrix.
X_train

Unnamed: 0,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received,Source Port_5,Source Port_20,Source Port_22,...,NAT Destination Port_64932,NAT Destination Port_65054,NAT Destination Port_65100,NAT Destination Port_65128,NAT Destination Port_65253,NAT Destination Port_65264,NAT Destination Port_65387,NAT Destination Port_65427,NAT Destination Port_65534,NAT Destination Port_65535
19504,734,102,632,2,30,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40372,62,62,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4520,164,78,86,2,30,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12095,7613,2230,5383,22,360,11,11,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39758,501,150,351,2,1200,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55169,60,60,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49861,62,62,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27063,70,70,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8366,66,66,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Standard Scale

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [26]:
# Columns to standardise. Every column not listed here is passed through
# unchanged by the ColumnTransformer below.
numeric_features = [
    'Bytes',
    'Bytes Sent',
    'Bytes Received',
    'Packets',
    'Elapsed Time (sec)',
    'pkts_sent',
    'pkts_received',
]

# Single-step pipeline wrapping the scaler.
standard_transformer = Pipeline(steps=[('standard', StandardScaler())])

# Apply the scaler to the listed columns only; pass the rest through as-is.
preprocessor = ColumnTransformer(
    transformers=[('std', standard_transformer, numeric_features)],
    remainder='passthrough',
)

In [28]:
# Fit the preprocessor on the training split and transform it in one step.
X_train_scaled = preprocessor.fit_transform(X_train)

In [33]:
# Transform the test split with the preprocessor already fitted on the
# training data. Calling fit_transform here would re-fit the scaler on the
# test set, scaling it with its own mean/variance (inconsistent with training)
# and leaking test-set statistics into the preprocessing.
X_test_scaled = preprocessor.transform(X_test)

In [35]:
# Check the (rows, columns) shape of the transformed test matrix.
X_test_scaled.shape

(6554, 57685)

## Models

## Grid Search