In [1]:
import pandas as pd
import numpy as np

In [2]:
fw_df = pd.read_csv("log2.csv")

In [3]:
fw_df.columns

Index(['Source Port', 'Destination Port', 'NAT Source Port',
       'NAT Destination Port', 'Action', 'Bytes', 'Bytes Sent',
       'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent',
       'pkts_received'],
      dtype='object')

- Must one-hot encode 'Source Port', 'Destination Port', 'NAT Source Port', and 'NAT Destination Port',
- Don't need to use cross validation

In [4]:
fw_df.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


In [5]:
# target
fw_df["Action"].value_counts()

allow         37640
deny          14987
drop          12851
reset-both       54
Name: Action, dtype: int64

In [6]:
fw_df.describe()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
count,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0
mean,49391.969343,10577.385812,19282.972761,2671.04993,97123.95,22385.8,74738.15,102.866,65.833577,41.39953,61.466505
std,15255.712537,18466.027039,21970.689669,9739.162278,5618439.0,3828139.0,2463208.0,5133.002,302.461762,3218.871288,2223.332271
min,0.0,0.0,0.0,0.0,60.0,60.0,0.0,1.0,0.0,1.0,0.0
25%,49183.0,80.0,0.0,0.0,66.0,66.0,0.0,1.0,0.0,1.0,0.0
50%,53776.5,445.0,8820.5,53.0,168.0,90.0,79.0,2.0,15.0,1.0,1.0
75%,58638.0,15000.0,38366.25,443.0,752.25,210.0,449.0,6.0,30.0,3.0,2.0
max,65534.0,65535.0,65535.0,65535.0,1269359000.0,948477200.0,320881800.0,1036116.0,10824.0,747520.0,327208.0


In [7]:
fw_df["Source Port"].value_counts()

58638    840
27005    513
443      273
57470    222
49418    210
        ... 
32996      1
60574      1
60570      1
65368      1
54871      1
Name: Source Port, Length: 22724, dtype: int64

Unique Counts per columns.

In [8]:
fw_df.nunique()

Source Port             22724
Destination Port         3273
NAT Source Port         29152
NAT Destination Port     2533
Action                      4
Bytes                   10724
Bytes Sent               6683
Bytes Received           8814
Packets                  1116
Elapsed Time (sec)        915
pkts_sent                 749
pkts_received             922
dtype: int64

No missing data

In [9]:
fw_df.isna().sum()

Source Port             0
Destination Port        0
NAT Source Port         0
NAT Destination Port    0
Action                  0
Bytes                   0
Bytes Sent              0
Bytes Received          0
Packets                 0
Elapsed Time (sec)      0
pkts_sent               0
pkts_received           0
dtype: int64

## EDA

## Preprocessing

In [10]:
fw_df.columns

Index(['Source Port', 'Destination Port', 'NAT Source Port',
       'NAT Destination Port', 'Action', 'Bytes', 'Bytes Sent',
       'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent',
       'pkts_received'],
      dtype='object')

allow         37640
deny          14987
drop          12851
reset-both       54

Rename target as integers

In [11]:
fw_df["Action"] = fw_df["Action"].replace({'allow':1,'deny':2,'drop':3,'reset-both':4})

In [12]:
fw_df["Action"].value_counts()

1    37640
2    14987
3    12851
4       54
Name: Action, dtype: int64

One-Hot-Encode

In [13]:
df = pd.get_dummies(fw_df, prefix=['Source Port', 'Destination Port', 'NAT Source Port','NAT Destination Port'], columns=['Source Port', 'Destination Port', 'NAT Source Port','NAT Destination Port'], drop_first=True)

In [14]:
df.head()

Unnamed: 0,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received,Source Port_5,Source Port_20,...,NAT Destination Port_64932,NAT Destination Port_65054,NAT Destination Port_65100,NAT Destination Port_65128,NAT Destination Port_65253,NAT Destination Port_65264,NAT Destination Port_65387,NAT Destination Port_65427,NAT Destination Port_65534,NAT Destination Port_65535
0,1,177,94,83,2,30,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,4768,1600,3168,19,17,10,9,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,238,118,120,2,1199,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3327,1438,1889,15,17,8,7,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,25358,6778,18580,31,16,13,18,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X = df.drop("Action", axis = 1)
y = df["Action"].values

## Train Test Split

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from scipy.sparse import csr_matrix
from sklearn.model_selection import cross_val_score

In [17]:
standard_transformer = Pipeline(steps=[
        ('standard', StandardScaler())])

preprocessor = ColumnTransformer(
        remainder='passthrough', #passthough features not listed
        transformers=[
            ('std', standard_transformer , ['Bytes', 'Bytes Sent',
       'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent','pkts_received'])])

In [18]:
X_full = preprocessor.fit_transform(X) 

In [19]:
X_full.shape

(65532, 57685)

In [20]:
X_full = csr_matrix(X_full)

In [122]:
X_full

<65532x57685 sparse matrix of type '<class 'numpy.float64'>'
	with 663642 stored elements in Compressed Sparse Row format>

## Standard Scale

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=4)

In [20]:
# X_train_scaled = preprocessor.fit_transform(X_train)
# X_train_scaled = csr_matrix(X_train_scaled)
# X_train_scaled

In [22]:
# X_test_scaled = preprocessor.fit_transform(X_test)
# X_test_scaled = csr_matrix(X_test_scaled)
# X_test_scaled

In [27]:
# from sklearn.linear_model import LogisticRegression
# log = LogisticRegression()
# log.fit(X_train_scaled, y_train)
# log.predict(X_test_scaled)
# score = log.score(X_test_scaled, y_test)
# print(score)

0.9949649069270674


In [21]:
# from sklearn.svm import LinearSVC
# lsvc = LinearSVC(max_iter=10000)
# lsvc.fit(X_train_scaled, y_train)
# lsvc.predict(X_test_scaled)
# score = lsvc.score(X_test_scaled, y_test)

In [22]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

skf_cv = StratifiedKFold(n_splits=10)
lsvc = LinearSVC(max_iter=10000)

# Models

## Linear SVC

In [30]:
%%time
lvsc_mean_acc = cross_val_score(lsvc,X_full,y, cv = skf_cv)

CPU times: user 18.2 s, sys: 62.5 ms, total: 18.3 s
Wall time: 18.3 s


In [31]:
print(np.mean(lvsc_mean_acc))

0.9939266112174427


## SVC 

In [36]:
from sklearn.svm import SVC
svc = SVC()

In [33]:
%%time
svc_mean_acc= cross_val_score(svc,X_full,y, cv = skf_cv)

CPU times: user 6min 25s, sys: 6.86 s, total: 6min 31s
Wall time: 6min 32s


In [34]:
print(np.mean(svc_mean_acc))

0.995986654857757


## SGD Classifier

In [113]:
from sklearn.linear_model import SGDClassifier

skf_cv = StratifiedKFold(n_splits=10)
sgd = SGDClassifier(early_stopping=True, max_iter=1000)

In [114]:
sgd_mean_acc = cross_val_score(sgd,X_full,y, cv = skf_cv, scoring = 'accuracy')

In [115]:
print(sgd_mean_acc)

[0.98413183 0.99694843 0.99694796 0.9919121  0.99313292 0.99282771
 0.98901267 0.99069129 0.99069129 0.98916527]


In [116]:
print(np.mean(sgd_mean_acc))

0.9915461455782644


In [52]:
from sklearn.model_selection import GridSearchCV

In [118]:
param_grid = { 
    'learning_rate': ['adaptive','constant','invscaling'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha' : [0.03, 0.05,0.2, 0],
    'class_weight' : ["balanced", None],
    'eta0': [0.02]
}
CV_sgd = GridSearchCV(estimator=sgd, param_grid=param_grid, cv=skf_cv,scoring="accuracy")
CV_sgd.fit(X_full, y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=SGDClassifier(early_stopping=True),
             param_grid={'alpha': [0.03, 0.05, 0.2, 0],
                         'class_weight': ['balanced', None], 'eta0': [0.02],
                         'learning_rate': ['adaptive', 'constant',
                                           'invscaling'],
                         'penalty': ['l2', 'l1', 'elasticnet']},
             scoring='accuracy')

In [119]:
CV_sgd.best_estimator_

SGDClassifier(alpha=0, early_stopping=True, eta0=0.02, learning_rate='adaptive',
              penalty='elasticnet')

In [120]:
bestest_sgd = CV_sgd.best_estimator_

In [121]:
CV_sgd.best_params_

{'alpha': 0,
 'class_weight': None,
 'eta0': 0.02,
 'learning_rate': 'adaptive',
 'penalty': 'elasticnet'}

In [78]:
#sgd = cross_val_score(bestest_sgd,X_full,y, cv = skf_cv)

In [79]:
sgd

array([0.98382667, 0.99481233, 0.99679536, 0.99023348, 0.99236991,
       0.99008088, 0.98794445, 0.98748665, 0.98870746, 0.98672364])

In [80]:
np.mean(sgd)

0.9898980827254832

In [81]:
new_sgd = SGDClassifier(early_stopping=True, eta0=0.02, learning_rate='adaptive',
              max_iter=1000)

cross_val_score(new_sgd,X_full,y, cv = skf_cv)

array([0.98352151, 0.99465975, 0.99694796, 0.99023348, 0.9920647 ,
       0.9920647 , 0.98763925, 0.98931787, 0.98886006, 0.98718144])

In [83]:
means_sgd = cross_val_score(new_sgd,X_full,y, cv = skf_cv)

In [84]:
print(np.mean(means_sgd))

0.9904322032118478


In [55]:
sgd = SGDClassifier(early_stopping=True, max_iter=1000)

In [56]:
param_grid = { 
    'learning_rate': ['adaptive','constant','invscaling'],
    'penalty': [None],
    'class_weight' : [None],
    'eta0': [0.02, 0.03, 0.04, 0.05]
}
CV_sgd = GridSearchCV(estimator=sgd, param_grid=param_grid, cv=skf_cv,scoring="accuracy")
CV_sgd.fit(X_full, y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=SGDClassifier(early_stopping=True),
             param_grid={'class_weight': [None],
                         'eta0': [0.02, 0.03, 0.04, 0.05],
                         'learning_rate': ['adaptive', 'constant',
                                           'invscaling'],
                         'penalty': [None]},
             scoring='accuracy')

In [57]:
CV_sgd.best_estimator_

SGDClassifier(early_stopping=True, eta0=0.04, learning_rate='constant',
              penalty=None)

In [58]:
mean_sgd_2=cross_val_score(CV_sgd.best_estimator_,X_full,y, cv = skf_cv)

In [59]:
np.mean(mean_sgd_2)

0.990706870264342

In [63]:
from sklearn.linear_model import SGDClassifier

skf_cv = StratifiedKFold(n_splits=10)
final_sto = SGDClassifier(early_stopping=True, eta0=0.04, learning_rate='constant',
              penalty=None)

In [64]:
mean_sgd_2=cross_val_score(final_sto,X_full,y, cv = skf_cv)

In [65]:
np.mean(mean_sgd_2)

0.9906305670050933