In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("firewall_data.csv")

In [6]:
# check for null values

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65532 entries, 0 to 65531
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Source Port           65532 non-null  int64 
 1   Destination Port      65532 non-null  int64 
 2   NAT Source Port       65532 non-null  int64 
 3   NAT Destination Port  65532 non-null  int64 
 4   Action                65532 non-null  object
 5   Bytes                 65532 non-null  int64 
 6   Bytes Sent            65532 non-null  int64 
 7   Bytes Received        65532 non-null  int64 
 8   Packets               65532 non-null  int64 
 9   Elapsed Time (sec)    65532 non-null  int64 
 10  pkts_sent             65532 non-null  int64 
 11  pkts_received         65532 non-null  int64 
dtypes: int64(11), object(1)
memory usage: 6.0+ MB


In [3]:
df.columns

Index(['Source Port', 'Destination Port', 'NAT Source Port',
       'NAT Destination Port', 'Action', 'Bytes', 'Bytes Sent',
       'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent',
       'pkts_received'],
      dtype='object')

In [5]:
# descriptive analysis
df.describe()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
count,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0
mean,49391.969343,10577.385812,19282.972761,2671.04993,97123.95,22385.8,74738.15,102.866,65.833577,41.39953,61.466505
std,15255.712537,18466.027039,21970.689669,9739.162278,5618439.0,3828139.0,2463208.0,5133.002,302.461762,3218.871288,2223.332271
min,0.0,0.0,0.0,0.0,60.0,60.0,0.0,1.0,0.0,1.0,0.0
25%,49183.0,80.0,0.0,0.0,66.0,66.0,0.0,1.0,0.0,1.0,0.0
50%,53776.5,445.0,8820.5,53.0,168.0,90.0,79.0,2.0,15.0,1.0,1.0
75%,58638.0,15000.0,38366.25,443.0,752.25,210.0,449.0,6.0,30.0,3.0,2.0
max,65534.0,65535.0,65535.0,65535.0,1269359000.0,948477200.0,320881800.0,1036116.0,10824.0,747520.0,327208.0


In [8]:
#see the output classes

df.Action.value_counts()

allow         37640
deny          14987
drop          12851
reset-both       54
Name: Action, dtype: int64

In [9]:
# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test= train_test_split(df[['Source Port', 'Destination Port', 'NAT Source Port',
       'NAT Destination Port', 'Bytes', 'Bytes Sent',
       'Bytes Received', 'Packets', 'Elapsed Time (sec)', 'pkts_sent',
       'pkts_received']], df[['Action']], test_size=0.2, random_state=55)

In [10]:
# model creation

from sklearn.tree import DecisionTreeClassifier

model=DecisionTreeClassifier(criterion='entropy',max_depth=8,min_samples_split=10,min_samples_leaf=10)

In [11]:
#train the model 


model.fit(X_train,Y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_leaf=10,
                       min_samples_split=10)

In [12]:
#predict on the test data

pred=model.predict(X_test)

In [13]:
#find test accuracy and f1 score

from sklearn.metrics import accuracy_score, f1_score

acc=accuracy_score(Y_test,pred)
f=f1_score(Y_test,pred, average='weighted')

In [14]:
print("Accuracy=",acc)
print("F1-score=",f)

Accuracy= 0.9982452124818799
F1-score= 0.9977121692476818


In [None]:
# since we are getting a very high accuracy there can be a chance that the model is overfitted, 
#so to tackle that k fold cross validation will be used

In [20]:
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(criterion='entropy',max_depth=8,min_samples_split=10,min_samples_leaf=10)
scores = cross_val_score(clf, X_train, Y_train, cv=10)

print("Average validation accuracy:",np.mean(scores))

Average validation accuracy: 0.9982641850986722


In [None]:
# so it can be seen that our model is not overfitted since validation accuracy is pretty similar to test accuracy
#It is rather a good fit.