# Data Loading

In [1]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 80)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

import time

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/cicids2017/MachineLearningCSV.md5
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv


In [2]:
df_1 = pd.read_csv('/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
df_2 = pd.read_csv('/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
df_3 = pd.read_csv('/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv')
df_4 = pd.read_csv('/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv')
df_5 = pd.read_csv('/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv')
df_6 = pd.read_csv('/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv')
df_7 = pd.read_csv('/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv')
df_8 = pd.read_csv('/kaggle/input/cicids2017/MachineLearningCSV/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv')

# 1.0 Data Preprocessing

In [3]:
# get shapes of each dataframe's
for i in range(1, 9):
    df_name = f'df_{i}'
    current_df = globals()[df_name]
    print(f"{df_name}: {current_df.shape}")

df_1: (225745, 79)
df_2: (286467, 79)
df_3: (191033, 79)
df_4: (529918, 79)
df_5: (288602, 79)
df_6: (170366, 79)
df_7: (445909, 79)
df_8: (692703, 79)


## 1.1 Data Concatenation, dropping of duplicated rows and features with constant values

In [4]:
# Concatenates all data frames
main_df = pd.concat([df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8])
main_df = main_df.reset_index()


# Drop duplicates: same observation but multiple occurences
main_df_1 = main_df.drop_duplicates(keep='first')


# Drop columns that have just one unique value, the model wont learn form columns like this.
one_value = main_df_1.columns[main_df_1.nunique() == 1]
main_df_2 = main_df_1.drop(columns = one_value, axis=1)


# Fill nan values
main_df_2['Flow Bytes/s'] = main_df_2['Flow Bytes/s'].fillna(main_df_2['Flow Bytes/s'].mean())


# Remove leading space character in all feature names
main_df_2.rename(columns=lambda x: x.lstrip(), inplace=True)


main_df_2.shape

(2830731, 72)

In [5]:
# Checking numbers of missing values on the df
main_df_2.isna().sum().sum()

0

# 1.2 Extract subsample of data
Getting a subsample of the data, while taking every unique values in the Label(Target) column into consideration.

In [6]:
# Get sub sample of the dataframe, so as to reduce computation time.


# Sort Label target to unique values
heartbleed = main_df_2[main_df_2['Label'] == 'Heartbleed']
Web_Attack_Sql_Injection = main_df_2[main_df_2['Label'] == 'Web Attack � Sql Injection']
Infiltration = main_df_2[main_df_2['Label'] == 'Infiltration']
Web_Attack_XSS = main_df_2[main_df_2['Label'] == 'Web Attack � XSS']
Web_Attack_Brute_Force = main_df_2[main_df_2['Label'] == 'Web Attack � Brute Force']
bot = main_df_2[main_df_2['Label'] == 'bot']
DoS_Slowhttptest = main_df_2[main_df_2['Label'] == 'DoS Slowhttptest']
DoS_slowloris = main_df_2[main_df_2['Label'] == 'DoS slowloris']
SSH_Patator = main_df_2[main_df_2['Label'] == 'SSH-Patator']
FTP_Patator = main_df_2[main_df_2['Label'] == 'FTP-Patator']
DoS_GoldenEye = main_df_2[main_df_2['Label'] == 'DoS GoldenEye']
DDoS = main_df_2[main_df_2['Label'] == 'DDoS']
PortScan = main_df_2[main_df_2['Label'] == 'PortScan']
DoS_Hulk = main_df_2[main_df_2['Label'] == 'DoS Hulk']
BENIGN = main_df_2[main_df_2['Label'] == 'BENIGN']


# extracting 5000 rows of data, from features/columns with over 5000 observations.
DoS_Slowhttptest_sample = DoS_Slowhttptest.sample(n=5000)
DoS_slowloris_sample = DoS_slowloris.sample(n=5000)
SSH_Patator_sample = SSH_Patator.sample(n=5000)
FTP_Patator_sample = FTP_Patator.sample(n=5000)
DoS_GoldenEye_sample = DoS_GoldenEye.sample(n=5000)
DDoS_sample = DDoS.sample(n=5000)
PortScan_sample = PortScan.sample(n=5000)
DoS_Hulk_sample = DoS_Hulk.sample(n=5000)
BENIGN_sample = BENIGN.sample(n=5000)


# Concate sub sample observations to a dataframe
sample_df = pd.concat([heartbleed, Web_Attack_Sql_Injection, Infiltration, Web_Attack_XSS, Web_Attack_Brute_Force, 
                       bot, DoS_Slowhttptest_sample, DoS_slowloris_sample, SSH_Patator_sample, FTP_Patator_sample, 
                       DoS_GoldenEye_sample, DDoS_sample, PortScan_sample, DoS_Hulk_sample, BENIGN_sample])
sample_df.shape

(47227, 72)

## 1.3 Label Encoder and Data Scaling

In [7]:
sample_df_1 = sample_df.copy()


le = LabelEncoder()
sample_df_1['Label'] = le.fit_transform(sample_df_1['Label'])

# Data Spliting

In [8]:
X = sample_df_1.drop(columns=['Label', 'Flow Packets/s', 'Flow Bytes/s'], axis=1)
y = sample_df_1['Label']

# Initial train test split set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Further split training set into training (70%) and validation (10%) sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42, stratify=y_train)

# Display the shapes of the resulting sets
print("Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (34002, 69)
Validation set shape: (3779, 69)
Testing set shape: (9446, 69)


In [9]:
%%time
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_y_pred = knn.predict(X_test)
accuracy_score(y_test, knn_y_pred)

CPU times: user 5.74 s, sys: 26.4 ms, total: 5.77 s
Wall time: 2.09 s


0.9616769002752488

In [10]:
%%time
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_y_pred = knn.predict(X_test)
accuracy_score(y_test, dt_y_pred)

CPU times: user 7.23 s, sys: 8.7 ms, total: 7.24 s
Wall time: 3.69 s


0.9616769002752488

In [11]:
%%time
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)
accuracy_score(y_test, rf_y_pred)

CPU times: user 8.61 s, sys: 6.58 ms, total: 8.61 s
Wall time: 8.64 s


0.997353377090832