*****
## Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Make the parent directory importable so the local `src` package resolves.
# The membership check keeps sys.path from gaining a duplicate entry every
# time this cell is re-run.
project_root = os.path.abspath("../")  # get the parent directory
if project_root not in sys.path:
    sys.path.append(project_root)

import importlib
from src.data_processing import preprocess, reduce_memory_size

# Re-import `preprocess` so edits made to it on disk are picked up without a
# kernel restart. Only this module is reloaded here.
importlib.reload(preprocess)

# NOTE(review): star imports hide which module each helper used below comes
# from; prefer explicit names once the set of helpers is settled.
from src.data_processing.preprocess import *
from src.data_processing.reduce_memory_size import *

**Configuring pandas display limits**

In [3]:
# Lift every pandas display cap (column count, row count, cell width) so that
# frames are rendered without truncation.
for display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)  # None means "no limit"

*****
## Dataset Import

In [4]:
# Build the input path from the repository root (the same "../" directory the
# import cell adds to sys.path) instead of a drive-specific absolute path, so
# the notebook runs from any checkout location.
raw_data_path = os.path.join(os.path.abspath("../"), "data", "raw", "cic-ids2017.csv")
df = load_data(raw_data_path)

In [5]:
# Create a new `status` column that contains 'safe' or 'malicious', derived from the label column.
df = create_status(df)

In [6]:
# Dimensions (rows, columns) of the loaded dataset.
df.shape

(1854979, 79)

In [7]:
# reduce the memory size of the dataframe
df = reduce_memory_size(df)

# Write the result under the repository root (the same "../" directory the
# import cell adds to sys.path) rather than a drive-specific absolute path.
file_path = os.path.join(os.path.abspath("../"), "data", "modified", "cic-ids2017.csv")

try:
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    df.to_csv(file_path, index=False)
    print("The modified dataset is saved successfully!")
except Exception as e:
    # Best-effort save: report the failure and let the notebook continue with
    # the in-memory frame.
    print(f"Error saving the modified dataset: {e}")

Initial Memory Size: 1282.79 MB
Updated Memory Size: 718.46 MB
Memory Usage Reduced by: 564.33 MB
this is: 56.01% of the initial size
The modified dataset is saved successfully!


*****
## Class Mapping

In [8]:
# Collapse the fine-grained dataset labels into broader attack categories.
attack_mapping = {
    "Benign": "Benign",
    "DDoS": "DDoS",
    "DoS Hulk": "DoS",
    "DoS GoldenEye": "DoS",
    "DoS slowloris": "DoS",
    "DoS Slowhttptest": "DoS",
    "FTP-Patator": "Brute Force Attack",
    "SSH-Patator": "Brute Force Attack",
    # NOTE(review): Infiltration is grouped with the brute-force labels - confirm this is intended.
    "Infiltration": "Brute Force Attack",
    "Web Attack – Brute Force": "Web Attack",
    "Web Attack – XSS": "Web Attack",
    "Web Attack – Sql Injection": "Web Attack",
    "Bot": "Botnet",
    "PortScan": "Port Scanning Attack",
    "Heartbleed": "Heartbleed"
}

# map the attack labels to the attack categories
df['Attack Type'] = df["Label"].map(attack_mapping)

# Series.map yields NaN for any label missing from the dict, and NaN rows
# would pass the Heartbleed filter below and reach the encoder unnoticed.
# Fail loudly instead so a label spelling mismatch is caught here.
unmapped_labels = df.loc[df["Attack Type"].isna(), "Label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Labels missing from attack_mapping: {sorted(str(label) for label in unmapped_labels)}"
    )

# remove the label column (reassign rather than mutate in place)
df = df.drop(columns=["Label"])

# remove samples whose attack type is 'Heartbleed' since there are only a few of them
df = df[df["Attack Type"] != "Heartbleed"]

*****
## Data Cleaning

In [10]:
# Remove duplicate values via the project helper and keep the returned frame.
df = remove_duplicate_values(df)

In [13]:
# Constant columns are identified by the project helper and then removed.
constant_columns = find_constant_columns(df)
df = df.drop(columns=constant_columns)

In [15]:
# Identical/duplicate columns are reported by the project helper; only the
# names returned in the third position are removed from the frame.
(
    identical_columns,
    original_columns,
    duplicate_columns,
) = find_duplicate_columns(df)
df = df.drop(columns=duplicate_columns)

In [16]:
# Columns whose variance is below this cut-off are removed.
LOW_VARIANCE_THRESHOLD = 0.01
low_variance_columns = find_low_variance_columns(df, LOW_VARIANCE_THRESHOLD)
df = df.drop(columns=low_variance_columns)

In [17]:
# Dimensions (rows, columns) of the frame after the cleaning steps.
df.shape

(1776123, 62)

*****
## Categorical Encoding

In [18]:
# List the column names that remain in the frame.
df.columns

Index(['Protocol', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Fwd Packets Length Total',
       'Bwd Packets Length Total', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Down/Up Ratio',
       'Avg P

In [25]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per target column, so each keeps its own `classes_`
# table for translating integer codes back to names.
label_encoder = LabelEncoder().fit(df["Attack Type"])
status_encoder = LabelEncoder().fit(df["status"])

# Replace the string categories with their integer codes.
df["Attack Type"] = label_encoder.transform(df["Attack Type"])
df["status"] = status_encoder.transform(df["status"])

In [None]:
# Lookup from attack-category name to the integer code assigned by the fitted
# `label_encoder` (values are NumPy integers as returned by transform()).
class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Same lookup with plain Python ints. The original zipped the class names with
# the encoder object itself, which raises TypeError because a LabelEncoder is
# not iterable.
attack_type_mapping = {
    attack_name: int(code) for attack_name, code in class_mapping.items()
}


In [28]:
# Number of rows per value of the 'Attack Type' column.
df['Attack Type'].value_counts()

Attack Type
0    1439642
4     193745
3     128014
2       9186
6       2143
5       1956
1       1437
Name: count, dtype: int64

In [26]:
# Preview the first rows of the frame as a sanity check.
df.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,Down/Up Ratio,Avg Packet Size,Subflow Fwd Bytes,Subflow Bwd Bytes,Init Fwd Win Bytes,Init Bwd Win Bytes,Fwd Act Data Packets,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,status,Attack Type
0,6,112740690,32,16,6448,1152,403,0,201.5,204.724197,72,72,72.0,0.0,67.411331,0.425756,2398738.0,5798698.0,16400000,3,113000000,3636796.5,6848761.0,16400000,3,113000000,7516023.0,8323385.0,16400000,3,1,1024,512,0.283837,0.141919,0,403,163.326538,178.931717,32016.559,0,0,1,0,0,166.729172,6448,1152,377,2079,15,32,359.4286,11.99802,380,343,16100000.0,498804.8,16400000,15400000,1,0
1,6,112740560,32,16,6448,5056,403,0,201.5,204.724197,316,316,316.0,0.0,102.039585,0.425756,2398735.2,5798709.5,16400000,2,113000000,3636792.2,6848777.0,16400000,2,113000000,7516016.0,8323376.0,16400000,4,1,1024,512,0.283838,0.141919,0,403,243.0,174.716919,30526.0,0,0,1,0,0,248.0625,6448,5056,955,2079,15,32,320.2857,15.74499,330,285,16100000.0,498793.66,16400000,15400000,1,0
2,0,113757377,545,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,4.790898,209112.83,1395543.4,20800000,0,114000000,209112.83,1395543.4,20800000,0,0,0.0,0.0,0,0,0,0,0,4.790898,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0,-1,-1,0,0,9361829.0,7324646.0,18900000,19,12200000.0,6935824.0,20800000,5504997,1,0
3,17,100126,22,0,616,0,28,28,28.0,0.0,0,0,0.0,0.0,6152.248167,219.723149,4767.905,21833.02,100055,1,100126,4767.905,21833.02,100055,1,0,0.0,0.0,0,0,0,704,0,219.72314,0.0,28,28,28.0,0.0,0.0,0,0,0,0,0,29.272728,616,0,-1,-1,21,32,0.0,0.0,0,0,0.0,0.0,0,0,1,0
4,0,54760,4,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,73.046019,18253.334,30469.836,53431,108,54760,18253.334,30469.836,53431,108,0,0.0,0.0,0,0,0,0,0,73.04602,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0,-1,-1,0,0,0.0,0.0,0,0,0.0,0.0,0,0,1,0
