In [1]:

import os
import logging

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model


# Log setting
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S", level=logging.INFO)

# Change display.max_rows to show all features.
pd.set_option("display.max_rows", 85)

In [2]:
DIR_PATH = "~/CICIDS_models/dataset/"
PROCESSED_DIR_PATH = "/Users/smalih/CICIDS_models/ProcessedDataset/"

In [3]:
python_df = pd.read_csv('example.csv')
python_df = pd.read_csv('rbp.csv')
py_cols = list(python_df.columns)







DATA_PATH = "~/CICIDS_models/dataset/TrafficLabelling.csv"
df = pd.read_csv(DATA_PATH)
# Show number of NaN rows
print("Removing {} rows that contains only NaN values...".format(df[df.isna().all(axis=1)].shape[0]))
# remove duplicate Fwd Header Len column
df.drop("Fwd Header Length.1", axis=1, inplace=True)
# Remove NaN rows
df = df[~ df.isna().all(axis=1)]


# with open('columns_compare.csv', 'w') as cols_compare:
#     pairs = list(zip(dataset_cols, py_cols))
#     for i in range(len(pairs)):
#         pair = pairs[i]
#         cols_compare.write(f"{i} {pair[0]}\t{pair[1]}\n")
#     # for i in range(-2,0):
#     #     cols_compare.write(f"\"\"\t{py_cols[i]}\n")

# for i in range((max(len(dataset_cols), len(py_cols)))):
#     print(dataset_cols[min(i, len(dataset_cols)-1)], py_cols[min(i, len(py_cols)-1)])


Removing 0 rows that contains only NaN values...


In [4]:
corr = df.corr(numeric_only=True).round(2)

In [5]:
# print(corr)
correlated_features = np.where(np.abs(corr) > 0.95) # select ones above the abs 0.95
correlated_features = [(corr.iloc[x,y], x, y) for x, y in zip(*correlated_features) if x != y and x < y] # avoid duplication
s_corr_list = sorted(correlated_features, key=lambda x: -abs(x[0])) # sort by correlation value

if s_corr_list == []:
    print("There are no highly correlated features with correlation above", 0.95)
else:
    for v, i, j in s_corr_list:
        cols = df.columns
        print ("%s and %s = %.3f" % (corr.index[i], corr.columns[j], v))

Flow Duration and Fwd IAT Total = 1.000
Total Fwd Packets and Total Backward Packets = 1.000
Total Fwd Packets and Total Length of Bwd Packets = 1.000
Total Fwd Packets and Subflow Fwd Packets = 1.000
Total Fwd Packets and Subflow Bwd Packets = 1.000
Total Fwd Packets and Subflow Bwd Bytes = 1.000
Total Backward Packets and Subflow Fwd Packets = 1.000
Total Backward Packets and Subflow Bwd Packets = 1.000
Total Length of Fwd Packets and Subflow Fwd Bytes = 1.000
Total Length of Bwd Packets and Subflow Fwd Packets = 1.000
Total Length of Bwd Packets and Subflow Bwd Bytes = 1.000
Fwd Packet Length Mean and Avg Fwd Segment Size = 1.000
Bwd Packet Length Mean and Avg Bwd Segment Size = 1.000
Flow IAT Max and Fwd IAT Max = 1.000
Fwd PSH Flags and SYN Flag Count = 1.000
Fwd URG Flags and CWE Flag Count = 1.000
Packet Length Mean and Average Packet Size = 1.000
RST Flag Count and ECE Flag Count = 1.000
Subflow Fwd Packets and Subflow Bwd Packets = 1.000
Subflow Fwd Packets and Subflow Bwd Byt

In [6]:
# get upper triangle of correlation matrix

upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
print(upper)
# find features with correlation greater than 0.95

to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# drop highly correlated features

new_df = df.drop(to_drop, axis=1, inplace=False)

                             Destination Port  Flow Duration  \
Destination Port                          NaN          -0.15   
Flow Duration                             NaN            NaN   
Total Fwd Packets                         NaN            NaN   
Total Backward Packets                    NaN            NaN   
Total Length of Fwd Packets               NaN            NaN   
Total Length of Bwd Packets               NaN            NaN   
Fwd Packet Length Max                     NaN            NaN   
Fwd Packet Length Min                     NaN            NaN   
Fwd Packet Length Mean                    NaN            NaN   
Fwd Packet Length Std                     NaN            NaN   
Bwd Packet Length Max                     NaN            NaN   
Bwd Packet Length Min                     NaN            NaN   
Bwd Packet Length Mean                    NaN            NaN   
Bwd Packet Length Std                     NaN            NaN   
Flow Bytes/s                            

In [7]:


def remove_strongly_correlated(df, threshold, print_columns=False):
    corr = df.corr(numeric_only=True)
    correlated_features = np.where(np.abs(corr) > threshold) # select ones above the abs threshold
    correlated_features = [(corr.iloc[x,y], x, y) for x, y in zip(*correlated_features) if x != y and x < y] # avoid duplication
    s_corr_list = sorted(correlated_features, key=lambda x: -abs(x[0])) # sort by correlation value
    to_drop = []
    if s_corr_list == []:
        print("There are no highly correlated features with correlation above", threshold)
    else:
        for v, i, j in s_corr_list:
            to_drop.append(corr.index[i])
            if print_columns:
                print ("%s and %s = %.3f" % (corr.index[i], corr.columns[j], v))

    # drop strongly correlated features
    return df.drop(to_drop, axis=1, inplace=False)

In [8]:
std = df.std(numeric_only=True)
zero_std_cols = std[std == 0].index.tolist()
zero_std_cols

  sqr = _ensure_numeric((avg - values) ** 2)


['Bwd PSH Flags',
 'Bwd URG Flags',
 'Fwd Avg Bytes/Bulk',
 'Fwd Avg Packets/Bulk',
 'Fwd Avg Bulk Rate',
 'Bwd Avg Bytes/Bulk',
 'Bwd Avg Packets/Bulk',
 'Bwd Avg Bulk Rate']

In [9]:
corr = df.corr(numeric_only=True).round(2)
correlated_features = np.where(np.abs(corr) > 0.9) # select ones above the abs 0.95
correlated_features = [(corr.iloc[x,y], x, y) for x, y in zip(*correlated_features) if x != y and x < y] # avoid duplication
s_corr_list = sorted(correlated_features, key=lambda x: -abs(x[0])) # sort by correlation value

if s_corr_list == []:
    print("There are no highly correlated features with correlation above", 0.95)
else:
    for v, i, j in s_corr_list:
        cols = df.columns
        print ("%s and %s = %.3f" % (corr.index[i], corr.columns[j], v))

Flow Duration and Fwd IAT Total = 1.000
Total Fwd Packets and Total Backward Packets = 1.000
Total Fwd Packets and Total Length of Bwd Packets = 1.000
Total Fwd Packets and Subflow Fwd Packets = 1.000
Total Fwd Packets and Subflow Bwd Packets = 1.000
Total Fwd Packets and Subflow Bwd Bytes = 1.000
Total Backward Packets and Subflow Fwd Packets = 1.000
Total Backward Packets and Subflow Bwd Packets = 1.000
Total Length of Fwd Packets and Subflow Fwd Bytes = 1.000
Total Length of Bwd Packets and Subflow Fwd Packets = 1.000
Total Length of Bwd Packets and Subflow Bwd Bytes = 1.000
Fwd Packet Length Mean and Avg Fwd Segment Size = 1.000
Bwd Packet Length Mean and Avg Bwd Segment Size = 1.000
Flow IAT Max and Fwd IAT Max = 1.000
Fwd PSH Flags and SYN Flag Count = 1.000
Fwd URG Flags and CWE Flag Count = 1.000
Packet Length Mean and Average Packet Size = 1.000
RST Flag Count and ECE Flag Count = 1.000
Subflow Fwd Packets and Subflow Bwd Packets = 1.000
Subflow Fwd Packets and Subflow Bwd Byt

In [10]:
df = remove_strongly_correlated(df, 0.95, True)

Total Fwd Packets and Subflow Fwd Packets = 1.000
Total Backward Packets and Subflow Bwd Packets = 1.000
Fwd Packet Length Mean and Avg Fwd Segment Size = 1.000
Fwd PSH Flags and SYN Flag Count = 1.000
Fwd URG Flags and CWE Flag Count = 1.000
Bwd Packet Length Mean and Avg Bwd Segment Size = 1.000
Total Length of Bwd Packets and Subflow Bwd Bytes = 1.000
Total Length of Fwd Packets and Subflow Fwd Bytes = 1.000
Total Fwd Packets and Total Backward Packets = 0.999
Total Fwd Packets and Subflow Bwd Packets = 0.999
Subflow Fwd Packets and Subflow Bwd Packets = 0.999
Total Backward Packets and Subflow Fwd Packets = 0.999
Flow Duration and Fwd IAT Total = 0.999
Flow IAT Max and Fwd IAT Max = 0.998
Packet Length Mean and Average Packet Size = 0.998
RST Flag Count and ECE Flag Count = 0.998
Total Fwd Packets and Total Length of Bwd Packets = 0.997
Total Length of Bwd Packets and Subflow Fwd Packets = 0.997
Total Fwd Packets and Subflow Bwd Bytes = 0.997
Subflow Fwd Packets and Subflow Bwd Byt

In [11]:
def remove_zero_std_features(df, print_columns=True):
    std_devs = df.std(numeric_only=True)
    zero_std_dev_cols = std_devs[std_devs == 0].index.tolist()
    if print_columns:
        zero_std_dev_cols

    # drop zero std deviation features
    return df.drop(zero_std_dev_cols, axis=1, inplace=False)    

In [12]:
def handle_inf_values(df):
    return df.replace([np.inf, -np.inf], np.nan, inplace=False)

In [13]:
def handle_missing_values(df):
    missing = df.isna().sum()

    med_flow_bytes = df['Flow Bytes/s'].median()
    new_df = df
    new_df['Flow Bytes/s'].fillna(med_flow_bytes, inplace=True)
    return new_df

In [14]:


def pre_process(df, threshold, print_columns=False):
    new_df = remove_strongly_correlated(df, threshold, print_columns)
    new_df = remove_zero_std_features(new_df, print_columns)
    return new_df 

In [15]:
# print(dataset)
df = pre_process(df, 0.95, print_columns=True)
print(df)

There are no highly correlated features with correlation above 0.95


  sqr = _ensure_numeric((avg - values) ** 2)


         Destination Port  Fwd Packet Length Min  Fwd Packet Length Std  \
0                   49188                      6                    0.0   
1                   49188                      6                    0.0   
2                   49188                      6                    0.0   
3                   49188                      6                    0.0   
4                   49486                      6                    0.0   
...                   ...                    ...                    ...   
5491115             61374                      6                    0.0   
5491116             61378                      6                    0.0   
5491117             61375                      6                    0.0   
5491118             61323                      6                    0.0   
5491119             61326                      6                    0.0   

         Bwd Packet Length Min  Bwd Packet Length Std  Flow Bytes/s  \
0                           

In [16]:
df = handle_inf_values(df)
# print(df)
df = handle_missing_values(df)

print(df)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_df['Flow Bytes/s'].fillna(med_flow_bytes, inplace=True)


         Destination Port  Fwd Packet Length Min  Fwd Packet Length Std  \
0                   49188                      6                    0.0   
1                   49188                      6                    0.0   
2                   49188                      6                    0.0   
3                   49188                      6                    0.0   
4                   49486                      6                    0.0   
...                   ...                    ...                    ...   
5491115             61374                      6                    0.0   
5491116             61378                      6                    0.0   
5491117             61375                      6                    0.0   
5491118             61323                      6                    0.0   
5491119             61326                      6                    0.0   

         Bwd Packet Length Min  Bwd Packet Length Std  Flow Bytes/s  \
0                           

In [17]:

# Standardizing the dataset
from sklearn.preprocessing import StandardScaler
features = df.drop('Label', axis=1) # what does this line do?
attacks = df['Label'] # since this still has the labels
print(features.columns)
print(len(features.columns))
print(attacks)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
# # df[df[:, :-3] < 0]

Index(['Destination Port', 'Fwd Packet Length Min', 'Fwd Packet Length Std',
       'Bwd Packet Length Min', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Min', 'Fwd IAT Total',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Min', 'Bwd IAT Total',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Min Packet Length', 'Packet Length Std',
       'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count',
       'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count',
       'ECE Flag Count', 'Down/Up Ratio', 'Average Packet Size',
       'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Bytes',
       'Subflow Bwd Bytes', 'Init_Win_bytes_forward',
       'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward',
       'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Std',
       'Id

In [18]:

from sklearn.decomposition import PCA
X = df.iloc[:,:-1]

pca = PCA(20)
pca.fit(X)
print(f'information retained: {sum(pca.explained_variance_ratio_)}')


information retained: 0.9999617463031587


In [19]:
transformed_features = pca.transform(features)

new_data = pd.DataFrame(transformed_features, columns =[f'PC{i+1}' for i in range(20)])
new_data['Label'] = attacks.values

In [20]:
feature_map = {
    'Destination Port': 'dst_port',
    'Fwd Packet Length Min': 'fwd_pkt_len_min',
    'Fwd Packet Length Std': 'fwd_pkt_len_std',
    'Bwd Packet Length Min': 'bwd_pkt_len_min', 
    'Bwd Packet Length Std': 'bwd_pkt_len_std',
    'Flow Bytes/s': 'flow_byts_s',
    'Flow IAT Mean': 'flow_iat_mean',
    'Flow IAT Std': 'flow_iat_std',
    'Flow IAT Min': 'flow_iat_min',
    'Fwd IAT Total': 'fwd_iat_tot',
    'Fwd IAT Mean': 'fwd_iat_mean',
    'Fwd IAT Std': 'fwd_iat_std',
    'Fwd IAT Min': 'fwd_iat_min',
    'Bwd IAT Total': 'bwd_iat_tot',
    'Bwd IAT Mean': 'bwd_iat_mean',
    'Bwd IAT Std': 'bwd_iat_std',
    'Bwd IAT Max': 'bwd_iat_max',
    'Bwd IAT Min': 'bwd_iat_min',
    'Fwd Header Length': 'fwd_header_len',
    'Bwd Header Length': 'bwd_header_len',
    'Fwd Packets/s': 'fwd_pkts_s',
    'Bwd Packets/s': 'bwd_pkts_s',
    'Min Packet Length': 'pkt_len_min',
    'Packet Length Std': 'pkt_len_std',
    'Packet Length Variance': 'pkt_len_var',
    'FIN Flag Count': 'fin_flag_cnt',
    'SYN Flag Count': 'syn_flag_cnt',
    'PSH Flag Count': 'psh_flag_cnt',
    'ACK Flag Count': 'ack_flag_cnt',
    'URG Flag Count': 'urg_flag_cnt',
    'CWE Flag Count': 'cwr_flag_cnt',
    'ECE Flag Count': 'ece_flag_cnt',
    'Down/Up Ratio': 'down_up_ratio',
    'Average Packet Size': 'pkt_size_avg',
    'Avg Fwd Segment Size': 'fwd_seg_size_avg',
    'Avg Bwd Segment Size': 'bwd_seg_size_avg',
    'Subflow Fwd Bytes': 'subflow_fwd_byts',
    'Subflow Bwd Bytes': 'subflow_bwd_byts',
    'Init_Win_bytes_forward': 'init_fwd_win_byts',
    'Init_Win_bytes_backward': 'init_bwd_win_byts',
    'act_data_pkt_fwd': 'fwd_act_data_pkts',
    'min_seg_size_forward': 'fwd_seg_size_min',
    'Active Mean': 'active_mean',
    'Active Std': 'active_std',
    'Active Max': 'active_max',
    'Active Min': 'active_min',
    'Idle Std': 'idle_std',
    'Idle Min': 'idle_min'
}

In [21]:

with open ('fields.txt', 'r') as fields_file:
    fields = fields_file.read().splitlines()

print(len(fields))

48


In [22]:
for i in range(len(fields)):
    if feature_map[features.columns[i]] != fields[i]:
        print(f"{feature_map[features.columns[i]]}\t{fields[i]}")

In [23]:
print(feature_map.values())

dict_values(['dst_port', 'fwd_pkt_len_min', 'fwd_pkt_len_std', 'bwd_pkt_len_min', 'bwd_pkt_len_std', 'flow_byts_s', 'flow_iat_mean', 'flow_iat_std', 'flow_iat_min', 'fwd_iat_tot', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_min', 'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_header_len', 'bwd_header_len', 'fwd_pkts_s', 'bwd_pkts_s', 'pkt_len_min', 'pkt_len_std', 'pkt_len_var', 'fin_flag_cnt', 'syn_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt', 'cwr_flag_cnt', 'ece_flag_cnt', 'down_up_ratio', 'pkt_size_avg', 'fwd_seg_size_avg', 'bwd_seg_size_avg', 'subflow_fwd_byts', 'subflow_bwd_byts', 'init_fwd_win_byts', 'init_bwd_win_byts', 'fwd_act_data_pkts', 'fwd_seg_size_min', 'active_mean', 'active_std', 'active_max', 'active_min', 'idle_std', 'idle_min'])


In [24]:
df.to_csv('dataset/dataset.csv', index=False)

In [25]:
Y

NameError: name 'Y' is not defined