# Data Preprocessing
- Load both 2017, 2018 original and corrected CICIDS datasets
- Combine all the portions of each dataset into one dataframe
- Rename Columns
- Remove duplicate and missing value rows
- Reclassify the data labels to ['Benign', 'Attack']
- Encode Labels
- Remove duplicate column for 2017 original dataset - Fwd Header Length.1
- Extract features by random forest (feature importance)
- Generate data that contains only the selected features
- Fix Data Types

### Load both 2017, 2018 original and corrected CICIDS datasets

In [3]:
# Import Necessary Libraries
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob

In [17]:
# Convert every original CICIDS2017 CSV under '2017/' to a Parquet file.
df2017_original_files = glob.glob('2017/*.csv')

for file in df2017_original_files:
    df = pd.read_csv(file)
    # file[:-4] drops the 4-character '.csv' suffix matched by the glob, so the
    # Parquet file is written next to its source CSV under the same base name.
    df.to_parquet(f'{file[:-4]}.parquet', compression=None)

In [18]:
# Convert every corrected CICIDS2017 CSV under 'improved_2017/' to a Parquet file.
df2017_corrected_files = glob.glob('improved_2017/*.csv')

for file in df2017_corrected_files:
    df = pd.read_csv(file)
    # file[:-4] drops the 4-character '.csv' suffix matched by the glob, so the
    # Parquet file is written next to its source CSV under the same base name.
    df.to_parquet(f'{file[:-4]}.parquet', compression=None)

In [2]:
# Convert every original CSE-CIC-IDS2018 CSV under '2018/' to a Parquet file.
df2018_original_files = glob.glob('2018/*.csv')

for file in df2018_original_files:
    # Unlike the 2017 conversions, these files are read with low_memory=False.
    df = pd.read_csv(file, low_memory=False)
    # file[:-4] drops the 4-character '.csv' suffix matched by the glob, so the
    # Parquet file is written next to its source CSV under the same base name.
    df.to_parquet(f'{file[:-4]}.parquet', compression=None)

Experiment: time how long it takes to read the same file as CSV and as Parquet, to check whether the conversion actually speeds up loading.

In [11]:
# Wall-clock time for loading one 2018 capture from CSV.
# `time` is imported here rather than in the notebook's import cell.
import time
start = time.time()

# The loaded frame is discarded; only the elapsed time matters here.
pd.read_csv('2018/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv')

print(f'time for reading csv file: {time.time() - start}')

time for reading csv file: 12.51607632637024


In [12]:
# Wall-clock time for loading the same 2018 capture from Parquet, for
# comparison with the CSV timing cell. Relies on `time` already being imported.
start = time.time()

# The loaded frame is discarded; only the elapsed time matters here.
pd.read_parquet('2018/Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet', engine="fastparquet")

# Label fixed: this cell measures the Parquet read, not the CSV read.
print(f'time for reading parquet file: {time.time() - start}')



time for reading csv file: 2.0231380462646484


In [4]:
# Load the first file of each dataset. Only the 2017 original frame is loaded;
# the other three loads are commented out, so cells that reference
# df2017_corrected, df2018_original or df2018_corrected need the matching line
# re-enabled before they can run on a fresh kernel.
df2017_original = pd.read_parquet('2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.parquet')
# df2018_original = pd.read_parquet('2018/Friday-02-03-2018_TrafficForML_CICFlowMeter.parquet')
# df2017_corrected = pd.read_parquet('improved_2017/friday.parquet')
# NOTE(review): the line below passes a .csv path to read_parquet - confirm the intended reader/path.
# df2018_corrected = pd.read_parquet('../resources/Improved/CSECICIDS2018_improved/Friday-02-03-2018.csv')


In [5]:
# Report the dimensions of the loaded frame(s). The commented-out variants
# reuse the names row_2017/col_2017 and row_2018/col_2018 for the other datasets.
row_2017, col_2017 = df2017_original.shape
print(f'There are {col_2017} columns, {row_2017} rows in CICIDS2017')
# row_2018, col_2018 = df2018_original.shape
# print(f'There are {col_2018} columns, {row_2018} rows in CICIDS2018')
# row_2017, col_2017 = df2017_corrected.shape
# print(f'There are {col_2017} columns, {row_2017} rows in CICIDS2017')
# row_2018, col_2018 = df2018_corrected.shape
# print(f'There are {col_2018} columns, {row_2018} rows in CICIDS2018')

There are 79 columns, 225745 rows in CICIDS2017


In [6]:
# Preview the first five rows of the 2017 original frame.
df2017_original.head(5)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [5]:
# Preview the first five rows of the 2017 corrected frame
# (requires df2017_corrected to have been loaded).
df2017_corrected.head(5)

Unnamed: 0,id,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label,Attempted Category
0,1,192.168.10.50-192.168.10.3-56108-3268-6,192.168.10.50,56108,192.168.10.3,3268,6,2017-07-07 11:59:50.315195,112740690,32,...,343,16105400.0,498804.8,16399772,15375229,-1,-1,112740690,BENIGN,-1
1,2,192.168.10.50-192.168.10.3-42144-389-6,192.168.10.50,42144,192.168.10.3,389,6,2017-07-07 11:59:50.316273,112740560,32,...,285,16105430.0,498793.7,16399782,15375263,-1,-1,112740560,BENIGN,-1
2,3,8.6.0.1-8.0.6.4-0-0-0,8.6.0.1,0,8.0.6.4,0,0,2017-07-07 12:00:31.388567,113757377,545,...,19,12210360.0,6935824.0,20757030,5504997,-1,-1,0,BENIGN,-1
3,4,192.168.10.25-224.0.0.251-5353-5353-17,192.168.10.25,5353,224.0.0.251,5353,17,2017-07-07 12:00:42.903850,91997219,388,...,16,13197640.0,5826905.0,19776791,5817470,-1,-1,0,BENIGN,-1
4,5,192.168.10.25-17.253.14.125-123-123-17,192.168.10.25,123,17.253.14.125,123,17,2017-07-07 12:00:42.430758,66966070,6,...,1968172,64974430.0,0.0,64974431,64974431,-1,-1,0,BENIGN,-1


In [4]:
# Preview the first five rows of the 2018 original frame
# (requires df2018_original to have been loaded).
df2018_original.head(5)

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,02/03/2018 08:47:38,141385,9,7,553,3773,202,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,49684,6,02/03/2018 08:47:38,281,2,1,38,0,38,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,443,6,02/03/2018 08:47:40,279824,11,15,1086,10527,385,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443,6,02/03/2018 08:47:40,132,2,0,0,0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443,6,02/03/2018 08:47:41,274016,9,13,1285,6141,517,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [3]:
# Preview the first five rows of the 2018 corrected frame.
# NOTE(review): the recorded run of this cell raised NameError because
# df2018_corrected had not been defined - load it first or remove this cell.
df2018_corrected.head(5)

NameError: name 'df2018_corrected' is not defined

In [7]:
# Print the raw column names of each dataset so they can be compared before
# renaming. Requires df2017_corrected and df2018_original to be loaded as well.
print(f'2017 original columns: {df2017_original.columns}')
print(f'2017 corrected columns: {df2017_corrected.columns}')
print(f'2018 original columns: {df2018_original.columns}')
# print(f'2018 corrected columns: {df2018_corrected.columns}')

2017 original columns: Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Pac

In [3]:
# Show per-column non-null counts and dtypes of the 2017 original frame.
df2017_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225745 entries, 0 to 225744
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             225745 non-null  int64  
 1    Flow Duration                225745 non-null  int64  
 2    Total Fwd Packets            225745 non-null  int64  
 3    Total Backward Packets       225745 non-null  int64  
 4   Total Length of Fwd Packets   225745 non-null  int64  
 5    Total Length of Bwd Packets  225745 non-null  int64  
 6    Fwd Packet Length Max        225745 non-null  int64  
 7    Fwd Packet Length Min        225745 non-null  int64  
 8    Fwd Packet Length Mean       225745 non-null  float64
 9    Fwd Packet Length Std        225745 non-null  float64
 10  Bwd Packet Length Max         225745 non-null  int64  
 11   Bwd Packet Length Min        225745 non-null  int64  
 12   Bwd Packet Length Mean       225745 non-nul

### Combine all the portions of each dataset into one dataframe

In [7]:
# Load the remaining original CICIDS2017 files, one frame per capture file.
# d0_2017_o is an alias of (not a copy of) the frame that was loaded first.
# NOTE(review): df2017_original is later rebound to the concatenated frame, so
# re-running this cell after that would make d0_2017_o point at the combined data.
d0_2017_o = df2017_original #the first row portion is already imported so we will just copy that
d1_2017_o = pd.read_parquet('2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.parquet')
d2_2017_o = pd.read_parquet('2017/Friday-WorkingHours-Morning.pcap_ISCX.parquet')
d3_2017_o = pd.read_parquet('2017/Monday-WorkingHours.pcap_ISCX.parquet')
d4_2017_o = pd.read_parquet('2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.parquet')
d5_2017_o = pd.read_parquet('2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.parquet')
d6_2017_o = pd.read_parquet('2017/Tuesday-WorkingHours.pcap_ISCX.parquet')
d7_2017_o = pd.read_parquet('2017/Wednesday-workingHours.pcap_ISCX.parquet')

In [4]:
# Load the remaining corrected CICIDS2017 files, one frame per day.
# d0_2017_c is an alias of (not a copy of) df2017_corrected, which must already be loaded.
d0_2017_c = df2017_corrected #the first row portion is already imported so we will just copy that
d1_2017_c = pd.read_parquet('improved_2017/monday.parquet')
d2_2017_c = pd.read_parquet('improved_2017/thursday.parquet')
d3_2017_c = pd.read_parquet('improved_2017/tuesday.parquet')
d4_2017_c = pd.read_parquet('improved_2017/wednesday.parquet')

In [5]:
# Load the remaining original CSE-CIC-IDS2018 files, one frame per capture day.
# d0_2018_o is an alias of (not a copy of) df2018_original, which must already be loaded.
d0_2018_o = df2018_original #the first row portion is already imported so we will just copy that
d1_2018_o = pd.read_parquet('2018/Friday-16-02-2018_TrafficForML_CICFlowMeter.parquet')
d2_2018_o = pd.read_parquet('2018/Friday-23-02-2018_TrafficForML_CICFlowMeter.parquet')
d3_2018_o = pd.read_parquet('2018/Thuesday-20-02-2018_TrafficForML_CICFlowMeter.parquet')
# Four identifier columns are dropped from this file only - presumably the
# other 2018 files do not contain them; TODO confirm.
d3_2018_o.drop(columns=['Flow ID', 'Src IP', 'Src Port', 'Dst IP'], axis=1,inplace=True)
d4_2018_o = pd.read_parquet('2018/Thursday-01-03-2018_TrafficForML_CICFlowMeter.parquet')
d5_2018_o = pd.read_parquet('2018/Thursday-15-02-2018_TrafficForML_CICFlowMeter.parquet')
d6_2018_o = pd.read_parquet('2018/Thursday-22-02-2018_TrafficForML_CICFlowMeter.parquet')
d7_2018_o = pd.read_parquet('2018/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.parquet')
d8_2018_o = pd.read_parquet('2018/Wednesday-21-02-2018_TrafficForML_CICFlowMeter.parquet')
d9_2018_o = pd.read_parquet('2018/Wednesday-28-02-2018_TrafficForML_CICFlowMeter.parquet')

In [None]:
# Load the remaining corrected CSE-CIC-IDS2018 files. These are read straight
# from CSV rather than from Parquet like the other datasets.
# NOTE(review): df2018_corrected must be defined first; its load is commented
# out in the loading cell, and this cell has no recorded execution.
d0_2018_c = df2018_corrected #the first row portion is already imported so we will just copy that
d1_2018_c = pd.read_csv('../resources/Improved/CSECICIDS2018_improved/Friday-16-02-2018.csv')
d2_2018_c = pd.read_csv('../resources/Improved/CSECICIDS2018_improved/Friday-23-02-2018.csv')
d3_2018_c = pd.read_csv('../resources/Improved/CSECICIDS2018_improved/Thursday-01-03-2018.csv')
d4_2018_c = pd.read_csv('../resources/Improved/CSECICIDS2018_improved/Thursday-15-02-2018.csv')
d5_2018_c = pd.read_csv('../resources/Improved/CSECICIDS2018_improved/Thursday-22-02-2018.csv')
d6_2018_c = pd.read_csv('../resources/Improved/CSECICIDS2018_improved/Tuesday-20-02-2018.csv')
d7_2018_c = pd.read_csv('../resources/Improved/CSECICIDS2018_improved/Wednesday-14-02-2018.csv')
d8_2018_c = pd.read_csv('../resources/Improved/CSECICIDS2018_improved/Wednesday-21-02-2018.csv')
d9_2018_c = pd.read_csv('../resources/Improved/CSECICIDS2018_improved/Wednesday-28-02-2018.csv')

In [None]:
# Sanity check before concatenation: every column of the first 2017 original
# file should exist in all eight 2017 original frames with the same dtype.
columns = d0_2017_o.columns
frames_2017_o = [d0_2017_o, d1_2017_o, d2_2017_o, d3_2017_o,
                 d4_2017_o, d5_2017_o, d6_2017_o, d7_2017_o]

for c in columns:
    if all(c in frame.columns for frame in frames_2017_o):
        # Collapse the per-frame dtypes into a set: one element means they all agree.
        dtypes = {frame[c].dtype for frame in frames_2017_o}
        if len(dtypes) == 1:
            print("The data types of '{}' in all dataframes are the same: {}".format(c, next(iter(dtypes))))
        else:
            # List the conflicting dtypes so the mismatch can be acted on.
            print("The data types of '{}' differ across dataframes: {}".format(c, sorted(str(d) for d in dtypes)))
    else:
        print("The column '{}' does not exist in one or more dataframes.".format(c))


In [8]:
# Combine the per-file frames of each dataset into a single frame with a fresh
# 0..n-1 index. Only the 2017 original dataset is combined here; the other
# three concatenations are commented out.
df2017_original = pd.concat([d0_2017_o, d1_2017_o, d2_2017_o, d3_2017_o, d4_2017_o, d5_2017_o, d6_2017_o, d7_2017_o], ignore_index=True)
# df2017_corrected = pd.concat([d0_2017_c, d1_2017_c, d2_2017_c, d3_2017_c, d4_2017_c], ignore_index=True)
# df2018_original = pd.concat([d0_2018_o, d1_2018_o, d2_2018_o, d3_2018_o, d4_2018_o, d5_2018_o, d6_2018_o, d7_2018_o, d8_2018_o, d9_2018_o], ignore_index=True)
# df2018_corrected = pd.concat([d0_2018_c, d1_2018_c, d2_2018_c, d3_2018_c, d4_2018_c, d5_2018_c, d6_2018_c, d7_2018_c, d8_2018_c, d9_2018_c], ignore_index=True)

In [7]:
# Report the shape of each combined frame (the 2018 lines are commented out).
print(f'df2017_original.shape : {df2017_original.shape }')
print(f'df2017_corrected.shape : {df2017_corrected.shape }')
# print(f'df2018_original.shape : {df2018_original.shape }')
# print(f'df2018_corrected.shape : {df2018_corrected.shape }')


df2017_original.shape : (2830743, 79)
df2017_corrected.shape : (2099976, 91)


### Rename Columns

In [9]:
# Rename mapping for the original CICIDS2017 frames. Many raw headers carry a
# leading space; each raw header is mapped to the same name with surrounding
# whitespace stripped, in the original column order.
formatted_data = {
    raw_name: raw_name.strip()
    for raw_name in [
        ' Destination Port',
        ' Flow Duration',
        ' Total Fwd Packets',
        ' Total Backward Packets',
        'Total Length of Fwd Packets',
        ' Total Length of Bwd Packets',
        ' Fwd Packet Length Max',
        ' Fwd Packet Length Min',
        ' Fwd Packet Length Mean',
        ' Fwd Packet Length Std',
        'Bwd Packet Length Max',
        ' Bwd Packet Length Min',
        ' Bwd Packet Length Mean',
        ' Bwd Packet Length Std',
        'Flow Bytes/s',
        ' Flow Packets/s',
        ' Flow IAT Mean',
        ' Flow IAT Std',
        ' Flow IAT Max',
        ' Flow IAT Min',
        'Fwd IAT Total',
        ' Fwd IAT Mean',
        ' Fwd IAT Std',
        ' Fwd IAT Max',
        ' Fwd IAT Min',
        'Bwd IAT Total',
        ' Bwd IAT Mean',
        ' Bwd IAT Std',
        ' Bwd IAT Max',
        ' Bwd IAT Min',
        'Fwd PSH Flags',
        ' Bwd PSH Flags',
        ' Fwd URG Flags',
        ' Bwd URG Flags',
        ' Fwd Header Length',
        ' Bwd Header Length',
        'Fwd Packets/s',
        ' Bwd Packets/s',
        ' Min Packet Length',
        ' Max Packet Length',
        ' Packet Length Mean',
        ' Packet Length Std',
        ' Packet Length Variance',
        'FIN Flag Count',
        ' SYN Flag Count',
        ' RST Flag Count',
        ' PSH Flag Count',
        ' ACK Flag Count',
        ' URG Flag Count',
        ' CWE Flag Count',
        ' ECE Flag Count',
        ' Down/Up Ratio',
        ' Average Packet Size',
        ' Avg Fwd Segment Size',
        ' Avg Bwd Segment Size',
        ' Fwd Header Length.1',
        'Fwd Avg Bytes/Bulk',
        ' Fwd Avg Packets/Bulk',
        ' Fwd Avg Bulk Rate',
        ' Bwd Avg Bytes/Bulk',
        ' Bwd Avg Packets/Bulk',
        'Bwd Avg Bulk Rate',
        'Subflow Fwd Packets',
        ' Subflow Fwd Bytes',
        ' Subflow Bwd Packets',
        ' Subflow Bwd Bytes',
        'Init_Win_bytes_forward',
        ' Init_Win_bytes_backward',
        ' act_data_pkt_fwd',
        ' min_seg_size_forward',
        'Active Mean',
        ' Active Std',
        ' Active Max',
        ' Active Min',
        'Idle Mean',
        ' Idle Std',
        ' Idle Max',
        ' Idle Min',
        ' Label',
    ]
}

In [10]:
# Apply the mapping currently held in `formatted_data` to the 2017 original
# frame, in place. `formatted_data` is redefined further down for the other
# datasets, so this cell must run right after the 2017-original mapping cell.
df2017_original.rename(columns=formatted_data, inplace=True)

In [8]:
# Rename mapping for the corrected CICIDS2017 frames, aligning their headers
# with the names used for the original 2017 data. A bare string is a column
# whose name is kept as is; a (source, target) pair is a real rename.
# NOTE(review): 'Dst Port' is not mapped to 'Destination Port' here, unlike in
# the 2018 original mapping - confirm whether that is intentional.
formatted_data = {
    old_name: new_name
    for old_name, new_name in (
        entry if isinstance(entry, tuple) else (entry, entry)
        for entry in [
            'Flow Duration',
            ('Total Fwd Packet', 'Total Fwd Packets'),
            ('Total Bwd packets', 'Total Backward Packets'),
            ('Total Length of Fwd Packet', 'Total Length of Fwd Packets'),
            ('Total Length of Bwd Packet', 'Total Length of Bwd Packets'),
            'Fwd Packet Length Max',
            'Fwd Packet Length Min',
            'Fwd Packet Length Mean',
            'Fwd Packet Length Std',
            'Bwd Packet Length Max',
            'Bwd Packet Length Min',
            'Bwd Packet Length Mean',
            'Bwd Packet Length Std',
            'Flow Bytes/s',
            'Flow Packets/s',
            'Flow IAT Mean',
            'Flow IAT Std',
            'Flow IAT Max',
            'Flow IAT Min',
            'Fwd IAT Total',
            'Fwd IAT Mean',
            'Fwd IAT Std',
            'Fwd IAT Max',
            'Fwd IAT Min',
            'Bwd IAT Total',
            'Bwd IAT Mean',
            'Bwd IAT Std',
            'Bwd IAT Max',
            'Bwd IAT Min',
            'Fwd PSH Flags',
            'Bwd PSH Flags',
            'Fwd URG Flags',
            'Bwd URG Flags',
            'Fwd Header Length',
            'Bwd Header Length',
            'Fwd Packets/s',
            'Bwd Packets/s',
            ('Packet Length Min', 'Min Packet Length'),
            ('Packet Length Max', 'Max Packet Length'),
            'Packet Length Mean',
            'Packet Length Std',
            'Packet Length Variance',
            'FIN Flag Count',
            'SYN Flag Count',
            'RST Flag Count',
            'PSH Flag Count',
            'ACK Flag Count',
            'URG Flag Count',
            ('CWR Flag Count', 'CWE Flag Count'),
            'ECE Flag Count',
            'Down/Up Ratio',
            'Average Packet Size',
            ('Fwd Segment Size Avg', 'Avg Fwd Segment Size'),
            ('Bwd Segment Size Avg', 'Avg Bwd Segment Size'),
            ('Fwd Bytes/Bulk Avg', 'Fwd Avg Bytes/Bulk'),
            ('Fwd Packet/Bulk Avg', 'Fwd Avg Packets/Bulk'),
            ('Fwd Bulk Rate Avg', 'Fwd Avg Bulk Rate'),
            ('Bwd Bytes/Bulk Avg', 'Bwd Avg Bytes/Bulk'),
            ('Bwd Packet/Bulk Avg', 'Bwd Avg Packets/Bulk'),
            ('Bwd Bulk Rate Avg', 'Bwd Avg Bulk Rate'),
            'Subflow Fwd Packets',
            'Subflow Fwd Bytes',
            'Subflow Bwd Packets',
            'Subflow Bwd Bytes',
            ('FWD Init Win Bytes', 'Init_Win_bytes_forward'),
            ('Bwd Init Win Bytes', 'Init_Win_bytes_backward'),
            ('Fwd Act Data Pkts', 'act_data_pkt_fwd'),
            ('Fwd Seg Size Min', 'min_seg_size_forward'),
            'Active Mean',
            'Active Std',
            'Active Max',
            'Active Min',
            'Idle Mean',
            'Idle Std',
            'Idle Max',
            'Idle Min',
        ]
    )
}

In [9]:
# Apply the mapping currently held in `formatted_data` to the 2017 corrected
# frame, in place. Must run right after the 2017-corrected mapping cell.
df2017_corrected.rename(columns=formatted_data, inplace=True)

In [None]:
# Rename mapping for the original CSE-CIC-IDS2018 frames, expanding their
# abbreviated headers to the names used for the original 2017 data. A bare
# string is a column whose name is kept as is; a (source, target) pair is a
# real rename.
formatted_data = {
    old_name: new_name
    for old_name, new_name in (
        entry if isinstance(entry, tuple) else (entry, entry)
        for entry in [
            ('Dst Port', 'Destination Port'),
            'Flow Duration',
            ('Tot Fwd Pkts', 'Total Fwd Packets'),
            ('Tot Bwd Pkts', 'Total Backward Packets'),
            ('TotLen Fwd Pkts', 'Total Length of Fwd Packets'),
            ('TotLen Bwd Pkts', 'Total Length of Bwd Packets'),
            ('Fwd Pkt Len Max', 'Fwd Packet Length Max'),
            ('Fwd Pkt Len Min', 'Fwd Packet Length Min'),
            ('Fwd Pkt Len Mean', 'Fwd Packet Length Mean'),
            ('Fwd Pkt Len Std', 'Fwd Packet Length Std'),
            ('Bwd Pkt Len Max', 'Bwd Packet Length Max'),
            ('Bwd Pkt Len Min', 'Bwd Packet Length Min'),
            ('Bwd Pkt Len Mean', 'Bwd Packet Length Mean'),
            ('Bwd Pkt Len Std', 'Bwd Packet Length Std'),
            ('Flow Byts/s', 'Flow Bytes/s'),
            ('Flow Pkts/s', 'Flow Packets/s'),
            'Flow IAT Mean',
            'Flow IAT Std',
            'Flow IAT Max',
            'Flow IAT Min',
            ('Fwd IAT Tot', 'Fwd IAT Total'),
            'Fwd IAT Mean',
            'Fwd IAT Std',
            'Fwd IAT Max',
            'Fwd IAT Min',
            ('Bwd IAT Tot', 'Bwd IAT Total'),
            'Bwd IAT Mean',
            'Bwd IAT Std',
            'Bwd IAT Max',
            'Bwd IAT Min',
            'Fwd PSH Flags',
            'Bwd PSH Flags',
            'Fwd URG Flags',
            'Bwd URG Flags',
            ('Fwd Header Len', 'Fwd Header Length'),
            ('Bwd Header Len', 'Bwd Header Length'),
            ('Fwd Pkts/s', 'Fwd Packets/s'),
            ('Bwd Pkts/s', 'Bwd Packets/s'),
            ('Pkt Len Min', 'Min Packet Length'),
            ('Pkt Len Max', 'Max Packet Length'),
            ('Pkt Len Mean', 'Packet Length Mean'),
            ('Pkt Len Std', 'Packet Length Std'),
            ('Pkt Len Var', 'Packet Length Variance'),
            ('FIN Flag Cnt', 'FIN Flag Count'),
            ('SYN Flag Cnt', 'SYN Flag Count'),
            ('RST Flag Cnt', 'RST Flag Count'),
            ('PSH Flag Cnt', 'PSH Flag Count'),
            ('ACK Flag Cnt', 'ACK Flag Count'),
            ('URG Flag Cnt', 'URG Flag Count'),
            'CWE Flag Count',
            ('ECE Flag Cnt', 'ECE Flag Count'),
            'Down/Up Ratio',
            ('Pkt Size Avg', 'Average Packet Size'),
            ('Fwd Seg Size Avg', 'Avg Fwd Segment Size'),
            ('Bwd Seg Size Avg', 'Avg Bwd Segment Size'),
            ('Fwd Byts/b Avg', 'Fwd Avg Bytes/Bulk'),
            ('Fwd Pkts/b Avg', 'Fwd Avg Packets/Bulk'),
            ('Fwd Blk Rate Avg', 'Fwd Avg Bulk Rate'),
            ('Bwd Byts/b Avg', 'Bwd Avg Bytes/Bulk'),
            ('Bwd Pkts/b Avg', 'Bwd Avg Packets/Bulk'),
            ('Bwd Blk Rate Avg', 'Bwd Avg Bulk Rate'),
            ('Subflow Fwd Pkts', 'Subflow Fwd Packets'),
            ('Subflow Fwd Byts', 'Subflow Fwd Bytes'),
            ('Subflow Bwd Pkts', 'Subflow Bwd Packets'),
            ('Subflow Bwd Byts', 'Subflow Bwd Bytes'),
            ('Init Fwd Win Byts', 'Init_Win_bytes_forward'),
            ('Init Bwd Win Byts', 'Init_Win_bytes_backward'),
            ('Fwd Act Data Pkts', 'act_data_pkt_fwd'),
            ('Fwd Seg Size Min', 'min_seg_size_forward'),
            'Active Mean',
            'Active Std',
            'Active Max',
            'Active Min',
            'Idle Mean',
            'Idle Std',
            'Idle Max',
            'Idle Min',
            'Label',
        ]
    )
}

In [None]:
# Apply the mapping currently held in `formatted_data` to the 2018 original
# frame, in place. Must run right after the 2018-original mapping cell.
df2018_original.rename(columns=formatted_data, inplace=True)


In [None]:
# Rename mapping applied to the corrected CSE-CIC-IDS2018 frame; its content is
# the same as the mapping used for the corrected 2017 data. A bare string is a
# column whose name is kept as is; a (source, target) pair is a real rename.
formatted_data = {
    old_name: new_name
    for old_name, new_name in (
        entry if isinstance(entry, tuple) else (entry, entry)
        for entry in [
            'Flow Duration',
            ('Total Fwd Packet', 'Total Fwd Packets'),
            ('Total Bwd packets', 'Total Backward Packets'),
            ('Total Length of Fwd Packet', 'Total Length of Fwd Packets'),
            ('Total Length of Bwd Packet', 'Total Length of Bwd Packets'),
            'Fwd Packet Length Max',
            'Fwd Packet Length Min',
            'Fwd Packet Length Mean',
            'Fwd Packet Length Std',
            'Bwd Packet Length Max',
            'Bwd Packet Length Min',
            'Bwd Packet Length Mean',
            'Bwd Packet Length Std',
            'Flow Bytes/s',
            'Flow Packets/s',
            'Flow IAT Mean',
            'Flow IAT Std',
            'Flow IAT Max',
            'Flow IAT Min',
            'Fwd IAT Total',
            'Fwd IAT Mean',
            'Fwd IAT Std',
            'Fwd IAT Max',
            'Fwd IAT Min',
            'Bwd IAT Total',
            'Bwd IAT Mean',
            'Bwd IAT Std',
            'Bwd IAT Max',
            'Bwd IAT Min',
            'Fwd PSH Flags',
            'Bwd PSH Flags',
            'Fwd URG Flags',
            'Bwd URG Flags',
            'Fwd Header Length',
            'Bwd Header Length',
            'Fwd Packets/s',
            'Bwd Packets/s',
            ('Packet Length Min', 'Min Packet Length'),
            ('Packet Length Max', 'Max Packet Length'),
            'Packet Length Mean',
            'Packet Length Std',
            'Packet Length Variance',
            'FIN Flag Count',
            'SYN Flag Count',
            'RST Flag Count',
            'PSH Flag Count',
            'ACK Flag Count',
            'URG Flag Count',
            ('CWR Flag Count', 'CWE Flag Count'),
            'ECE Flag Count',
            'Down/Up Ratio',
            'Average Packet Size',
            ('Fwd Segment Size Avg', 'Avg Fwd Segment Size'),
            ('Bwd Segment Size Avg', 'Avg Bwd Segment Size'),
            ('Fwd Bytes/Bulk Avg', 'Fwd Avg Bytes/Bulk'),
            ('Fwd Packet/Bulk Avg', 'Fwd Avg Packets/Bulk'),
            ('Fwd Bulk Rate Avg', 'Fwd Avg Bulk Rate'),
            ('Bwd Bytes/Bulk Avg', 'Bwd Avg Bytes/Bulk'),
            ('Bwd Packet/Bulk Avg', 'Bwd Avg Packets/Bulk'),
            ('Bwd Bulk Rate Avg', 'Bwd Avg Bulk Rate'),
            'Subflow Fwd Packets',
            'Subflow Fwd Bytes',
            'Subflow Bwd Packets',
            'Subflow Bwd Bytes',
            ('FWD Init Win Bytes', 'Init_Win_bytes_forward'),
            ('Bwd Init Win Bytes', 'Init_Win_bytes_backward'),
            ('Fwd Act Data Pkts', 'act_data_pkt_fwd'),
            ('Fwd Seg Size Min', 'min_seg_size_forward'),
            'Active Mean',
            'Active Std',
            'Active Max',
            'Active Min',
            'Idle Mean',
            'Idle Std',
            'Idle Max',
            'Idle Min',
        ]
    )
}

In [None]:
# Apply the mapping currently held in `formatted_data` to the 2018 corrected
# frame, in place. Requires df2018_corrected to have been loaded.
df2018_corrected.rename(columns=formatted_data, inplace=True)

In [11]:
# Class distribution of the raw labels in the 2017 original data.
df2017_original['Label'].value_counts()

BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: Label, dtype: int64

In [12]:
# Class distribution of the raw labels in the 2017 corrected data.
df2017_corrected['Label'].value_counts()

BENIGN                                    1582566
Portscan                                   159066
DoS Hulk                                   158468
DDoS                                        95144
Infiltration - Portscan                     71767
DoS GoldenEye                                7567
Botnet - Attempted                           4067
FTP-Patator                                  3972
DoS Slowloris                                3859
DoS Slowhttptest - Attempted                 3368
SSH-Patator                                  2961
DoS Slowloris - Attempted                    1847
DoS Slowhttptest                             1740
Web Attack - Brute Force - Attempted         1292
Botnet                                        736
Web Attack - XSS - Attempted                  655
DoS Hulk - Attempted                          581
DoS GoldenEye - Attempted                      80
Web Attack - Brute Force                       73
Infiltration - Attempted                       45


In [None]:
# Class distribution of the raw labels in the 2018 original data.
df2018_original['Label'].value_counts()

In [None]:
# Class distribution of the raw labels in the 2018 corrected data
# (requires df2018_corrected to have been loaded).
df2018_corrected['Label'].value_counts()


### Remove duplicate and missing value rows

In [11]:
def drop_unnecessary(df):
    """Drop duplicate rows and rows with missing values from ``df`` in place.

    Prints the shape before and after, plus the number of duplicated rows and
    of missing cells found in ``df`` before anything is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place; nothing is returned.
    """
    print(f'~~~~~~~ before drop {df.shape} ~~~~~~~')
    # Report on the frame that was passed in. These counts used to be computed
    # on the global df2017_original, which gave wrong numbers for every other
    # dataset and failed when that global was not defined.
    print(f'dataset contains {df.duplicated().sum()} of duplicated values')
    print(f'dataset contains {df.isna().sum().sum()} of missing values')
    # Keep the first occurrence of each duplicated row, then drop any row that
    # still contains a missing value.
    df.drop_duplicates(keep="first", inplace=True)
    df.dropna(inplace=True)
    print(f'~~~~~~~ after drop {df.shape} ~~~~~~~')
    

In [12]:
# Remove duplicate rows and rows with missing values from the 2017 original frame (in place).
drop_unnecessary(df2017_original)

~~~~~~~ before drop (2830743, 79) ~~~~~~~
dataset contains 308381 of duplicated values
dataset contains 1358 of missing values
~~~~~~~ after drop (2522009, 79) ~~~~~~~


In [14]:
# Remove duplicate rows and rows with missing values from the 2017 corrected frame (in place).
drop_unnecessary(df2017_corrected)

~~~~~~~ before drop (2099976, 91) ~~~~~~~
dataset contains 0 of duplicated values
dataset contains 0 of missing values
~~~~~~~ after drop (2099976, 91) ~~~~~~~


In [None]:
# Remove duplicate rows and rows with missing values from the 2018 original frame (in place).
drop_unnecessary(df2018_original)

In [13]:
# Turn +/-infinity into NaN in the 2017 original frame (in place).
# NOTE(review): this cell comes after the drop_unnecessary call, so NaNs created
# here are not removed by that earlier dropna - confirm they are handled later.
df2017_original.replace([np.inf, -np.inf], np.nan, inplace=True)

In [10]:
# Turn +/-infinity into NaN in the 2017 corrected frame (in place).
# NOTE(review): no dropna follows in this cell, so NaNs created here stay in the frame.
df2017_corrected.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
# Turn +/-infinity into NaN in the 2018 original frame (in place).
# NOTE(review): no dropna follows in this cell, so NaNs created here stay in the frame.
df2018_original.replace([np.inf, -np.inf], np.nan, inplace=True)

In [22]:
# Summary statistics of the two rate columns in the 2017 original frame.
df2017_original[['Flow Bytes/s', 'Flow Packets/s']].describe()

Unnamed: 0,Flow Bytes/s,Flow Packets/s
count,2520798.0,2520798.0
mean,1410707.0,47291.88
std,26570840.0,202636.6
min,-261000000.0,-2000000.0
25%,119.4308,2.023326
50%,3715.038,69.74224
75%,107142.9,17857.14
max,2071000000.0,4000000.0


In [23]:
# Summary statistics of the two rate columns in the 2017 corrected frame.
df2017_corrected[['Flow Bytes/s', 'Flow Packets/s']].describe()

Unnamed: 0,Flow Bytes/s,Flow Packets/s
count,2099971.0,2099971.0
mean,466026.9,19318.49
std,3977421.0,106862.6
min,0.0,0.02500009
25%,107.779,3.574474
50%,3864.201,73.69042
75%,60796.03,16194.33
max,253000000.0,3000000.0


### Reclassify the data labels to ['Benign', 'Attack']

In [14]:
def label_mapping(value):
    """Map a raw traffic label to the binary class code.

    Parameters
    ----------
    value : object
        Raw entry of the 'Label' column.

    Returns
    -------
    int
        0 for benign traffic, 1 for anything else (treated as an attack).
    """
    # The 2017 data spells the benign class 'BENIGN' while the 2018 data uses
    # 'Benign'; compare case-insensitively so 2018 benign rows are not
    # classified as attacks.
    if str(value).strip().upper() == 'BENIGN':
        return 0  # 'Benign'
    return 1  # 'Attack'

In [15]:
def reclassify_label(df):
    """Add a binary 'label_encoded' column derived from the 'Label' column.

    Each entry of ``df['Label']`` is passed through ``label_mapping``. The
    column is written onto ``df`` itself, and that same frame is returned.
    """
    binary_labels = df['Label'].map(label_mapping)
    df['label_encoded'] = binary_labels
    return df

In [16]:
# Add the binary 'label_encoded' column to the 2017 original frame.
df2017_original = reclassify_label(df2017_original)

In [13]:
# Add the binary 'label_encoded' column to the 2017 corrected frame.
df2017_corrected = reclassify_label(df2017_corrected)

In [None]:
# Add the binary 'label_encoded' column to the 2018 original frame.
df2018_original = reclassify_label(df2018_original)

### Encode Labels - Deprecated

In [17]:
from sklearn.preprocessing import LabelEncoder

def encode_label(df):
    """Integer-encode ``df['Label']`` into ``df['label_encoded']`` and report it.

    A fresh ``LabelEncoder`` is fitted on the ``Label`` column and its codes
    are written to ``label_encoded`` (replacing any existing column of that
    name). The class-to-code mapping and the number of rows per code are then
    printed. Nothing is returned.
    """
    encoder = LabelEncoder()
    df['label_encoded'] = encoder.fit_transform(df['Label'])
    class_names = encoder.classes_
    le_name_mapping = {
        name: code
        for name, code in zip(class_names, encoder.transform(class_names))
    }
    print(f'label mapping value: {le_name_mapping}')
    print(df['label_encoded'].value_counts())

In [18]:
# Multi-class encoding of the original 2017 labels.
# Note: this writes to the same 'label_encoded' column that reclassify_label() fills.
encode_label(df2017_original)

label mapping value: {'BENIGN': 0, 'Bot': 1, 'DDoS': 2, 'DoS GoldenEye': 3, 'DoS Hulk': 4, 'DoS Slowhttptest': 5, 'DoS slowloris': 6, 'FTP-Patator': 7, 'Heartbleed': 8, 'Infiltration': 9, 'PortScan': 10, 'SSH-Patator': 11, 'Web Attack � Brute Force': 12, 'Web Attack � Sql Injection': 13, 'Web Attack � XSS': 14}
0     2096134
4      172846
2      128016
10      90819
3       10286
7        5933
6        5385
5        5228
11       3219
1        1953
12       1470
14        652
9          36
13         21
8          11
Name: label_encoded, dtype: int64


### Prepare for algorithms

In [17]:
# Target: the encoded label values as an array.
df2017_original_y = df2017_original['label_encoded'].values
# Features: remove the unnecessary port column, the duplicated header-length
# column and both label columns.
df2017_original_X = df2017_original.drop(
    columns=['Destination Port', 'Fwd Header Length.1', 'label_encoded', 'Label']
)

In [18]:
# copy=True: pd.DataFrame(frame) does not copy by default and may share its
# underlying data with the source, so adding the label column below could
# otherwise leak into df2017_original_X (the feature matrix used for resampling).
data = pd.DataFrame(data=df2017_original_X, copy=True)

data['label_encoded'] = df2017_original_y
data['label_encoded'].value_counts()

0    2096134
1     425875
Name: label_encoded, dtype: int64

In [21]:
# Write the labelled frame as CSV without the index column (note: the target path has no file extension).
data.to_csv('results/data_2017_original', index = False)

In [22]:
# Reload the saved file to inspect what was written.
pd.read_csv('results/data_2017_original')

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded
0,3,2,0,12,0,6,6,6.0,0.00000,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,109,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2,52,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
3,34,1,1,6,6,6,6,6.0,0.00000,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
4,3,2,0,12,0,6,6,6.0,0.00000,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2522004,32215,4,2,112,152,28,28,28.0,0.00000,76,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2522005,324,2,2,84,362,42,42,42.0,0.00000,181,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
2522006,82,2,1,31,6,31,0,15.5,21.92031,6,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
2522007,1048635,6,2,192,256,32,32,32.0,0.00000,128,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0


# Data Resampling
- Under-sampling for the majority class
- Over-sampling for the minority class

In [23]:
from imblearn.under_sampling import RandomUnderSampler


def under_sampling(X_train, y_train):
    """Randomly under-sample the training data with ``RandomUnderSampler``.

    Prints the per-class counts of the labels before and after resampling
    with ``RandomUnderSampler(random_state=42)``.

    Parameters
    ----------
    X_train : array-like
        Feature matrix passed to ``fit_resample``.
    y_train : array-like
        Labels passed to ``fit_resample``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    _, counts_before = np.unique(y_train, return_counts=True)
    print(f'~~~Before UnderSampling: {counts_before}~~~')

    sampler = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    _, counts_after = np.unique(y_resampled, return_counts=True)
    print(f'~~~After UnderSampling: {counts_after}')
    return X_resampled, y_resampled

In [24]:
from imblearn.over_sampling import SMOTE

def over_sampling(X_train, y_train):
    """Over-sample the training data with ``SMOTE``.

    Prints the per-class counts of the labels before and after resampling
    with ``SMOTE(random_state=2)``.

    Parameters
    ----------
    X_train : array-like
        Feature matrix passed to ``fit_resample``.
    y_train : array-like
        Labels; flattened to 1-D before resampling. ndarrays, lists and
        pandas Series are all accepted.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    _, counts_before = np.unique(y_train, return_counts=True)
    print(f'~~~Before OverSampling: {counts_before}~~~')
    sm = SMOTE(random_state=2)
    # np.ravel() accepts any array-like, whereas the .ravel() method only
    # exists on some containers (a plain list would raise AttributeError).
    X_train, y_train = sm.fit_resample(X_train, np.ravel(y_train))
    _, counts_after = np.unique(y_train, return_counts=True)
    print(f'~~~After OverSampling: {counts_after}~~~')
    return X_train, y_train


In [25]:
X_train_resample, y_train_resample = under_sampling(df2017_original_X, df2017_original_y)

# copy=True: build an independent frame so that adding the label column below
# cannot write through to X_train_resample.
data_resampling = pd.DataFrame(data=X_train_resample, copy=True)
data_resampling['label_encoded'] = y_train_resample
data_resampling['label_encoded'].value_counts()

~~~Before UnderSampling: [2096134  425875]~~~
~~~After UnderSampling: [425875 425875]


0    425875
1    425875
Name: label_encoded, dtype: int64

In [26]:
# Write the under-sampled frame as CSV without the index column (note: the target path has no file extension).
data_resampling.to_csv('results/data_2017_original_resampling', index = False)

In [27]:
# Reload the saved file to inspect what was written.
pd.read_csv('results/data_2017_original_resampling')

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded
0,858591,2,2,72,124,36,36,36.000000,0.000000,62,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
1,60706,2,2,70,290,35,35,35.000000,0.000000,145,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
2,257846,1,1,46,208,46,46,46.000000,0.000000,208,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
3,3,2,0,0,0,0,0,0.000000,0.000000,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
4,1559160,35,42,2622,7038,408,0,74.914286,105.280961,976,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851745,11512204,8,5,326,11632,326,0,40.750000,115.258405,10184,...,32,892.0,0.0,892,892,6507197.0,0.0,6507197,6507197,1
851746,11513325,5,5,471,3525,471,0,94.200000,210.637604,2077,...,32,918.0,0.0,918,918,6508582.0,0.0,6508582,6508582,1
851747,11509201,7,6,314,11632,314,0,44.857143,118.680845,5792,...,32,899.0,0.0,899,899,6503248.0,0.0,6503248,6503248,1
851748,11509095,8,5,369,11632,369,0,46.125000,130.461201,10184,...,32,914.0,0.0,914,914,6504954.0,0.0,6504954,6504954,1
