In [2]:
%pip install pandas numpy matplotlib seaborn scipy scikit-learn imbalanced-learn kagglehub



## Import Dependences

In [3]:
import pandas as pd
import numpy as np

## Set display unlimited number of lines

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Read all datasets and save then in dataframes with the same name in a dictionary:

In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("solarmainframe/ids-intrusion-csv")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/solarmainframe/ids-intrusion-csv?dataset_version_number=1...


100%|██████████| 1.60G/1.60G [00:40<00:00, 42.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/solarmainframe/ids-intrusion-csv/versions/1


## To preprocess the 02-20-2018.csv dataset together with all the others we need more than 30 GB of RAM, so to run this notebook in Kaggle this dataset will not be loaded in the dictionary and some code cells bellow will also be commented

In [6]:
import os

# Define the path to the folder containing the CSV files
folder_path = path

# Get a list of all the CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

dfs = dict()

# Loop through each CSV file and read it into a dataframe
for file in csv_files:
    df_name = os.path.splitext(file)[0]  # Get the dataframe name from the file name
    file_path = os.path.join(folder_path, file)  # Get the full file path
    df = pd.read_csv(file_path, low_memory=False)  # Read the CSV file into a dataframe
    locals()[df_name] = df  # Assign the dataframe to a variable with the same name as the file (e.g. 'data1 = df1')
    dfs[df_name] = df  # Store the dataframe in the dictionary with the same name as the file (e.g. 'dfs['data1'] = df1')

# Pre-processing

## 1. Drop Flow ID, Src IP and Dst IP columns of the 02-20-2018 dataset for being strings:

In [7]:
for id, df in dfs.items():
    try:
        dfs[id] = df.drop(['Flow ID','Src IP','Dst IP'], axis=1)
    except Exception:
        pass

## 2. Check existence and drop -inf and +inf values

In [8]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    print(f"Dataframe: '{key}'\n")
    count = df.values.ravel().tolist().count(np.inf)
    count = np.sum(df.values == np.inf)
    count2 = np.sum(df.values == -np.inf)
    print(f"Inf:'{count}'\n -Inf:'{count2}'\n")
    # replace +ve and -ve infinity with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Drop all NaN values
    df.dropna(inplace=True)

Dataframe: '02-16-2018'

Inf:'0'
 -Inf:'0'

Dataframe: '02-14-2018'

Inf:'5371'
 -Inf:'0'

Dataframe: '02-23-2018'

Inf:'7662'
 -Inf:'0'

Dataframe: '03-01-2018'

Inf:'0'
 -Inf:'0'

Dataframe: '02-28-2018'

Inf:'0'
 -Inf:'0'

Dataframe: '02-15-2018'

Inf:'11133'
 -Inf:'0'

Dataframe: '02-21-2018'

Inf:'0'
 -Inf:'0'

Dataframe: '02-20-2018'

Inf:'82139'
 -Inf:'0'

Dataframe: '03-02-2018'

Inf:'5542'
 -Inf:'0'

Dataframe: '02-22-2018'

Inf:'7651'
 -Inf:'0'



In [10]:
unique_attack = set()

for file_name, df in dfs.items():
    attack_kinds = df.Label.unique()
    unique_attack.update(attack_kinds)
    if 'Label' in attack_kinds:
        print("Attacks at", file_name, 'contain Label value(s)')
    print(f"Dataframe ['{file_name}'] has", len(attack_kinds), "attack kinds:", attack_kinds)

print(f"\n {len(unique_attack)} Unique attack kinds in CIC-IDS 2018:", unique_attack)

Attacks at 02-16-2018 contain Label value(s)
Dataframe ['02-16-2018'] has 4 attack kinds: ['Benign' 'DoS attacks-SlowHTTPTest' 'DoS attacks-Hulk' 'Label']
Dataframe ['02-14-2018'] has 3 attack kinds: ['Benign' 'FTP-BruteForce' 'SSH-Bruteforce']
Dataframe ['02-23-2018'] has 4 attack kinds: ['Benign' 'Brute Force -Web' 'Brute Force -XSS' 'SQL Injection']
Attacks at 03-01-2018 contain Label value(s)
Dataframe ['03-01-2018'] has 3 attack kinds: ['Benign' 'Label' 'Infilteration']
Attacks at 02-28-2018 contain Label value(s)
Dataframe ['02-28-2018'] has 3 attack kinds: ['Benign' 'Label' 'Infilteration']
Dataframe ['02-15-2018'] has 3 attack kinds: ['Benign' 'DoS attacks-GoldenEye' 'DoS attacks-Slowloris']
Dataframe ['02-21-2018'] has 3 attack kinds: ['Benign' 'DDOS attack-LOIC-UDP' 'DDOS attack-HOIC']
Dataframe ['02-20-2018'] has 2 attack kinds: ['Benign' 'DDoS attacks-LOIC-HTTP']
Dataframe ['03-02-2018'] has 2 attack kinds: ['Benign' 'Bot']
Dataframe ['02-22-2018'] has 4 attack kinds: ['Ben

Check columns of datasets

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    print(f"Dataframe: '{key}'\n")
    display(df.head())
    display(df.info())

Dataframe: '02-28-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,28/02/2018 08:22:13,94658,6,7,708,3718,387,0,...,20,0,0,0,0,0,0,0,0,Benign
1,443,6,28/02/2018 08:22:13,206,2,0,0,0,0,0,...,20,0,0,0,0,0,0,0,0,Benign
2,445,6,28/02/2018 08:22:15,165505,3,1,0,0,0,0,...,20,0,0,0,0,0,0,0,0,Benign
3,443,6,28/02/2018 08:22:16,102429,6,7,708,3718,387,0,...,20,0,0,0,0,0,0,0,0,Benign
4,443,6,28/02/2018 08:22:16,167,2,0,0,0,0,0,...,20,0,0,0,0,0,0,0,0,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 609063 entries, 0 to 613103
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Dst Port           609063 non-null  object
 1   Protocol           609063 non-null  object
 2   Timestamp          609063 non-null  object
 3   Flow Duration      609063 non-null  object
 4   Tot Fwd Pkts       609063 non-null  object
 5   Tot Bwd Pkts       609063 non-null  object
 6   TotLen Fwd Pkts    609063 non-null  object
 7   TotLen Bwd Pkts    609063 non-null  object
 8   Fwd Pkt Len Max    609063 non-null  object
 9   Fwd Pkt Len Min    609063 non-null  object
 10  Fwd Pkt Len Mean   609063 non-null  object
 11  Fwd Pkt Len Std    609063 non-null  object
 12  Bwd Pkt Len Max    609063 non-null  object
 13  Bwd Pkt Len Min    609063 non-null  object
 14  Bwd Pkt Len Mean   609063 non-null  object
 15  Bwd Pkt Len Std    609063 non-null  object
 16  Flow Byts/s        609063

None

Dataframe: '03-01-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,01/03/2018 08:17:11,115307855,5,0,0,0,0,0,...,0,1812348,0,1812348,1812348,56700000,6010057.622,61000000,52500000,Benign
1,0,0,01/03/2018 08:20:07,60997457,2,0,0,0,0,0,...,0,0,0,0,0,61000000,0.0,61000000,61000000,Benign
2,67,17,01/03/2018 08:17:18,61149019,5,0,1500,0,300,300,...,8,3530939,0,3530939,3530939,19200000,12500000.0,32600000,7999725,Benign
3,0,0,01/03/2018 08:22:09,60997555,2,0,0,0,0,0,...,0,0,0,0,0,61000000,0.0,61000000,61000000,Benign
4,0,0,01/03/2018 08:24:11,61997503,3,0,0,0,0,0,...,0,999909,0,999909,999909,61000000,0.0,61000000,61000000,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 329291 entries, 0 to 331124
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Dst Port           329291 non-null  object
 1   Protocol           329291 non-null  object
 2   Timestamp          329291 non-null  object
 3   Flow Duration      329291 non-null  object
 4   Tot Fwd Pkts       329291 non-null  object
 5   Tot Bwd Pkts       329291 non-null  object
 6   TotLen Fwd Pkts    329291 non-null  object
 7   TotLen Bwd Pkts    329291 non-null  object
 8   Fwd Pkt Len Max    329291 non-null  object
 9   Fwd Pkt Len Min    329291 non-null  object
 10  Fwd Pkt Len Mean   329291 non-null  object
 11  Fwd Pkt Len Std    329291 non-null  object
 12  Bwd Pkt Len Max    329291 non-null  object
 13  Bwd Pkt Len Min    329291 non-null  object
 14  Bwd Pkt Len Mean   329291 non-null  object
 15  Bwd Pkt Len Std    329291 non-null  object
 16  Flow Byts/s        329291

None

Dataframe: '02-16-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,16/02/2018 08:27:23,112640768,3,0,0,0,0,0,...,0,0,0,0,0,56300000,138.5929291,56300000,56300000,Benign
1,0,0,16/02/2018 08:30:12,112641773,3,0,0,0,0,0,...,0,0,0,0,0,56300000,263.7508294,56300000,56300000,Benign
2,35605,6,16/02/2018 08:26:55,20784143,23,44,2416,1344,240,64,...,20,2624734,0,2624734,2624734,9058214,0.0,9058214,9058214,Benign
3,0,0,16/02/2018 08:33:01,112640836,3,0,0,0,0,0,...,0,0,0,0,0,56300000,82.02438662,56300000,56300000,Benign
4,23,6,16/02/2018 08:27:59,20,1,1,0,0,0,0,...,20,0,0,0,0,0,0.0,0,0,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   Dst Port           1048575 non-null  object
 1   Protocol           1048575 non-null  object
 2   Timestamp          1048575 non-null  object
 3   Flow Duration      1048575 non-null  object
 4   Tot Fwd Pkts       1048575 non-null  object
 5   Tot Bwd Pkts       1048575 non-null  object
 6   TotLen Fwd Pkts    1048575 non-null  object
 7   TotLen Bwd Pkts    1048575 non-null  object
 8   Fwd Pkt Len Max    1048575 non-null  object
 9   Fwd Pkt Len Min    1048575 non-null  object
 10  Fwd Pkt Len Mean   1048575 non-null  object
 11  Fwd Pkt Len Std    1048575 non-null  object
 12  Bwd Pkt Len Max    1048575 non-null  object
 13  Bwd Pkt Len Min    1048575 non-null  object
 14  Bwd Pkt Len Mean   1048575 non-null  object
 15  Bwd Pkt Len Std    1048575 non-null  object
 16  

None

Dataframe: '02-15-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,15/02/2018 08:25:18,112641158,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320579.0,704.2784,56321077,56320081,Benign
1,22,6,15/02/2018 08:29:05,37366762,14,12,2168,2993,712,0,...,32,1024353.0,649038.754495,1601183,321569,11431221.0,3644991.0,15617415,8960247,Benign
2,47514,6,15/02/2018 08:29:42,543,2,0,64,0,64,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,0,0,15/02/2018 08:28:07,112640703,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320351.5,366.9884,56320611,56320092,Benign
4,0,0,15/02/2018 08:30:56,112640874,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320437.0,719.8347,56320946,56319928,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 1040548 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1040548 non-null  int64  
 1   Protocol           1040548 non-null  int64  
 2   Timestamp          1040548 non-null  object 
 3   Flow Duration      1040548 non-null  int64  
 4   Tot Fwd Pkts       1040548 non-null  int64  
 5   Tot Bwd Pkts       1040548 non-null  int64  
 6   TotLen Fwd Pkts    1040548 non-null  int64  
 7   TotLen Bwd Pkts    1040548 non-null  int64  
 8   Fwd Pkt Len Max    1040548 non-null  int64  
 9   Fwd Pkt Len Min    1040548 non-null  int64  
 10  Fwd Pkt Len Mean   1040548 non-null  float64
 11  Fwd Pkt Len Std    1040548 non-null  float64
 12  Bwd Pkt Len Max    1040548 non-null  int64  
 13  Bwd Pkt Len Min    1040548 non-null  int64  
 14  Bwd Pkt Len Mean   1040548 non-null  float64
 15  Bwd Pkt Len Std    1040548 non-null  

None

Dataframe: '02-21-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6,21/02/2018 08:33:25,37953,5,3,135,127,135,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,500,17,21/02/2018 08:33:06,117573474,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58800000.0,23800000.0,75600000,42000000,Benign
2,500,17,21/02/2018 08:33:06,117573474,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58800000.0,23800000.0,75600000,42000000,Benign
3,500,17,21/02/2018 08:33:11,99743998,5,0,2500,0,500,500,...,8,4000290.0,0.0,4000290,4000290,31900000.0,37900000.0,75600000,7200397,Benign
4,500,17,21/02/2018 08:33:11,99743999,5,0,2500,0,500,500,...,8,4000286.0,0.0,4000286,4000286,31900000.0,37900000.0,75600000,7200399,Benign


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

None

Dataframe: '03-02-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,02/03/2018 08:47:38,141385,9,7,553,3773.0,202,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,49684,6,02/03/2018 08:47:38,281,2,1,38,0.0,38,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,443,6,02/03/2018 08:47:40,279824,11,15,1086,10527.0,385,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443,6,02/03/2018 08:47:40,132,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443,6,02/03/2018 08:47:41,274016,9,13,1285,6141.0,517,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 1044525 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1044525 non-null  int64  
 1   Protocol           1044525 non-null  int64  
 2   Timestamp          1044525 non-null  object 
 3   Flow Duration      1044525 non-null  int64  
 4   Tot Fwd Pkts       1044525 non-null  int64  
 5   Tot Bwd Pkts       1044525 non-null  int64  
 6   TotLen Fwd Pkts    1044525 non-null  int64  
 7   TotLen Bwd Pkts    1044525 non-null  float64
 8   Fwd Pkt Len Max    1044525 non-null  int64  
 9   Fwd Pkt Len Min    1044525 non-null  int64  
 10  Fwd Pkt Len Mean   1044525 non-null  float64
 11  Fwd Pkt Len Std    1044525 non-null  float64
 12  Bwd Pkt Len Max    1044525 non-null  int64  
 13  Bwd Pkt Len Min    1044525 non-null  int64  
 14  Bwd Pkt Len Mean   1044525 non-null  float64
 15  Bwd Pkt Len Std    1044525 non-null  

None

Dataframe: '02-22-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,6,22/02/2018 08:26:03,20553406,10,7,1063,1297,744,0,...,20,1027304.0,0.0,1027304,1027304,19526080.0,0.0,19526080,19526080,Benign
1,34989,6,22/02/2018 08:26:24,790,2,0,848,0,848,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,8,4000203.0,0.0,4000203,4000203,31915240.0,37927870.0,75584115,7200679,Benign
3,500,17,22/02/2018 08:25:10,99745913,5,0,2500,0,500,500,...,8,4000189.0,0.0,4000189,4000189,31915240.0,37927880.0,75584130,7200693,Benign
4,500,17,22/02/2018 08:24:59,89481361,6,0,3000,0,500,500,...,8,4000554.0,0.0,4000554,4000554,21370200.0,15281090.0,41990741,7200848,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 1042965 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1042965 non-null  int64  
 1   Protocol           1042965 non-null  int64  
 2   Timestamp          1042965 non-null  object 
 3   Flow Duration      1042965 non-null  int64  
 4   Tot Fwd Pkts       1042965 non-null  int64  
 5   Tot Bwd Pkts       1042965 non-null  int64  
 6   TotLen Fwd Pkts    1042965 non-null  int64  
 7   TotLen Bwd Pkts    1042965 non-null  int64  
 8   Fwd Pkt Len Max    1042965 non-null  int64  
 9   Fwd Pkt Len Min    1042965 non-null  int64  
 10  Fwd Pkt Len Mean   1042965 non-null  float64
 11  Fwd Pkt Len Std    1042965 non-null  float64
 12  Bwd Pkt Len Max    1042965 non-null  int64  
 13  Bwd Pkt Len Min    1042965 non-null  int64  
 14  Bwd Pkt Len Mean   1042965 non-null  float64
 15  Bwd Pkt Len Std    1042965 non-null  

None

Dataframe: '02-20-2018'



Unnamed: 0,Src Port,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,45498,22,6,20/02/2018 08:34:07,888751,11,11,1249.0,1969.0,736.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0,0,0,20/02/2018 08:33:22,112642816,3,0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,7.071068,56300000.0,56300000.0,Benign
2,0,0,0,20/02/2018 08:36:11,112642712,3,0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,18.384776,56300000.0,56300000.0,Benign
3,0,0,0,20/02/2018 08:39:00,112642648,3,0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,5.656854,56300000.0,56300000.0,Benign
4,0,0,0,20/02/2018 08:41:49,112642702,3,0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,65.053824,56300000.0,56300000.0,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 7889295 entries, 0 to 7948747
Data columns (total 81 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Src Port           int64  
 1   Dst Port           int64  
 2   Protocol           int64  
 3   Timestamp          object 
 4   Flow Duration      int64  
 5   Tot Fwd Pkts       int64  
 6   Tot Bwd Pkts       int64  
 7   TotLen Fwd Pkts    float64
 8   TotLen Bwd Pkts    float64
 9   Fwd Pkt Len Max    float64
 10  Fwd Pkt Len Min    float64
 11  Fwd Pkt Len Mean   float64
 12  Fwd Pkt Len Std    float64
 13  Bwd Pkt Len Max    float64
 14  Bwd Pkt Len Min    float64
 15  Bwd Pkt Len Mean   float64
 16  Bwd Pkt Len Std    float64
 17  Flow Byts/s        float64
 18  Flow Pkts/s        float64
 19  Flow IAT Mean      float64
 20  Flow IAT Std       float64
 21  Flow IAT Max       float64
 22  Flow IAT Min       float64
 23  Fwd IAT Tot        float64
 24  Fwd IAT Mean       float64
 25  Fwd IAT Std        floa

None

Dataframe: '02-14-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 1044751 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1044751 non-null  int64  
 1   Protocol           1044751 non-null  int64  
 2   Timestamp          1044751 non-null  object 
 3   Flow Duration      1044751 non-null  int64  
 4   Tot Fwd Pkts       1044751 non-null  int64  
 5   Tot Bwd Pkts       1044751 non-null  int64  
 6   TotLen Fwd Pkts    1044751 non-null  int64  
 7   TotLen Bwd Pkts    1044751 non-null  int64  
 8   Fwd Pkt Len Max    1044751 non-null  int64  
 9   Fwd Pkt Len Min    1044751 non-null  int64  
 10  Fwd Pkt Len Mean   1044751 non-null  float64
 11  Fwd Pkt Len Std    1044751 non-null  float64
 12  Bwd Pkt Len Max    1044751 non-null  int64  
 13  Bwd Pkt Len Min    1044751 non-null  int64  
 14  Bwd Pkt Len Mean   1044751 non-null  float64
 15  Bwd Pkt Len Std    1044751 non-null  

None

Dataframe: '02-23-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,6,23/02/2018 08:18:29,1532698,11,11,1179,1969,648,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,500,17,23/02/2018 08:17:45,117573855,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58786927.5,23753240.0,75583006,41990849,Benign
2,500,17,23/02/2018 08:17:45,117573848,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58786924.0,23753250.0,75583007,41990841,Benign
3,22,6,23/02/2018 08:19:55,1745392,11,11,1179,1969,648,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,500,17,23/02/2018 08:18:17,89483474,6,0,3000,0,500,500,...,8,4000364.0,0.0,4000364,4000364,21370777.5,15280920.0,41989576,7200485,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 1042867 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1042867 non-null  int64  
 1   Protocol           1042867 non-null  int64  
 2   Timestamp          1042867 non-null  object 
 3   Flow Duration      1042867 non-null  int64  
 4   Tot Fwd Pkts       1042867 non-null  int64  
 5   Tot Bwd Pkts       1042867 non-null  int64  
 6   TotLen Fwd Pkts    1042867 non-null  int64  
 7   TotLen Bwd Pkts    1042867 non-null  int64  
 8   Fwd Pkt Len Max    1042867 non-null  int64  
 9   Fwd Pkt Len Min    1042867 non-null  int64  
 10  Fwd Pkt Len Mean   1042867 non-null  float64
 11  Fwd Pkt Len Std    1042867 non-null  float64
 12  Bwd Pkt Len Max    1042867 non-null  int64  
 13  Bwd Pkt Len Min    1042867 non-null  int64  
 14  Bwd Pkt Len Mean   1042867 non-null  float64
 15  Bwd Pkt Len Std    1042867 non-null  

None

Iterate trought all dictionary keys (in this case each key is a dataframe) and make a value count to see wich attacks there are in each day

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    count = df['Label'].value_counts()  # Perform value count on the 'A' column
    print(f"Value counts for dataframe '{key}':\n{count}\n")

Value counts for dataframe '02-28-2018':
Label
Benign           540568
Infilteration     68462
Label                33
Name: count, dtype: int64

Value counts for dataframe '03-01-2018':
Label
Benign           236632
Infilteration     92634
Label                25
Name: count, dtype: int64

Value counts for dataframe '02-16-2018':
Label
DoS attacks-Hulk            461912
Benign                      446772
DoS attacks-SlowHTTPTest    139890
Label                            1
Name: count, dtype: int64

Value counts for dataframe '02-15-2018':
Label
Benign                   988050
DoS attacks-GoldenEye     41508
DoS attacks-Slowloris     10990
Name: count, dtype: int64

Value counts for dataframe '02-21-2018':
Label
DDOS attack-HOIC        686012
Benign                  360833
DDOS attack-LOIC-UDP      1730
Name: count, dtype: int64

Value counts for dataframe '03-02-2018':
Label
Benign    758334
Bot       286191
Name: count, dtype: int64

Value counts for dataframe '02-22-2018':
Label
Be

## 3. Convert Timestamp values to pandas date and time datetime64 format

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')

## 4. Convert datetime64 format to epoch format (epoch is Unix epoch time format, wich represents the number of seconds elapsed since January 1, 1970, at 00:00:00 UTC)

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    df['Timestamp'] = (df['Timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

## 5. Change all misclassified datatypes to float and substitute errors by NaN

In [None]:
#Iterate throught each dataframe in the dictionary
for key in dfs.keys():
    df = dfs[key]
    for col in df.columns:
        #Check if the datatype of the column is object
        if df[col].dtype == 'object' and col != 'Label':
            # Change all values to numeric, and to NaN if it is a strig
            df[col] = pd.to_numeric(df[col], errors='coerce')



Count how many Na's there are in the dataframes:

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    print(f"Dataframe: '{key}'\n")
    count_NA = df.isna().sum()
    print(count_NA)

Dataframe: '02-28-2018'

Dst Port         33
Protocol         33
Timestamp        33
Flow Duration    33
Tot Fwd Pkts     33
                 ..
Idle Mean        33
Idle Std         33
Idle Max         33
Idle Min         33
Label             0
Length: 80, dtype: int64
Dataframe: '03-01-2018'

Dst Port         25
Protocol         25
Timestamp        25
Flow Duration    25
Tot Fwd Pkts     25
                 ..
Idle Mean        25
Idle Std         25
Idle Max         25
Idle Min         25
Label             0
Length: 80, dtype: int64
Dataframe: '02-16-2018'

Dst Port         1
Protocol         1
Timestamp        1
Flow Duration    1
Tot Fwd Pkts     1
                ..
Idle Mean        1
Idle Std         1
Idle Max         1
Idle Min         1
Label            0
Length: 80, dtype: int64
Dataframe: '02-15-2018'

Dst Port         0
Protocol         0
Timestamp        0
Flow Duration    0
Tot Fwd Pkts     0
                ..
Idle Mean        0
Idle Std         0
Idle Max         0
Idle 

Drop Na values

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    df.dropna(inplace=True)

## 6. Outliers Filtering

The objective of this filtering is removing extremely high values from the dataset that may have come from data noise

In [None]:
import numpy as np
from scipy import stats

for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key


    # Define a function to filter outliers using Z-score
    def filter_outliers_zscore(data, threshold):
        z_scores = np.abs(stats.zscore(data))
        outlier_mask = (z_scores > threshold).any(axis=1)
        return data[~outlier_mask]

    # Define a threshold value
    threshold = 7

    # Loop through the columns of the dataframe and filter outliers in each column
    filtered_cols = []

    for col in df.columns:
        if col != 'Label':
            filtered_col = filter_outliers_zscore(df[[col]], threshold)
            filtered_cols.append(filtered_col)

    # Combine the filtered columns back into a dataframe
    df_filtered = pd.concat(filtered_cols, axis=1)
    display(df.head())
    display(df_filtered.head())


Script for calculating z_score percentage equivalent:

In [None]:
from scipy.stats import norm
z_score= 7
percentage = (1- norm.cdf(z_score))*100

print("{:.10f}".format(percentage))

0.0000000001


Comparison between original and removed dataframes:

In [None]:

from scipy import stats

# # Set the random seed for reproducibility
np.random.seed(134)

for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key

    # Define a function to filter outliers using Z-score
    def filter_outliers_zscore(data, threshold):
        z_scores = np.abs(stats.zscore(data))
        outlier_mask = (z_scores > threshold).any(axis=1)
        return data[~outlier_mask], data[outlier_mask]

    # Define a threshold value
    threshold = 7

    # The filtering was removing all DDoS-LOIC-UDP, so we will not execute it on them

    if key == '02-21-2018':
        df_temp = df[df['Label'] == 'DDOS attack-LOIC-UDP']
        df = df[df['Label'] != 'DDOS attack-LOIC-UDP']


    # Loop through the columns of the dataframe and filter outliers in each column
    filtered_cols = []
    removed_outliers = []
    for col in df.columns:
        if col != 'Label':
            filtered_col, outliers = filter_outliers_zscore(df[[col]], threshold)

            filtered_cols.append(filtered_col)
            removed_outliers.append(outliers)

    # Combine the filtered columns back into a dataframe
    df_filtered = pd.concat(filtered_cols, axis=1)

    # Combine the removed outliers back into a dataframe
    df_outliers = pd.concat(removed_outliers, axis=1)






    # Dataframe filtering comparison

    print(f'\nDataframe: {key} Shape: {df.shape}')

    # Print the number of outliers removed for each column

    print('Outlier removal summary:')
    n_outliers = df_outliers.shape[0]
    print(f'{n_outliers} outliers rows to be removed')

    # Print the original dataframe and the filtered dataframe side by side
    print('\nOriginal dataframe:')
    display(df.head())

    # Assign filtered dataframe columns to original one
    columns = [col for col in df.columns if col != 'Label']
    df.loc[:,columns] = df_filtered.loc[:,columns]


    # Recombine rows from 'DDOS attack-LOIC-UDP'
    if key == '02-21-2018':
        df = pd.concat([df,df_temp])


    print('\nFiltered dataframe:')
    display(df.head())

    # Print the removed outliers dataframe
    print('\nRemoved outliers:')
    display(df_outliers.head())

    # Count how many rows of each attack were removed from original dataframe
    values_orig = df.loc[df.index.isin(df_outliers.index), 'Label']
    print(f'\n{values_orig.value_counts()}')


  a_zero_mean = a - mean
  a_zero_mean = a - mean



Dataframe: 02-28-2018 Shape: (609030, 80)
Outlier removal summary:
31452 outliers rows to be removed

Original dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443.0,6.0,1519806000.0,94658.0,6.0,7.0,708.0,3718.0,387.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,443.0,6.0,1519806000.0,206.0,2.0,0.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,445.0,6.0,1519806000.0,165505.0,3.0,1.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443.0,6.0,1519806000.0,102429.0,6.0,7.0,708.0,3718.0,387.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443.0,6.0,1519806000.0,167.0,2.0,0.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign



Filtered dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443.0,6.0,1519806000.0,94658.0,6.0,7.0,708.0,3718.0,387.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,443.0,6.0,1519806000.0,206.0,2.0,0.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,445.0,6.0,1519806000.0,165505.0,3.0,1.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443.0,6.0,1519806000.0,102429.0,6.0,7.0,708.0,3718.0,387.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443.0,6.0,1519806000.0,167.0,2.0,0.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign



Removed outliers:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
35070,,,,,10016.0,24197.0,,34895410.0,,,...,,,,,,,,,,
43313,,,,,2039.0,3863.0,,5618659.0,,,...,,,,,,,,,,
46951,,,,,4596.0,9722.0,,13892215.0,,,...,,,,,,,,,,
80266,,,,,9758.0,24263.0,,34893950.0,,,...,,,,,,,,,,
84253,,,,,6103.0,13378.0,,19272652.0,,,...,,,,,,,,,,



Label
Benign           27743
Infilteration     3709
Name: count, dtype: int64


  a_zero_mean = a - mean
  a_zero_mean = a - mean



Dataframe: 03-01-2018 Shape: (329266, 80)
Outlier removal summary:
18650 outliers rows to be removed

Original dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.0,1519892000.0,115307855.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1812348.0,0.0,1812348.0,1812348.0,56700000.0,6010058.0,61000000.0,52500000.0,Benign
1,0.0,0.0,1519892000.0,60997457.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,61000000.0,0.0,61000000.0,61000000.0,Benign
2,67.0,17.0,1519892000.0,61149019.0,5.0,0.0,1500.0,0.0,300.0,300.0,...,8.0,3530939.0,0.0,3530939.0,3530939.0,19200000.0,12500000.0,32600000.0,7999725.0,Benign
3,0.0,0.0,1519893000.0,60997555.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,61000000.0,0.0,61000000.0,61000000.0,Benign
4,0.0,0.0,1519893000.0,61997503.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,999909.0,0.0,999909.0,999909.0,61000000.0,0.0,61000000.0,61000000.0,Benign



Filtered dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.0,1519892000.0,115307855.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1812348.0,0.0,1812348.0,1812348.0,56700000.0,6010058.0,61000000.0,52500000.0,Benign
1,0.0,0.0,1519892000.0,60997457.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,61000000.0,0.0,61000000.0,61000000.0,Benign
2,67.0,17.0,1519892000.0,61149019.0,5.0,0.0,1500.0,0.0,300.0,300.0,...,8.0,3530939.0,0.0,3530939.0,3530939.0,19200000.0,12500000.0,32600000.0,7999725.0,Benign
3,0.0,0.0,1519893000.0,60997555.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,61000000.0,0.0,61000000.0,61000000.0,Benign
4,0.0,0.0,1519893000.0,61997503.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,999909.0,0.0,999909.0,999909.0,61000000.0,0.0,61000000.0,61000000.0,Benign



Removed outliers:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
561,,,,,2226.0,4918.0,,7154898.0,,,...,,,,,,,,,,
7010,,,,,4278.0,9580.0,,13800000.0,,,...,,,,,,,,,,
26537,,,,,670.0,1556.0,,2179217.0,,,...,,,,,,,,,,
29891,,,,,12387.0,26491.0,,38600000.0,,,...,,,,19000000.0,,,,40300000.0,,
39710,,,,,731.0,1578.0,,2221960.0,,,...,,,,,,,,,,



Label
Benign           14581
Infilteration     4069
Name: count, dtype: int64

Dataframe: 02-16-2018 Shape: (1048574, 80)
Outlier removal summary:
12035 outliers rows to be removed

Original dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.0,1518770000.0,112640768.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,56300000.0,138.592929,56300000.0,56300000.0,Benign
1,0.0,0.0,1518770000.0,112641773.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,56300000.0,263.750829,56300000.0,56300000.0,Benign
2,35605.0,6.0,1518770000.0,20784143.0,23.0,44.0,2416.0,1344.0,240.0,64.0,...,20.0,2624734.0,0.0,2624734.0,2624734.0,9058214.0,0.0,9058214.0,9058214.0,Benign
3,0.0,0.0,1518770000.0,112640836.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,56300000.0,82.024387,56300000.0,56300000.0,Benign
4,23.0,6.0,1518770000.0,20.0,1.0,1.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign



Filtered dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,,1518770000.0,,3.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,0.0,0.0,,138.592929,,,Benign
1,0.0,,1518770000.0,,3.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,0.0,0.0,,263.750829,,,Benign
2,35605.0,6.0,1518770000.0,20784143.0,,,2416.0,1344.0,240.0,,...,20.0,,0.0,,,9058214.0,0.0,9058214.0,9058214.0,Benign
3,0.0,,1518770000.0,,3.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,0.0,0.0,,82.024387,,,Benign
4,23.0,6.0,1518770000.0,20.0,1.0,1.0,0.0,0.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign



Removed outliers:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,,0.0,,112640768.0,,,,,,,...,,0.0,,,,,56300000.0,,56300000.0,56300000.0
1,,0.0,,112641773.0,,,,,,,...,,0.0,,,,,56300000.0,,56300000.0,56300000.0
3,,0.0,,112640836.0,,,,,,,...,,0.0,,,,,56300000.0,,56300000.0,56300000.0
5,,0.0,,112641178.0,,,,,,,...,,0.0,,,,,56300000.0,,56300000.0,56300000.0
6,,0.0,,112641264.0,,,,,,,...,,0.0,,,,,56300000.0,,56300000.0,56300000.0



Label
Benign              7646
DoS attacks-Hulk    4389
Name: count, dtype: int64

Dataframe: 02-15-2018 Shape: (1040548, 80)
Outlier removal summary:
48379 outliers rows to be removed

Original dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,1518683118,112641158,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320579.0,704.2784,56321077,56320081,Benign
1,22,6,1518683345,37366762,14,12,2168,2993,712,0,...,32,1024353.0,649038.754495,1601183,321569,11431221.0,3644991.0,15617415,8960247,Benign
2,47514,6,1518683382,543,2,0,64,0,64,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,0,0,1518683287,112640703,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320351.5,366.9884,56320611,56320092,Benign
4,0,0,1518683456,112640874,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320437.0,719.8347,56320946,56319928,Benign



Filtered dataframe:


  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
 1.0000000e+00]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
 3.4000000e+01]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,1518683118,112641158,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56320579.0,704.2784,56321077.0,56320081.0,Benign
1,22,6,1518683345,37366762,14.0,12.0,2168.0,2993.0,712.0,0.0,...,32,1024353.0,649038.754495,1601183.0,321569.0,11431221.0,3644991.0,15617415.0,8960247.0,Benign
2,47514,6,1518683382,543,2.0,0.0,64.0,0.0,64.0,0.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,0,0,1518683287,112640703,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56320351.5,366.9884,56320611.0,56320092.0,Benign
4,0,0,1518683456,112640874,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56320437.0,719.8347,56320946.0,56319928.0,Benign



Removed outliers:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
5104,,,,,513.0,2036.0,8479538.0,,64440.0,,...,502.0,,,,,,,,,
5105,,,,,565.0,2792.0,8459810.0,,60144.0,,...,556.0,,,,,,,,,
28974,,,,,1381.0,3024.0,8613317.0,,34368.0,,...,1370.0,,,,,,,,,
28975,,,,,1374.0,3145.0,8606698.0,,45824.0,,...,1365.0,,,,,,,,,
28976,,,,,1422.0,3326.0,8579450.0,,20048.0,,...,1412.0,,,,,,,,,



Label
Benign                   38529
DoS attacks-Slowloris     8266
DoS attacks-GoldenEye     1584
Name: count, dtype: int64

Dataframe: 02-21-2018 Shape: (1046845, 80)
Outlier removal summary:
2478 outliers rows to be removed

Original dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6,1519202005,37953,5,3,135,127,135,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,500,17,1519201986,117573474,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58800000.0,23800000.0,75600000,42000000,Benign
2,500,17,1519201986,117573474,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58800000.0,23800000.0,75600000,42000000,Benign
3,500,17,1519201991,99743998,5,0,2500,0,500,500,...,8,4000290.0,0.0,4000290,4000290,31900000.0,37900000.0,75600000,7200397,Benign
4,500,17,1519201991,99743999,5,0,2500,0,500,500,...,8,4000286.0,0.0,4000286,4000286,31900000.0,37900000.0,75600000,7200399,Benign


  df.loc[:,columns] = df_filtered.loc[:,columns]
 1.51918041e+09 1.51918041e+09]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns


Filtered dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,6.0,,37953.0,5.0,3.0,135.0,127.0,135,0.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,500,,,,3.0,0.0,1500.0,0.0,500,,...,,0.0,0.0,0.0,0.0,,,,,Benign
2,500,,,,3.0,0.0,1500.0,0.0,500,,...,,0.0,0.0,0.0,0.0,,,,,Benign
3,500,,,,5.0,0.0,2500.0,0.0,500,,...,,,0.0,,,,,,,Benign
4,500,,,,5.0,0.0,2500.0,0.0,500,,...,,,0.0,,,,,,,Benign



Removed outliers:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
1,,17.0,1519202000.0,117573474.0,,,,,,500.0,...,,8.0,,,,,58800000.0,23800000.0,75600000.0,42000000.0
2,,17.0,1519202000.0,117573474.0,,,,,,500.0,...,,8.0,,,,,58800000.0,23800000.0,75600000.0,42000000.0
3,,17.0,1519202000.0,99743998.0,,,,,,500.0,...,,8.0,4000290.0,,4000290.0,4000290.0,31900000.0,37900000.0,75600000.0,7200397.0
4,,17.0,1519202000.0,99743999.0,,,,,,500.0,...,,8.0,4000286.0,,4000286.0,4000286.0,31900000.0,37900000.0,75600000.0,7200399.0
5,,17.0,1519202000.0,89479580.0,,,,,,500.0,...,,8.0,4000308.0,,4000308.0,4000308.0,21400000.0,15300000.0,42000000.0,7200316.0



Label
Benign              2307
DDOS attack-HOIC     171
Name: count, dtype: int64

Dataframe: 03-02-2018 Shape: (1044525, 80)
Outlier removal summary:
35037 outliers rows to be removed

Original dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,1519980458,141385,9,7,553,3773.0,202,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,49684,6,1519980458,281,2,1,38,0.0,38,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,443,6,1519980460,279824,11,15,1086,10527.0,385,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443,6,1519980460,132,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443,6,1519980461,274016,9,13,1285,6141.0,517,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign



Filtered dataframe:


  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,1519980458,141385,9.0,7.0,553.0,3773.0,202,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,49684,6,1519980458,281,2.0,1.0,38.0,0.0,38,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,443,6,1519980460,279824,11.0,15.0,1086.0,10527.0,385,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443,6,1519980460,132,2.0,0.0,0.0,0.0,0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443,6,1519980461,274016,9.0,13.0,1285.0,6141.0,517,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign



Removed outliers:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
838,,,,,1271.0,,57543.0,,,,...,1128.0,,,,,,,,,
839,,,,,830.0,,115759.0,,,,...,829.0,,,17200000.0,39600000.0,,,,,
1150,,,,,767.0,,27538.0,,,,...,583.0,,,,,,,,,
1206,,,,,1116.0,,34761.0,,,,...,717.0,,,,,,,,,
1250,,,,,736.0,,29866.0,,,,...,644.0,,,,,,,,,



Label
Benign    34609
Bot         428
Name: count, dtype: int64

Dataframe: 02-22-2018 Shape: (1042965, 80)
Outlier removal summary:
31187 outliers rows to be removed

Original dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,6,1519287963,20553406,10,7,1063,1297,744,0,...,20,1027304.0,0.0,1027304,1027304,19526080.0,0.0,19526080,19526080,Benign
1,34989,6,1519287984,790,2,0,848,0,848,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,500,17,1519287910,99745913,5,0,2500,0,500,500,...,8,4000203.0,0.0,4000203,4000203,31915240.0,37927870.0,75584115,7200679,Benign
3,500,17,1519287910,99745913,5,0,2500,0,500,500,...,8,4000189.0,0.0,4000189,4000189,31915240.0,37927880.0,75584130,7200693,Benign
4,500,17,1519287899,89481361,6,0,3000,0,500,500,...,8,4000554.0,0.0,4000554,4000554,21370200.0,15281090.0,41990741,7200848,Benign



Filtered dataframe:


 1.51929244e+09 1.51927407e+09]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
 2.0134030e+06]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
 9.5319400e+05]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
 3.000000e+00]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filte

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,6,1519288000.0,20553406.0,10.0,7.0,1063.0,1297.0,744.0,0.0,...,20,1027304.0,0.0,1027304.0,1027304.0,19526080.0,0.0,19526080.0,19526080.0,Benign
1,34989,6,1519288000.0,790.0,2.0,0.0,848.0,0.0,848.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,500,17,1519288000.0,99745913.0,5.0,0.0,2500.0,0.0,500.0,,...,8,4000203.0,0.0,4000203.0,4000203.0,31915240.0,37927870.0,75584115.0,7200679.0,Benign
3,500,17,1519288000.0,99745913.0,5.0,0.0,2500.0,0.0,500.0,,...,8,4000189.0,0.0,4000189.0,4000189.0,31915240.0,37927880.0,75584130.0,7200693.0,Benign
4,500,17,1519288000.0,89481361.0,6.0,0.0,3000.0,0.0,500.0,,...,8,4000554.0,0.0,4000554.0,4000554.0,21370200.0,15281090.0,41990741.0,7200848.0,Benign



Removed outliers:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
246433,,,788666.0,-188505000000.0,,,,,,,...,,,,,,,197474800000.0,141251700000.0,324807000000.0,21756000000.0
246434,,,907536.0,-74877000000.0,,,,,,,...,,,,,,,305678500000.0,99310480000.0,450862000000.0,239934000000.0
246435,,,882750.0,,,,,,,,...,,,,,,,375976000000.0,262247900000.0,846458000000.0,21291000000.0
246436,,,877892.0,-828220000000.0,,,,,,,...,,,,,,,,,,
246437,,,974449.0,-578768000000.0,,,,,,,...,,,,,,,395571400000.0,189762200000.0,722821000000.0,6352000000.0



Label
Benign              31075
Brute Force -Web       71
Brute Force -XSS       37
SQL Injection           4
Name: count, dtype: int64

Dataframe: 02-20-2018 Shape: (7889295, 81)
Outlier removal summary:
323450 outliers rows to be removed

Original dataframe:


Unnamed: 0,Src Port,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,45498,22,6,1519115647,888751,11,11,1249.0,1969.0,736.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0,0,0,1519115602,112642816,3,0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,7.071068,56300000.0,56300000.0,Benign
2,0,0,0,1519115771,112642712,3,0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,18.384776,56300000.0,56300000.0,Benign
3,0,0,0,1519115940,112642648,3,0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,5.656854,56300000.0,56300000.0,Benign
4,0,0,0,1519116109,112642702,3,0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,65.053824,56300000.0,56300000.0,Benign


  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]



Filtered dataframe:


Unnamed: 0,Src Port,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,45498,22,6,1519115647,888751,11.0,11.0,1249.0,1969.0,736.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0,0,0,1519115602,112642816,3.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,7.071068,56300000.0,56300000.0,Benign
2,0,0,0,1519115771,112642712,3.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,18.384776,56300000.0,56300000.0,Benign
3,0,0,0,1519115940,112642648,3.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,5.656854,56300000.0,56300000.0,Benign
4,0,0,0,1519116109,112642702,3.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56300000.0,65.053824,56300000.0,56300000.0,Benign



Removed outliers:


Unnamed: 0,Src Port,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
415149,,,,,,280043.0,,8961376.0,,,...,280042.0,,,,,,,,,
415150,,,,,,229456.0,,7342592.0,,,...,229455.0,,,,,,,,,
415151,,,,,,213005.0,,6816160.0,,,...,213004.0,,,,,,,,,
415152,,,,,,194073.0,,6210336.0,,,...,194072.0,,,,,,,,,
415153,,,,,,220366.0,,7051712.0,,,...,220365.0,,,,,,,,,



Label
Benign                    320606
DDoS attacks-LOIC-HTTP      2844
Name: count, dtype: int64

Dataframe: 02-14-2018 Shape: (1044751, 80)
Outlier removal summary:
33654 outliers rows to be removed

Original dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,1518597061,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,1518597230,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,1518597399,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,1518597613,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,1518597623,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign



Filtered dataframe:


 1.51860561e+09 1.51860561e+09]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
 5.23551100e+06 5.80725600e+06]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
 3.7000000e+01]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
 5.23551100e+06 5.80725600e+06]' has dtype incompatible with int64, please explicitly cast to a compatible dtype firs

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,1518597000.0,112641719.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56320859.5,139.300036,56320958.0,56320761.0,Benign
1,0,0,1518597000.0,112641466.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56320733.0,114.551299,56320814.0,56320652.0,Benign
2,0,0,1518597000.0,112638623.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,56319311.5,301.934596,56319525.0,56319098.0,Benign
3,22,6,1518598000.0,6453966.0,15.0,10.0,1239.0,2273.0,744.0,0.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,22,6,1518598000.0,8804066.0,14.0,11.0,1143.0,2209.0,744.0,0.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign



Removed outliers:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
410956,,,356477.0,-11873000000.0,,,,,,,...,,,,,,,,,,
410957,,,631953.0,-681402000000.0,,,,,,,...,,,,,,,,,,
410958,,,976676.0,-919011000000.0,,,,,,,...,,,,,,,284112500000.0,193152400000.0,754847000000.0,12603000000.0
410959,,,983710.0,-273850000000.0,,,,,,,...,,,,,,,339450300000.0,243268200000.0,979781000000.0,7758000000.0
412184,,,985452.0,-529801000000.0,505.0,,,,,,...,,,,,,,333755600000.0,237751700000.0,948431000000.0,4908000000.0



Label
Benign    33654
Name: count, dtype: int64

Dataframe: 02-23-2018 Shape: (1042867, 80)
Outlier removal summary:
35803 outliers rows to be removed

Original dataframe:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,6,1519373909,1532698,11,11,1179,1969,648,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,500,17,1519373865,117573855,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58786927.5,23753240.0,75583006,41990849,Benign
2,500,17,1519373865,117573848,3,0,1500,0,500,500,...,8,0.0,0.0,0,0,58786924.0,23753250.0,75583007,41990841,Benign
3,22,6,1519373995,1745392,11,11,1179,1969,648,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,500,17,1519373897,89483474,6,0,3000,0,500,500,...,8,4000364.0,0.0,4000364,4000364,21370777.5,15280920.0,41989576,7200485,Benign



Filtered dataframe:


  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
 1.41477e+05]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,columns] = df_filtered.loc[:,columns]
  df.loc[:,colum

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,6,1519373909,1532698,11.0,11.0,1179.0,1969.0,648.0,0.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0,0,Benign
1,500,17,1519373865,117573855,3.0,0.0,1500.0,0.0,500.0,,...,8,0.0,0.0,0.0,0.0,58786927.5,,75583006,41990849,Benign
2,500,17,1519373865,117573848,3.0,0.0,1500.0,0.0,500.0,,...,8,0.0,0.0,0.0,0.0,58786924.0,,75583007,41990841,Benign
3,22,6,1519373995,1745392,11.0,11.0,1179.0,1969.0,648.0,0.0,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0,0,Benign
4,500,17,1519373897,89483474,6.0,0.0,3000.0,0.0,500.0,,...,8,4000364.0,0.0,4000364.0,4000364.0,21370777.5,,41989576,7200485,Benign



Removed outliers:


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
5390,,,,,927.0,,29453.0,,,,...,200.0,,11370730.0,,11370730.0,11370730.0,,,,
5408,,,,,2627.0,5283.0,,7604995.0,,,...,,,,3867811.0,12271295.0,,,,,
10930,,,,,2075.0,4009.0,,5742019.0,,,...,115.0,,10826702.0,,10826702.0,10826702.0,,,,
19395,,,,,1294.0,22364.0,,32591560.0,,,...,,,23958133.0,,23958133.0,23958133.0,,,,
29302,,,,,2860.0,38417.0,,56069495.0,,,...,,,,,,,,,,



Label
Benign              35531
Brute Force -Web      190
Brute Force -XSS       77
SQL Injection           5
Name: count, dtype: int64


Count how many Na there are in the dataframes:

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    print(f"Dataframe: '{key}'\n")
    count_NA = df.isna().sum()
    print(count_NA)

Dataframe: '02-28-2018'

Dst Port            0
Protocol            0
Timestamp           0
Flow Duration       0
Tot Fwd Pkts       57
                 ... 
Idle Mean         142
Idle Std         3601
Idle Max          126
Idle Min          153
Label               0
Length: 80, dtype: int64
Dataframe: '03-01-2018'

Dst Port            0
Protocol            0
Timestamp           0
Flow Duration       0
Tot Fwd Pkts       76
                 ... 
Idle Mean           9
Idle Std         2361
Idle Max            0
Idle Min           14
Label               0
Length: 80, dtype: int64
Dataframe: '02-16-2018'

Dst Port            0
Protocol          177
Timestamp           0
Flow Duration    8800
Tot Fwd Pkts       36
                 ... 
Idle Mean        8795
Idle Std           19
Idle Max         8797
Idle Min         8795
Label               0
Length: 80, dtype: int64
Dataframe: '02-15-2018'

Dst Port            0
Protocol            0
Timestamp           0
Flow Duration       0
Tot Fwd Pkt

Drop Na values

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    df.dropna(inplace=True)

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    print(df.shape)

(577578, 80)
(310616, 80)
(1036539, 80)
(992169, 80)
(1048575, 80)
(1009488, 80)
(1011778, 80)
(7565845, 81)
(1011097, 80)
(1007064, 80)


## 7. Normalization

In [None]:
for key in dfs.keys():
       df = dfs[key]  # Get the dataframe corresponding to the key
       print(f"Dataframe: '{key}'\n")
       display(df.describe())

Dataframe: '02-28-2018'



  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0,...,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0,577578.0
mean,7170.355609,10.037747,1519802000.0,10741260.0,4.461875,4.122764,334.88365,1437.932316,167.851861,15.214754,...,1.891744,15.994937,104952.8,36889.48,163652.8,83310.21,3317246.0,96470.41,3395919.0,3209352.0
std,17006.779367,5.361862,14206.07,30622930.0,8.36153,9.538951,861.060033,8144.815179,275.228847,20.553933,...,4.662804,6.672846,677786.8,296751.6,926882.2,629010.9,11189670.0,764316.9,11342990.0,11070220.0
min,0.0,0.0,1519780000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,6.0,1519787000.0,274.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,443.0,6.0,1519808000.0,2054.0,2.0,1.0,42.0,86.0,40.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2047.0,17.0,1519815000.0,1386482.0,6.0,5.0,373.0,355.0,193.0,36.0,...,3.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65533.0,17.0,1519823000.0,120000000.0,1110.0,450.0,57019.0,616364.0,1968.0,190.0,...,1109.0,44.0,23500000.0,14544520.0,32637530.0,21000000.0,84600000.0,14428170.0,84600000.0,84600000.0


Dataframe: '03-01-2018'



  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0,...,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0,310616.0
mean,7681.208128,9.745686,1519887000.0,11950110.0,4.557782,4.047203,332.831573,1187.849905,167.483211,14.293246,...,1.946487,16.207253,98520.43,35934.55,152096.8,76604.45,3651555.0,109684.1,3742385.0,3530737.0
std,17022.385545,5.387199,13705.39,32715790.0,7.339225,8.554676,808.379976,6353.951126,263.342858,20.20742,...,3.540871,7.045044,648341.2,288648.6,861454.6,602715.0,12372380.0,853574.8,12536140.0,12248250.0
min,0.0,0.0,1519866000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,6.0,1519873000.0,276.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,443.0,6.0,1519882000.0,2280.0,2.0,1.0,42.0,81.0,40.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3389.0,17.0,1519900000.0,1430229.0,8.0,6.0,373.0,392.0,194.0,35.0,...,3.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65535.0,17.0,1519909000.0,120000000.0,519.0,466.0,100441.0,501345.0,1968.0,272.0,...,280.0,48.0,21900000.0,13200000.0,29300000.0,18400000.0,84400000.0,14800000.0,84400000.0,84400000.0


Dataframe: '02-16-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,...,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0,1036539.0
mean,19922.29,6.0,1518750000.0,1904887.0,3.19803,1.623287,399.8604,184.2974,399.3623,0.0,...,0.4414528,33.06835,0.9617265,0.0,0.9617265,0.9617265,252068.5,0.0,252068.5,252068.5
std,23750.05,0.0,10755.48,2212364.0,1.590024,1.650284,458.2399,243.2887,458.0075,0.0,...,0.5002269,2.762584,118.1721,0.0,118.1721,118.1721,1127024.0,0.0,1127024.0,1127024.0
min,21.0,6.0,1518744000.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,6.0,1518746000.0,2361.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,6.0,1518746000.0,48125.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,44304.0,6.0,1518746000.0,4256343.0,5.0,3.0,935.0,348.0,935.0,0.0,...,1.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,60998.0,6.0,1518785000.0,12757860.0,9.0,10.0,1656.0,2496.0,935.0,0.0,...,4.0,40.0,35754.0,0.0,35754.0,35754.0,5528848.0,0.0,5528848.0,5528848.0


Dataframe: '02-15-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0,...,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0,992169.0
mean,6493.036612,9.700644,1518681000.0,12832890.0,4.345984,4.006306,298.110155,1623.094842,150.552403,14.132048,...,1.762919,16.32288,47390.55,27295.28,95373.43,32665.29,3399977.0,53862.37,3438787.0,3326073.0
std,16555.276929,5.365924,13736.25,31788960.0,6.779702,10.689804,785.520578,11771.232886,237.012596,20.205812,...,3.317067,7.448484,314890.5,164739.0,503793.9,290798.8,11801380.0,559139.4,11888920.0,11717920.0
min,0.0,0.0,1518656000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,6.0,1518667000.0,456.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,6.0,1518686000.0,36958.0,2.0,1.0,44.0,102.0,41.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,445.0,17.0,1518692000.0,2880065.0,4.0,4.0,316.0,488.0,161.0,35.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65534.0,17.0,1518700000.0,120000000.0,240.0,458.0,33521.0,610885.0,1968.0,175.0,...,41.0,56.0,6553140.0,2699346.0,8648668.0,6179575.0,101706400.0,16778270.0,101706400.0,101706400.0


Dataframe: '02-21-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,...,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,19587.64,6.037952,1519180000.0,399040.1,196.8099,1.309663,6563.581,248.6527,366.6938,0.2549479,...,194.1294,19.95892,7521.672,2589.427,10036.09,5446.253,31405.14,16026.45,57230.7,16334.81
std,27193.38,0.6495848,1774.113,6630337.0,4899.047,1.477423,156759.6,325.7857,422.696,8.978105,...,4899.113,0.7395666,203344.9,103444.8,282523.9,156878.9,841996.7,434022.9,1421182.0,589462.8
min,0.0,0.0,1519178000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,6.0,1519179000.0,1408.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,6.0,1519180000.0,4720.0,3.0,0.0,247.0,0.0,32.0,0.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,53553.0,6.0,1519180000.0,14649.0,5.0,2.0,935.0,316.0,935.0,0.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65534.0,17.0,1519210000.0,120000000.0,309629.0,124.0,9908128.0,9121.0,2224.0,976.0,...,309628.0,44.0,20000000.0,16200000.0,28000000.0,20000000.0,106000000.0,50300000.0,106000000.0,106000000.0


Dataframe: '03-02-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,...,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0,1009488.0
mean,8096.96,8.117884,1519975000.0,9769624.0,4.635036,3.8564,356.4585,1031.235,211.0056,8.069473,...,1.86505,17.43518,33693.87,16059.26,64595.8,25432.64,4046526.0,24338.45,4061512.0,4005787.0
std,14627.42,4.461341,14667.59,27499530.0,7.249616,8.566998,587.7943,6702.563,266.5331,16.67014,...,3.089614,5.16228,367429.1,159005.8,527595.5,350805.4,16487910.0,267691.6,16501290.0,16471510.0
min,0.0,0.0,1519952000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,6.0,1519961000.0,519.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3389.0,6.0,1519981000.0,11163.0,3.0,1.0,62.0,129.0,46.0,0.0,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8080.0,6.0,1519989000.0,2024434.0,6.0,5.0,364.0,488.0,326.0,0.0,...,3.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65534.0,17.0,1519996000.0,120000000.0,522.0,813.0,15510.0,982644.0,1460.0,137.0,...,98.0,44.0,12400000.0,8798243.0,19200000.0,12400000.0,120000000.0,10300000.0,120000000.0,120000000.0


Dataframe: '02-22-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,...,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0,1011778.0
mean,8415.083,9.204687,1519285000.0,17767720.0,4.984805,4.120253,354.4642,1553.536,176.5995,12.32186,...,2.095751,16.1334,27097.86,18821.14,61427.62,17237.42,9143422.0,88765.15,9205230.0,9046527.0
std,17536.41,5.152191,14124.7,35501260.0,7.450428,10.88784,756.6114,12148.28,265.6167,19.95305,...,3.423974,5.900407,188351.7,118497.5,346617.2,171074.1,24561760.0,1368473.0,24642420.0,24526310.0
min,0.0,0.0,1519261000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,6.0,1519271000.0,596.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,443.0,6.0,1519290000.0,86478.5,2.0,1.0,45.0,106.0,41.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3389.0,17.0,1519298000.0,5012308.0,8.0,6.0,455.0,582.0,187.0,33.0,...,4.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65533.0,17.0,1519304000.0,120000000.0,434.0,831.0,28790.0,1104225.0,1968.0,211.0,...,38.0,44.0,4270818.0,1796655.0,5805859.0,4179446.0,119998500.0,74935990.0,119998500.0,119998500.0


Dataframe: '02-20-2018'



Unnamed: 0,Src Port,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,...,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0,7565845.0
mean,46681.25,6417.406,9.539141,1519111000.0,12048380.0,4.72083,4.023582,339.7468,1197.943,173.7333,...,2.056324,15.88454,72190.01,30595.14,119730.0,54242.24,4159707.0,67855.71,4214756.0,4074360.0
std,19978.64,16219.29,5.279353,14066.87,30829390.0,12.58477,8.147632,825.5895,6046.546,259.9331,...,10.64437,6.178594,564462.7,257486.7,770204.4,519738.8,13068670.0,633975.5,13165210.0,12991330.0
min,0.0,0.0,0.0,1519088000.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49959.0,53.0,6.0,1519097000.0,511.0,1.0,1.0,20.0,0.0,20.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,52700.0,80.0,6.0,1519117000.0,51761.0,2.0,1.0,44.0,114.0,42.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,58150.0,3389.0,17.0,1519124000.0,2830910.0,7.0,6.0,436.0,964.0,194.0,...,3.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65535.0,65535.0,17.0,1519132000.0,120000000.0,8164.0,997.0,261248.0,1201779.0,1968.0,...,8163.0,48.0,22811250.0,14010500.0,30564740.0,19282500.0,108261100.0,12867400.0,108261100.0,108261100.0


Dataframe: '02-14-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,...,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0,1011097.0
mean,4617.869,8.169613,1518595000.0,6366756.0,5.005059,4.447752,379.2241,834.7496,169.2143,8.315534,...,2.648352,23.42916,14708.23,7105.414,32606.28,11848.84,1143227.0,17195.59,1153016.0,1111306.0
std,14076.74,4.505407,14437.07,22246210.0,6.989601,7.286993,750.7051,3461.69,268.5747,17.00098,...,5.011792,11.21421,155365.0,68708.2,260201.8,150615.1,6867112.0,344043.2,6894340.0,6829427.0
min,0.0,0.0,1518570000.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,6.0,1518578000.0,7.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,53.0,6.0,1518602000.0,951.0,2.0,1.0,35.0,52.0,33.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,443.0,6.0,1518607000.0,387612.0,6.0,5.0,364.0,554.0,161.0,0.0,...,3.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65533.0,17.0,1518613000.0,120000000.0,265.0,460.0,27690.0,543245.0,1460.0,137.0,...,40.0,48.0,4067542.0,1554120.0,5243341.0,3959858.0,119990700.0,70216830.0,119990700.0,119990700.0


Dataframe: '02-23-2018'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,...,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0,1007064.0
mean,8846.537,9.062825,1519371000.0,17689820.0,5.108689,4.291095,360.443,1812.014,182.5204,11.76497,...,2.104868,16.32117,29228.85,22025.1,69348.26,17673.43,9371842.0,31613.54,9392247.0,9316843.0
std,17934.67,5.082191,13873.77,35300630.0,8.634089,13.56504,681.3042,16515.94,274.7218,19.58987,...,3.420717,5.869909,222786.7,154635.6,429228.7,199503.8,24978390.0,279427.8,24988810.0,24972850.0
min,0.0,0.0,1519348000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,6.0,1519357000.0,587.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,443.0,6.0,1519376000.0,95047.5,2.0,1.0,45.0,102.0,41.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3389.0,17.0,1519383000.0,4892688.0,7.0,6.0,455.0,582.0,194.0,32.0,...,4.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65534.0,17.0,1519391000.0,119999900.0,751.0,1489.0,15495.0,1984308.0,1968.0,211.0,...,32.0,48.0,5896809.0,3639945.0,9103289.0,5147631.0,119991600.0,11285980.0,119991600.0,119991600.0


Filter  inf and NaN again because some procedure added them again to 2 datasets:

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    print(f"Dataframe: '{key}'\n")
    # replace +ve and -ve infinity with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Drop all NaN values
    df.dropna(inplace=True)

Dataframe: '02-28-2018'

Dataframe: '03-01-2018'

Dataframe: '02-16-2018'

Dataframe: '02-15-2018'

Dataframe: '02-21-2018'

Dataframe: '03-02-2018'

Dataframe: '02-22-2018'

Dataframe: '02-20-2018'

Dataframe: '02-14-2018'

Dataframe: '02-23-2018'



In [None]:
from sklearn.preprocessing import MinMaxScaler
for key in dfs.keys():
       df = dfs[key]  # Get the dataframe corresponding to the key
       columns = [col for col in df.columns if col != 'Label']
       min_max_scaler = MinMaxScaler().fit(df[columns])
       print(f"Dataframe: {key}\n")
       df[columns] = min_max_scaler.transform(df[columns])
       display(df.head())

Dataframe: 02-28-2018



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.00676,0.352941,0.614204,0.000789,0.004509,0.015556,0.012417,0.006032,0.196646,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0.00676,0.352941,0.614204,2e-06,0.000902,0.0,0.0,0.0,0.0,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,0.00679,0.352941,0.61425,0.001379,0.001803,0.002222,0.0,0.0,0.0,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,0.00676,0.352941,0.614273,0.000854,0.004509,0.015556,0.012417,0.006032,0.196646,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,0.00676,0.352941,0.614273,1e-06,0.000902,0.0,0.0,0.0,0.0,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


Dataframe: 03-01-2018



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
1,0.0,0.0,0.611287,0.508312,0.001931,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.722749,0.0,0.722749,0.722749,Benign
3,0.0,0.0,0.614111,0.508313,0.001931,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.722749,0.0,0.722749,0.722749,Benign
7,0.0,0.0,0.62263,0.508313,0.001931,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.722749,0.0,0.722749,0.722749,Benign
8,0.0,0.0,0.625454,0.508313,0.001931,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.722749,0.0,0.722749,0.722749,Benign
9,0.0,0.0,0.628278,0.508313,0.001931,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.722749,0.0,0.722749,0.722749,Benign


Dataframe: 02-16-2018



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
4,3.3e-05,0.0,0.632071,1e-06,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
10,3.3e-05,0.0,0.646748,2e-06,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
12,3.3e-05,0.0,0.666123,2e-06,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
15,0.000968,0.0,0.672987,0.015545,0.125,0.0,0.0,0.0,0.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
17,0.000968,0.0,0.673011,0.065888,0.125,0.0,0.0,0.0,0.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


Dataframe: 02-15-2018



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.0,0.618487,0.938676,0.008368,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.553756,4.2e-05,0.553761,0.553752,Benign
1,0.000336,0.352941,0.623741,0.31139,0.054393,0.026201,0.064676,0.004899,0.361789,0.0,...,0.571429,0.156315,0.240443,0.185136,0.052037,0.112394,0.217245,0.153554,0.088099,Benign
2,0.725028,0.352941,0.624598,5e-06,0.004184,0.0,0.001909,0.0,0.03252,0.0,...,0.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,0.0,0.0,0.622399,0.938673,0.008368,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.553754,2.2e-05,0.553757,0.553752,Benign
4,0.0,0.0,0.626311,0.938674,0.008368,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.553755,4.3e-05,0.55376,0.55375,Benign


Dataframe: 02-21-2018



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.001221,0.352941,0.75372,0.000316,1.3e-05,0.024194,1.4e-05,0.013924,0.060701,0.0,...,0.727273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0.00763,1.0,0.75312,0.979779,6e-06,0.0,0.000151,0.0,0.22482,0.512295,...,0.181818,0.0,0.0,0.0,0.0,0.554717,0.473161,0.713208,0.396226,Benign
2,0.00763,1.0,0.75312,0.979779,6e-06,0.0,0.000151,0.0,0.22482,0.512295,...,0.181818,0.0,0.0,0.0,0.0,0.554717,0.473161,0.713208,0.396226,Benign
3,0.00763,1.0,0.753278,0.8312,1.3e-05,0.0,0.000252,0.0,0.22482,0.512295,...,0.181818,0.200014,0.0,0.142867,0.200014,0.300943,0.753479,0.713208,0.067928,Benign
4,0.00763,1.0,0.753278,0.8312,1.3e-05,0.0,0.000252,0.0,0.22482,0.512295,...,0.181818,0.200014,0.0,0.142867,0.200014,0.300943,0.753479,0.713208,0.067928,Benign


Dataframe: 03-02-2018



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.00676,0.352941,0.649506,0.001178,0.015355,0.00861,0.035654,0.00384,0.138356,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0.758141,0.352941,0.649506,2e-06,0.001919,0.00123,0.00245,0.0,0.026027,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,0.00676,0.352941,0.649552,0.002332,0.019194,0.01845,0.070019,0.010713,0.263699,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,0.00676,0.352941,0.649552,1e-06,0.001919,0.0,0.0,0.0,0.0,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,0.00676,0.352941,0.649575,0.002283,0.015355,0.01599,0.08285,0.006249,0.35411,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


Dataframe: 02-22-2018



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000336,0.352941,0.619528,0.171278,0.020785,0.008424,0.036923,0.001175,0.378049,0.0,...,0.454545,0.24054,0.0,0.176943,0.245799,0.162719,0.0,0.162719,0.162719,Benign
1,0.533914,0.352941,0.620014,7e-06,0.002309,0.0,0.029455,0.0,0.430894,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
6,0.0,0.0,0.618162,0.938689,0.004619,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.46935,2e-06,0.469351,0.469349,Benign
13,0.0,0.0,0.622075,0.938679,0.004619,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.469345,2e-06,0.469346,0.469344,Benign
18,0.370195,0.352941,0.617862,0.044061,0.069284,0.038508,0.087808,0.00058,0.065041,0.303318,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


Dataframe: 02-20-2018



Unnamed: 0,Src Port,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.694255,0.000336,0.352941,0.630732,0.007406,0.001225,0.011033,0.004781,0.001638,0.373984,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0.0,0.0,0.0,0.629691,0.93869,0.000245,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.520039,5.495334e-07,0.520039,0.520039,Benign
2,0.0,0.0,0.0,0.633603,0.938689,0.000245,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.520039,1.428787e-06,0.520039,0.520039,Benign
3,0.0,0.0,0.0,0.637515,0.938689,0.000245,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.520039,4.396267e-07,0.520039,0.520039,Benign
4,0.0,0.0,0.0,0.641427,0.938689,0.000245,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.520039,5.055707e-06,0.520039,0.520039,Benign


Dataframe: 02-14-2018



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0,0.0,0.626427,0.938681,0.007576,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.469377,2e-06,0.469378,0.469376,Benign
1,0.0,0.0,0.630339,0.938679,0.007576,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.469376,2e-06,0.469376,0.469375,Benign
2,0.0,0.0,0.634251,0.938655,0.007576,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.469364,4e-06,0.469366,0.469362,Benign
3,0.000336,0.352941,0.639205,0.053783,0.05303,0.021739,0.044745,0.004184,0.509589,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,0.000336,0.352941,0.639436,0.073367,0.049242,0.023913,0.041278,0.004066,0.509589,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


Dataframe: 02-23-2018



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000336,0.352941,0.609019,0.012772,0.013333,0.007388,0.076089,0.000992,0.329268,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,0.000336,0.352941,0.61101,0.014545,0.013333,0.007388,0.076089,0.000992,0.329268,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
6,0.0,0.0,0.608926,0.93868,0.002667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.469372,1e-05,0.469373,0.469372,Benign
7,0.000336,0.352941,0.613,0.013053,0.013333,0.007388,0.076089,0.000992,0.329268,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
12,0.000336,0.352941,0.614991,0.013425,0.016,0.007388,0.076089,0.000992,0.329268,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


## 8. Feature Correlation Filtering

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key


    columns = [col for col in df.columns if col != 'Label']

    corr_matrix = df[columns].corr().abs()

    threshold = 0.99
    # Find features with high correlation
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

    print(f'\nDataset: {key}')
    # Print features to drop
    print(f"The following {len(to_drop)} features will be dropped due to high correlation: {to_drop}")

    df = df.drop(to_drop, axis = 1)



Dataset: 02-28-2018
The following 16 features will be dropped due to high correlation: ['Fwd IAT Tot', 'Bwd Header Len', 'Pkt Len Min', 'Pkt Len Max', 'SYN Flag Cnt', 'ECE Flag Cnt', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts', 'Idle Mean', 'Idle Max', 'Idle Min']

Dataset: 03-01-2018
The following 16 features will be dropped due to high correlation: ['Fwd IAT Tot', 'Fwd IAT Max', 'Fwd IAT Min', 'Pkt Len Min', 'SYN Flag Cnt', 'ECE Flag Cnt', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts', 'Idle Mean', 'Idle Max', 'Idle Min']

Dataset: 02-16-2018
The following 36 features will be dropped due to high correlation: ['Fwd Pkt Len Max', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Std', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Bwd IAT Std'

Drop Na values

In [None]:
for key in dfs.keys():
    df = dfs[key]  # Get the dataframe corresponding to the key
    print(f"Dataframe: '{key}'\n")
    # replace +ve and -ve infinity with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Drop all NaN values
    df.dropna(inplace=True)

Dataframe: '02-28-2018'

Dataframe: '03-01-2018'

Dataframe: '02-16-2018'

Dataframe: '02-15-2018'

Dataframe: '02-21-2018'

Dataframe: '03-02-2018'

Dataframe: '02-22-2018'

Dataframe: '02-20-2018'

Dataframe: '02-14-2018'

Dataframe: '02-23-2018'



## 9. Equalization

In [None]:
df = dfs['02-14-2018']

df = df.sample(frac=1) #Randomize rows's sequence


#df2 = df[df["Label"] == "FTP-BruteForce"][:193360]
#df3 = df[df["Label"] == "SSH-Bruteforce"][:187589]
df2 = df[df["Label"] == "FTP-BruteForce"]
df3 = df[df["Label"] == "SSH-Bruteforce"]
df0 = df[df["Label"] == "Benign"][:df2.shape[0]]
df1 = df[df["Label"] == "Benign"][:df3.shape[0]]


df_equal_FTP_BruteForce = pd.concat([ df0,df2], axis =0)
df_equal_SSH_BruteForce = pd.concat([ df1,df3], axis =0)

In [None]:
df = dfs['02-15-2018']

df = df.sample(frac=1) #Randomize rows's sequence


df2 = df[df["Label"] == "DoS attacks-GoldenEye"]
df3 = df[df["Label"] == "DoS attacks-Slowloris"]
df0 = df[df["Label"] == "Benign"][:df2.shape[0]]
df1 = df[df["Label"] == "Benign"][:df3.shape[0]*100]


df_equal_DoS_GoldenEye = pd.concat([ df0,df2], axis =0)
df_equal_DoS_Slowloris = pd.concat([ df1,df3], axis =0)

In [None]:
df = dfs['02-16-2018']

df = df.sample(frac=1) #Randomize rows's sequence



df3 = df[df["Label"] == "DoS attacks-SlowHTTPTest"]
df0 = df[df["Label"] == "Benign"]
df2 = df[df["Label"] == "DoS attacks-Hulk"][:df0.shape[0]]
df1 = df[df["Label"] == "Benign"][:df3.shape[0]]


df_equal_DoS_Hulk = pd.concat([ df0,df2], axis =0)
df_equal_DoS_SlowHTTPTest = pd.concat([ df1,df3], axis =0)

In [None]:
df = dfs['02-21-2018']

df = df.sample(frac=1) #Randomize rows's sequence



df3 = df[df["Label"] == "DDOS attack-LOIC-UDP"]
df0 = df[df["Label"] == "Benign"]
df2 = df[df["Label"] == "DDOS attack-HOIC"][:df0.shape[0]]
df1 = df[df["Label"] == "Benign"][:df3.shape[0]*100]


df_equal_DDoS_HOIC = pd.concat([ df0,df2], axis =0)
df_equal_DDoS_LOIC_UDP = pd.concat([ df1,df3], axis =0)

In [None]:
df_02_28 = dfs['02-28-2018']
df_03_01 = dfs['03-01-2018']

df_inf = pd.concat([ df_02_28,df_03_01], axis =0)

df_inf = df_inf.sample(frac=1) #Randomize rows's sequence


df2 = df_inf[df_inf["Label"] == "Infilteration"]
df1 = df_inf[df_inf["Label"] == "Benign"][:df2.shape[0]]


df_equal_Infilteration = pd.concat([ df1,df2], axis =0)

In [None]:
df = dfs['03-02-2018']

df = df.sample(frac=1) #Randomize rows's sequence


df2 = df[df["Label"] == "Bot"]
df1 = df[df["Label"] == "Benign"][:df2.shape[0]]


df_equal_Bot = pd.concat([ df1,df2], axis =0)


In [None]:
df = dfs['02-20-2018']

df = df.sample(frac=1) #Randomize rows's sequence

df2 = df[df["Label"] == "DDoS attacks-LOIC-HTTP"]
df1 = df[df["Label"] == "Benign"][:df2.shape[0]]

df_equal_DDoS_LOIC_HTTP = pd.concat([ df1,df2], axis =0)


Concat other dataframes that wont be equalized for being to unbalanced

In [None]:
df1 = dfs['02-23-2018']
df2 = dfs['02-22-2018']
df_BruteForce_Web_XSS = pd.concat([df1,df2], axis = 0)
df_BruteForce_Web_XSS["Label"] = df_BruteForce_Web_XSS.Label.map(lambda a:"Benign" if a == 'Benign' else "Attack")

Add all equalized dataframes in a dictionary

In [None]:
dfs_final = {}

# for name, df in zip(['df_equal_Bot','df_equal_DDoS_HOIC','df_equal_DDoS_LOIC_HTTP',
#                          'df_equal_DDoS_LOIC_UDP','df_equal_DoS_GoldenEye','df_equal_DoS_Hulk',
#                          'df_equal_DoS_SlowHTTPTest','df_equal_DoS_Slowloris','df_equal_FTP_BruteForce',
#                          'df_equal_Infilteration','df_equal_SSH_BruteForce','df_BruteForce_Web_XSS'],
#                          [df_equal_Bot,df_equal_DDoS_HOIC,df_equal_DDoS_LOIC_HTTP,
#                          df_equal_DDoS_LOIC_UDP,df_equal_DoS_GoldenEye,df_equal_DoS_Hulk,
#                          df_equal_DoS_SlowHTTPTest,df_equal_DoS_Slowloris,df_equal_FTP_BruteForce,
#                          df_equal_Infilteration,df_equal_SSH_BruteForce,df_BruteForce_Web_XSS]):
for name, df in zip(['df_equal_Bot','df_equal_DDoS_HOIC',
                         'df_equal_DDoS_LOIC_UDP','df_equal_DoS_GoldenEye','df_equal_DoS_Hulk',
                         'df_equal_DoS_SlowHTTPTest','df_equal_DoS_Slowloris','df_equal_FTP_BruteForce',
                         'df_equal_Infilteration','df_equal_SSH_BruteForce','df_BruteForce_Web_XSS'],
                         [df_equal_Bot,df_equal_DDoS_HOIC,
                         df_equal_DDoS_LOIC_UDP,df_equal_DoS_GoldenEye,df_equal_DoS_Hulk,
                         df_equal_DoS_SlowHTTPTest,df_equal_DoS_Slowloris,df_equal_FTP_BruteForce,
                         df_equal_Infilteration,df_equal_SSH_BruteForce,df_BruteForce_Web_XSS]):
    dfs_final[name] = df

In [None]:
for key in dfs_final.keys():
    df = dfs_final[key]  # Get the dataframe corresponding to the key
    print(f"Dataframe: '{key}'\n")
    print(f"{df['Label'].value_counts()}\n")

Dataframe: 'df_equal_Bot'

Label
Benign    285763
Bot       285763
Name: count, dtype: int64

Dataframe: 'df_equal_DDoS_HOIC'

Label
Benign              360833
DDOS attack-HOIC    360833
Name: count, dtype: int64

Dataframe: 'df_equal_DDoS_LOIC_UDP'

Label
Benign                  173000
DDOS attack-LOIC-UDP      1730
Name: count, dtype: int64

Dataframe: 'df_equal_DoS_GoldenEye'

Label
Benign                   39924
DoS attacks-GoldenEye    39924
Name: count, dtype: int64

Dataframe: 'df_equal_DoS_Hulk'

Label
Benign              439126
DoS attacks-Hulk    439126
Name: count, dtype: int64

Dataframe: 'df_equal_DoS_SlowHTTPTest'

Label
Benign                      139890
DoS attacks-SlowHTTPTest    139890
Name: count, dtype: int64

Dataframe: 'df_equal_DoS_Slowloris'

Label
Benign                   272400
DoS attacks-Slowloris      2724
Name: count, dtype: int64

Dataframe: 'df_equal_FTP_BruteForce'

Label
Benign            193354
FTP-BruteForce    193354
Name: count, dtype: int64

Dataf

## 10. Save datasets

Final visualizations:

In [None]:
for key in dfs_final.keys():
    df = dfs_final[key]  # Get the dataframe corresponding to the key
    print(f"Dataframe: '{key}'\n")
    display(df.describe())
    display(df.head())
    display(df.info())

Dataframe: 'df_equal_Bot'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0,...,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0,571526.0
mean,0.126559,0.439652,0.516769,0.056957,0.005784,0.004048,0.019128,0.000758,0.133718,0.041016,...,0.014882,0.413794,0.001879,0.001283,0.002344,0.001412,0.023511,0.001698,0.023601,0.023266
std,0.19182,0.226567,0.348811,0.195393,0.011859,0.008927,0.032657,0.00567,0.164995,0.105003,...,0.027378,0.101823,0.024456,0.015144,0.022803,0.023309,0.115563,0.022242,0.115668,0.115423
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.00676,0.352941,0.181254,4e-06,0.001919,0.0,0.0,0.0,0.0,0.0,...,0.0,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.123295,0.352941,0.657816,8.6e-05,0.003839,0.00123,0.003868,0.000131,0.031507,0.0,...,0.010204,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.123295,0.352941,0.857937,0.001766,0.003839,0.00492,0.021019,0.000163,0.223288,0.0,...,0.010204,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,0.996161,0.771218,0.993681,0.781162,1.0,1.0,...,1.0,0.909091,1.0,0.99587,1.0,1.0,1.0,0.990291,1.0,1.0


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
390810,0.423826,0.352941,0.248038,2.833334e-07,0.001919,0.0,0.0,0.0,0.0,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
847992,0.000809,1.0,0.053219,0.002209909,0.001919,0.00246,0.003997,0.000279,0.021233,0.226277,...,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1038049,0.000809,1.0,0.005208,1.527501e-05,0.0,0.00123,0.004513,0.000201,0.047945,0.510949,...,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
370882,0.051714,0.352941,0.719415,0.02052084,0.013436,0.00861,0.073759,0.001609,0.463699,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
858575,0.051714,0.352941,0.038635,0.03081432,0.013436,0.00861,0.073759,0.001609,0.463699,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 571526 entries, 390810 to 347458
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           571526 non-null  float64
 1   Protocol           571526 non-null  float64
 2   Timestamp          571526 non-null  float64
 3   Flow Duration      571526 non-null  float64
 4   Tot Fwd Pkts       571526 non-null  float64
 5   Tot Bwd Pkts       571526 non-null  float64
 6   TotLen Fwd Pkts    571526 non-null  float64
 7   TotLen Bwd Pkts    571526 non-null  float64
 8   Fwd Pkt Len Max    571526 non-null  float64
 9   Fwd Pkt Len Min    571526 non-null  float64
 10  Fwd Pkt Len Mean   571526 non-null  float64
 11  Fwd Pkt Len Std    571526 non-null  float64
 12  Bwd Pkt Len Max    571526 non-null  float64
 13  Bwd Pkt Len Min    571526 non-null  float64
 14  Bwd Pkt Len Mean   571526 non-null  float64
 15  Bwd Pkt Len Std    571526 non-null  float64
 16  Fl

None

Dataframe: 'df_equal_DDoS_HOIC'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0,...,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0,721666.0
mean,0.433736,0.354634,0.054161,0.002477,9e-06,0.011882,5.1e-05,0.028602,0.225091,0.000301,...,2e-06,0.453843,0.000546,0.000232,0.000521,0.000396,0.00043,0.000463,0.000784,0.000224
std,0.438019,0.033499,0.04975,0.046454,5e-06,0.010743,4.5e-05,0.031416,0.198388,0.010972,...,2e-06,0.015263,0.012252,0.007696,0.012159,0.009453,0.009572,0.010398,0.016155,0.006702
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.001221,0.352941,0.042616,1.1e-05,3e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.001221,0.352941,0.051967,2.8e-05,6e-06,0.016129,3.7e-05,0.029931,0.153777,0.0,...,3e-06,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.86642,0.352941,0.061349,0.00012,1.3e-05,0.016129,9.4e-05,0.034645,0.420414,0.0,...,3e-06,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,0.000475,1.0,0.001361,1.0,1.0,1.0,...,0.000472,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
480282,0.955596,0.352941,0.048112,7.9e-05,1.3e-05,0.016129,9.4e-05,0.033768,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
660816,0.797021,0.352941,0.055252,1e-05,1.3e-05,0.016129,9.4e-05,0.033878,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
773378,0.88287,0.352941,0.059706,1.1e-05,1.3e-05,0.016129,9.4e-05,0.037167,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
396839,0.847652,0.352941,0.044795,6.6e-05,1.3e-05,0.016129,9.4e-05,0.030041,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
874893,0.935362,0.352941,0.063813,0.000123,1.3e-05,0.016129,9.4e-05,0.035413,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 721666 entries, 480282 to 251611
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           721666 non-null  float64
 1   Protocol           721666 non-null  float64
 2   Timestamp          721666 non-null  float64
 3   Flow Duration      721666 non-null  float64
 4   Tot Fwd Pkts       721666 non-null  float64
 5   Tot Bwd Pkts       721666 non-null  float64
 6   TotLen Fwd Pkts    721666 non-null  float64
 7   TotLen Bwd Pkts    721666 non-null  float64
 8   Fwd Pkt Len Max    721666 non-null  float64
 9   Fwd Pkt Len Min    721666 non-null  float64
 10  Fwd Pkt Len Mean   721666 non-null  float64
 11  Fwd Pkt Len Std    721666 non-null  float64
 12  Bwd Pkt Len Max    721666 non-null  float64
 13  Bwd Pkt Len Min    721666 non-null  float64
 14  Bwd Pkt Len Mean   721666 non-null  float64
 15  Bwd Pkt Len Std    721666 non-null  float64
 16  Fl

None

Dataframe: 'df_equal_DDoS_LOIC_UDP'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0,...,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0,174730.0
mean,0.857723,0.362621,0.07057,0.01426747,0.003765,0.015894,0.003846,0.032386,0.414103,0.000918,...,0.003756,0.45049,0.001028,0.000429,0.000974,0.000749,0.000833,0.0009,0.001524,0.000432
std,0.129194,0.07888,0.112196,0.1157993,0.038608,0.002865,0.038601,0.006175,0.049881,0.015923,...,0.038609,0.034141,0.016485,0.010178,0.016271,0.012635,0.013287,0.014475,0.02254,0.009309
min,0.0,0.0,0.0,1.666667e-08,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.810389,0.352941,0.049376,1.0325e-05,1.3e-05,0.016129,9.4e-05,0.029931,0.420414,0.0,...,3e-06,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.865154,0.352941,0.056595,1.226667e-05,1.3e-05,0.016129,9.4e-05,0.033878,0.420414,0.0,...,3e-06,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.929945,0.352941,0.063908,9.86e-05,1.3e-05,0.016129,9.4e-05,0.034645,0.420414,0.0,...,3e-06,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,0.999842,1.0,1.0,0.806452,1.0,0.965793,0.52518,1.0,...,1.0,0.909091,0.825,1.0,0.928571,0.65,0.95283,0.918489,0.95283,0.95283


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
480282,0.955596,0.352941,0.048112,7.9e-05,1.3e-05,0.016129,9.4e-05,0.033768,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
660816,0.797021,0.352941,0.055252,1e-05,1.3e-05,0.016129,9.4e-05,0.033878,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
773378,0.88287,0.352941,0.059706,1.1e-05,1.3e-05,0.016129,9.4e-05,0.037167,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
396839,0.847652,0.352941,0.044795,6.6e-05,1.3e-05,0.016129,9.4e-05,0.030041,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
874893,0.935362,0.352941,0.063813,0.000123,1.3e-05,0.016129,9.4e-05,0.035413,0.420414,0.0,...,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 174730 entries, 480282 to 382
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           174730 non-null  float64
 1   Protocol           174730 non-null  float64
 2   Timestamp          174730 non-null  float64
 3   Flow Duration      174730 non-null  float64
 4   Tot Fwd Pkts       174730 non-null  float64
 5   Tot Bwd Pkts       174730 non-null  float64
 6   TotLen Fwd Pkts    174730 non-null  float64
 7   TotLen Bwd Pkts    174730 non-null  float64
 8   Fwd Pkt Len Max    174730 non-null  float64
 9   Fwd Pkt Len Min    174730 non-null  float64
 10  Fwd Pkt Len Mean   174730 non-null  float64
 11  Fwd Pkt Len Std    174730 non-null  float64
 12  Bwd Pkt Len Max    174730 non-null  float64
 13  Bwd Pkt Len Min    174730 non-null  float64
 14  Bwd Pkt Len Mean   174730 non-null  float64
 15  Bwd Pkt Len Std    174730 non-null  float64
 16  Flow 

None

Dataframe: 'df_equal_DoS_GoldenEye'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0,...,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0,79848.0
mean,0.052729,0.466403,0.642422,0.091528,0.012654,0.007097,0.009031,0.001852,0.106903,0.042147,...,0.031111,0.425153,0.006118,0.00618,0.007706,0.004956,0.045844,0.002236,0.046133,0.045369
std,0.18981,0.252423,0.243512,0.201443,0.021463,0.017511,0.018355,0.014444,0.118951,0.092744,...,0.061725,0.169301,0.052387,0.050869,0.054078,0.049366,0.111148,0.028001,0.111687,0.110704
min,0.0,0.0,2.3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.001221,0.352941,0.679593,0.000198,0.004184,0.0,0.0,0.0,0.0,0.0,...,0.0,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.001221,0.352941,0.722679,0.017839,0.008368,0.004367,0.002536,0.000344,0.026931,0.0,...,0.02439,0.571429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.001221,0.352941,0.743814,0.074633,0.012552,0.008734,0.012559,0.001591,0.203252,0.0,...,0.02439,0.571429,0.0,0.0,0.0,0.0,0.065764,0.0,0.065764,0.065063
max,0.999802,1.0,1.0,1.0,0.853556,0.89738,0.788521,0.853846,0.74187,0.794286,...,0.804878,1.0,0.940379,0.993974,0.91176,0.997226,0.763445,0.997888,0.763445,0.763445


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
236775,0.051714,0.352941,0.77437,0.02059401,0.029289,0.015284,0.034247,0.002588,0.344004,0.0,...,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
67116,0.051714,0.352941,0.643834,0.02510855,0.029289,0.015284,0.034128,0.002588,0.344004,0.0,...,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
76232,0.778115,0.352941,0.950022,8.333333e-09,0.008368,0.0,0.000925,0.0,0.015752,0.0,...,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
120979,0.00676,0.352941,0.01088,0.5041684,0.07113,0.041485,0.111244,0.012973,0.621951,0.0,...,0.357143,0.010053,0.049027,0.038852,0.001873,0.098376,0.00027,0.098467,0.098352,Benign
265464,0.00676,0.352941,0.215561,0.9644264,0.083682,0.048035,0.011008,0.013968,0.102134,0.0,...,0.357143,0.007808,0.024242,0.028514,0.003622,0.09856,0.002245,0.09955,0.09804,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 79848 entries, 236775 to 6394
Data columns (total 80 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dst Port           79848 non-null  float64
 1   Protocol           79848 non-null  float64
 2   Timestamp          79848 non-null  float64
 3   Flow Duration      79848 non-null  float64
 4   Tot Fwd Pkts       79848 non-null  float64
 5   Tot Bwd Pkts       79848 non-null  float64
 6   TotLen Fwd Pkts    79848 non-null  float64
 7   TotLen Bwd Pkts    79848 non-null  float64
 8   Fwd Pkt Len Max    79848 non-null  float64
 9   Fwd Pkt Len Min    79848 non-null  float64
 10  Fwd Pkt Len Mean   79848 non-null  float64
 11  Fwd Pkt Len Std    79848 non-null  float64
 12  Bwd Pkt Len Max    79848 non-null  float64
 13  Bwd Pkt Len Min    79848 non-null  float64
 14  Bwd Pkt Len Mean   79848 non-null  float64
 15  Bwd Pkt Len Std    79848 non-null  float64
 16  Flow Byts/s        7984

None

Dataframe: 'df_equal_DoS_Hulk'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0,...,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0,878252.0
mean,0.385176,0.0,0.046558,0.1760404,0.321099,0.175216,0.284759,0.086769,0.503736,0.0,...,0.129989,0.599332,3.2e-05,0.0,3.2e-05,3.2e-05,0.053776,0.0,0.053776,0.053776
std,0.395478,0.0,0.00509,0.1753944,0.179027,0.175363,0.279361,0.09989,0.494597,0.0,...,0.125939,0.020011,0.003591,0.0,0.003591,0.003591,0.22039,0.0,0.22039,0.22039
min,1.6e-05,0.0,0.0,7.838306e-08,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000968,0.0,0.045322,0.0009817478,0.125,0.0,0.0,0.0,0.0,0.0,...,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.000968,0.0,0.046539,0.2447624,0.375,0.3,0.225242,0.11899,0.394652,0.0,...,0.25,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.764797,0.0,0.047731,0.3372167,0.5,0.3,0.564614,0.141827,1.0,0.0,...,0.25,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.6,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
334082,0.892746,0.0,0.045346,0.334324,0.5,0.3,0.564614,0.135417,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
861664,0.700018,0.0,0.048145,0.332543,0.5,0.3,0.564614,0.125401,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
692163,0.927218,0.0,0.047245,0.334991,0.5,0.3,0.564614,0.136218,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
437819,0.580465,0.0,0.045906,0.333088,0.375,0.5,0.564614,0.116186,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
207051,0.791495,0.0,0.044665,0.40985,0.5,0.3,0.564614,0.14984,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.934558,0.0,0.934558,0.934558,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 878252 entries, 334082 to 872109
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           878252 non-null  float64
 1   Protocol           878252 non-null  float64
 2   Timestamp          878252 non-null  float64
 3   Flow Duration      878252 non-null  float64
 4   Tot Fwd Pkts       878252 non-null  float64
 5   Tot Bwd Pkts       878252 non-null  float64
 6   TotLen Fwd Pkts    878252 non-null  float64
 7   TotLen Bwd Pkts    878252 non-null  float64
 8   Fwd Pkt Len Max    878252 non-null  float64
 9   Fwd Pkt Len Min    878252 non-null  float64
 10  Fwd Pkt Len Mean   878252 non-null  float64
 11  Fwd Pkt Len Std    878252 non-null  float64
 12  Bwd Pkt Len Max    878252 non-null  float64
 13  Bwd Pkt Len Min    878252 non-null  float64
 14  Bwd Pkt Len Mean   878252 non-null  float64
 15  Bwd Pkt Len Std    878252 non-null  float64
 16  Fl

None

Dataframe: 'df_equal_DoS_SlowHTTPTest'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0,...,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0,279780.0
mean,0.384658,0.0,0.429474,0.1719125,0.245515,0.215016,0.279568,0.078234,0.49515,0.0,...,0.123793,0.799299,2.7e-05,0.0,2.7e-05,2.7e-05,0.053617,0.0,0.053617,0.053617
std,0.395872,0.0,0.383141,0.1763478,0.248102,0.125125,0.282293,0.090611,0.499976,0.0,...,0.125002,0.201745,0.003395,0.0,0.003395,0.003395,0.22007,0.0,0.22007,0.22007
min,0.0,0.0,0.013363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.046539,1.567661e-07,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8e-06,0.0,0.78432,3.919153e-06,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.764838,0.0,0.80995,0.3368767,0.5,0.3,0.564614,0.140625,1.0,0.0,...,0.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,0.998832,1.0,1.0,1.0,0.564614,0.990385,1.0,0.0,...,0.75,1.0,0.983331,0.0,0.983331,0.983331,0.998928,0.0,0.998928,0.998928


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
334082,0.892746,0.0,0.045346,0.334324,0.5,0.3,0.564614,0.135417,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
861664,0.700018,0.0,0.048145,0.332543,0.5,0.3,0.564614,0.125401,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
692163,0.927218,0.0,0.047245,0.334991,0.5,0.3,0.564614,0.136218,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
437819,0.580465,0.0,0.045906,0.333088,0.375,0.5,0.564614,0.116186,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
207051,0.791495,0.0,0.044665,0.40985,0.5,0.3,0.564614,0.14984,1.0,0.0,...,0.6,0.0,0.0,0.0,0.0,0.934558,0.0,0.934558,0.934558,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 279780 entries, 334082 to 1031063
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           279780 non-null  float64
 1   Protocol           279780 non-null  float64
 2   Timestamp          279780 non-null  float64
 3   Flow Duration      279780 non-null  float64
 4   Tot Fwd Pkts       279780 non-null  float64
 5   Tot Bwd Pkts       279780 non-null  float64
 6   TotLen Fwd Pkts    279780 non-null  float64
 7   TotLen Bwd Pkts    279780 non-null  float64
 8   Fwd Pkt Len Max    279780 non-null  float64
 9   Fwd Pkt Len Min    279780 non-null  float64
 10  Fwd Pkt Len Mean   279780 non-null  float64
 11  Fwd Pkt Len Std    279780 non-null  float64
 12  Bwd Pkt Len Max    279780 non-null  float64
 13  Bwd Pkt Len Min    279780 non-null  float64
 14  Bwd Pkt Len Mean   279780 non-null  float64
 15  Bwd Pkt Len Std    279780 non-null  float64
 16  F

None

Dataframe: 'df_equal_DoS_Slowloris'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0,...,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0,275124.0
mean,0.102586,0.577906,0.557866,0.108599,0.014188,0.0089,0.008881,0.002742,0.073734,0.083501,...,0.044055,0.282657,0.007526,0.010719,0.011508,0.005442,0.032216,0.003412,0.032618,0.031435
std,0.256483,0.318297,0.322728,0.269972,0.02922,0.024128,0.023768,0.020003,0.120258,0.116471,...,0.082503,0.126685,0.049425,0.06328,0.059786,0.047873,0.116022,0.034004,0.116905,0.115158
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000809,0.352941,0.225306,4e-06,0.0,0.002183,0.0,0.0,0.0,0.0,...,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.001221,0.352941,0.680965,0.000218,0.004184,0.002183,0.001313,0.000164,0.020833,0.0,...,0.0,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.00679,1.0,0.835274,0.020362,0.012552,0.008734,0.004803,0.000799,0.075203,0.205714,...,0.02439,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.999985,1.0,1.0,1.0,0.987448,1.0,0.858507,0.937471,0.74187,0.794286,...,1.0,1.0,0.945501,0.996472,1.0,0.997226,0.982567,0.99938,0.982567,0.982567


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
236775,0.051714,0.352941,0.77437,0.02059401,0.029289,0.015284,0.034247,0.002588,0.344004,0.0,...,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
67116,0.051714,0.352941,0.643834,0.02510855,0.029289,0.015284,0.034128,0.002588,0.344004,0.0,...,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
76232,0.778115,0.352941,0.950022,8.333333e-09,0.008368,0.0,0.000925,0.0,0.015752,0.0,...,0.357143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
120979,0.00676,0.352941,0.01088,0.5041684,0.07113,0.041485,0.111244,0.012973,0.621951,0.0,...,0.357143,0.010053,0.049027,0.038852,0.001873,0.098376,0.00027,0.098467,0.098352,Benign
265464,0.00676,0.352941,0.215561,0.9644264,0.083682,0.048035,0.011008,0.013968,0.102134,0.0,...,0.357143,0.007808,0.024242,0.028514,0.003622,0.09856,0.002245,0.09955,0.09804,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 275124 entries, 236775 to 47405
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           275124 non-null  float64
 1   Protocol           275124 non-null  float64
 2   Timestamp          275124 non-null  float64
 3   Flow Duration      275124 non-null  float64
 4   Tot Fwd Pkts       275124 non-null  float64
 5   Tot Bwd Pkts       275124 non-null  float64
 6   TotLen Fwd Pkts    275124 non-null  float64
 7   TotLen Bwd Pkts    275124 non-null  float64
 8   Fwd Pkt Len Max    275124 non-null  float64
 9   Fwd Pkt Len Min    275124 non-null  float64
 10  Fwd Pkt Len Mean   275124 non-null  float64
 11  Fwd Pkt Len Std    275124 non-null  float64
 12  Bwd Pkt Len Max    275124 non-null  float64
 13  Bwd Pkt Len Min    275124 non-null  float64
 14  Bwd Pkt Len Mean   275124 non-null  float64
 15  Bwd Pkt Len Std    275124 non-null  float64
 16  Flo

None

Dataframe: 'df_equal_FTP_BruteForce'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0,...,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0,386708.0
mean,0.056673,0.455419,0.738437,0.04249734,0.006136,0.004913,0.005811,0.000871,0.060418,0.04874,...,0.023388,0.581167,0.002821,0.003601,0.004849,0.002317,0.007616,0.0002,0.007684,0.007398
std,0.194608,0.242841,0.255259,0.1678103,0.014811,0.009705,0.018945,0.005546,0.14164,0.113778,...,0.060694,0.267777,0.033461,0.038927,0.043399,0.03318,0.051088,0.00456,0.051308,0.050789
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.00032,0.352941,0.726938,8.333334e-09,0.0,0.002174,0.0,0.0,0.0,0.0,...,0.0,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.00032,0.352941,0.83414,1.666667e-07,0.0,0.002174,0.0,0.0,0.0,0.0,...,0.0,0.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.001221,0.352941,0.887382,0.0001946521,0.003788,0.002174,0.001661,0.000204,0.028767,0.0,...,0.0,0.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.999985,1.0,1.0,0.999994,1.0,0.58913,1.0,0.57968,1.0,1.0,...,1.0,0.833333,1.0,0.999617,0.997257,0.998352,0.999861,1.0,0.999861,0.999861


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
934039,0.953336,0.352941,0.72916,1e-06,0.003788,0.0,0.0,0.0,0.0,0.0,...,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
748956,0.051714,0.352941,0.016274,0.026025,0.034091,0.023913,0.046876,0.003989,0.45274,0.0,...,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
471019,0.000809,1.0,0.162874,0.000473,0.003788,0.004348,0.002167,0.000438,0.020548,0.218978,...,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
393759,0.001221,0.352941,0.738281,0.000215,0.007576,0.004348,0.028855,0.001,0.54726,0.0,...,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
670704,0.00676,0.352941,0.80481,0.506987,0.049242,0.028261,0.024558,0.007956,0.234247,0.0,...,0.416667,0.028965,0.051137,0.053409,0.021541,0.083405,7e-06,0.083407,0.083397,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 386708 entries, 934039 to 44417
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           386708 non-null  float64
 1   Protocol           386708 non-null  float64
 2   Timestamp          386708 non-null  float64
 3   Flow Duration      386708 non-null  float64
 4   Tot Fwd Pkts       386708 non-null  float64
 5   Tot Bwd Pkts       386708 non-null  float64
 6   TotLen Fwd Pkts    386708 non-null  float64
 7   TotLen Bwd Pkts    386708 non-null  float64
 8   Fwd Pkt Len Max    386708 non-null  float64
 9   Fwd Pkt Len Min    386708 non-null  float64
 10  Fwd Pkt Len Mean   386708 non-null  float64
 11  Fwd Pkt Len Std    386708 non-null  float64
 12  Bwd Pkt Len Max    386708 non-null  float64
 13  Bwd Pkt Len Min    386708 non-null  float64
 14  Bwd Pkt Len Mean   386708 non-null  float64
 15  Bwd Pkt Len Std    386708 non-null  float64
 16  Flo

None

Dataframe: 'df_equal_Infilteration'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0,...,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0,305722.0
mean,0.113019,0.577284,0.49815,0.090431,0.004461,0.008698,0.004537,0.002294,0.080014,0.066683,...,0.003657,0.358589,0.004965,0.002509,0.005381,0.004675,0.039266,0.007775,0.040346,0.037817
std,0.258043,0.313579,0.335958,0.257965,0.010819,0.019961,0.012619,0.013067,0.134118,0.09582,...,0.008939,0.151648,0.030209,0.020588,0.029272,0.032814,0.134684,0.056783,0.136657,0.133161
min,0.0,0.0,4.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000809,0.352941,0.139587,2e-06,0.0,0.002146,0.0,0.0,0.0,0.0,...,0.0,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.00676,0.352941,0.637793,1.3e-05,0.000902,0.002222,0.000617,0.000134,0.020325,0.0,...,0.0,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.051713,1.0,0.82113,0.011623,0.004509,0.008889,0.003714,0.000547,0.071138,0.154412,...,0.003571,0.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,0.999977,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.996587,1.0,0.995261,1.0,0.995261,0.995261


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
107488,0.000809,1.0,0.804741,0.0002175584,0.000902,0.004444,0.001263,0.000428,0.018293,0.189474,...,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
361456,0.259289,0.352941,0.135883,1.666667e-08,0.0,0.002222,0.0,0.0,0.0,0.0,...,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
155548,0.00676,0.352941,0.723095,0.977305,0.010821,0.026667,0.010418,0.007466,0.098577,0.0,...,0.454545,0.00491,0.005161,0.005162,0.002967,0.6913,0.044408,0.696655,0.685945,Benign
51908,0.000809,1.0,0.894928,1.581667e-05,0.0,0.002222,0.000614,0.000135,0.017785,0.184211,...,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
77971,0.00679,0.352941,0.757355,0.005580142,0.001931,0.004292,0.0,0.0,0.0,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 305722 entries, 107488 to 277147
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           305722 non-null  float64
 1   Protocol           305722 non-null  float64
 2   Timestamp          305722 non-null  float64
 3   Flow Duration      305722 non-null  float64
 4   Tot Fwd Pkts       305722 non-null  float64
 5   Tot Bwd Pkts       305722 non-null  float64
 6   TotLen Fwd Pkts    305722 non-null  float64
 7   TotLen Bwd Pkts    305722 non-null  float64
 8   Fwd Pkt Len Max    305722 non-null  float64
 9   Fwd Pkt Len Min    305722 non-null  float64
 10  Fwd Pkt Len Mean   305722 non-null  float64
 11  Fwd Pkt Len Std    305722 non-null  float64
 12  Bwd Pkt Len Max    305722 non-null  float64
 13  Bwd Pkt Len Min    305722 non-null  float64
 14  Bwd Pkt Len Mean   305722 non-null  float64
 15  Bwd Pkt Len Std    305722 non-null  float64
 16  Fl

None

Dataframe: 'df_equal_SSH_BruteForce'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0,...,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0,375178.0
mean,0.056736,0.4555,0.380976,0.04330484,0.026445,0.015952,0.023276,0.002101,0.169954,0.048781,...,0.123394,0.497821,0.002797,0.003608,0.004835,0.002291,0.007615,0.0002,0.007683,0.007397
std,0.194716,0.24291,0.321252,0.1677383,0.034809,0.020023,0.032783,0.005803,0.206969,0.113813,...,0.170327,0.191412,0.033182,0.039038,0.043327,0.032889,0.051083,0.004585,0.051304,0.050785
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000336,0.352941,0.130281,1.083333e-07,0.0,0.002174,0.0,0.0,0.0,0.0,...,0.0,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.000336,0.352941,0.188106,0.0001933334,0.003788,0.002174,0.001661,0.000204,0.028767,0.0,...,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.001221,0.352941,0.727048,0.003212773,0.075758,0.043478,0.06905,0.004906,0.438356,0.0,...,0.4,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.999985,1.0,1.0,0.999994,1.0,0.58913,1.0,0.57968,1.0,1.0,...,1.0,0.833333,1.0,0.999617,0.997257,0.998352,0.999861,1.0,0.999861,0.999861


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
934039,0.953336,0.352941,0.72916,1e-06,0.003788,0.0,0.0,0.0,0.0,0.0,...,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
748956,0.051714,0.352941,0.016274,0.026025,0.034091,0.023913,0.046876,0.003989,0.45274,0.0,...,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
471019,0.000809,1.0,0.162874,0.000473,0.003788,0.004348,0.002167,0.000438,0.020548,0.218978,...,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
393759,0.001221,0.352941,0.738281,0.000215,0.007576,0.004348,0.028855,0.001,0.54726,0.0,...,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
670704,0.00676,0.352941,0.80481,0.506987,0.049242,0.028261,0.024558,0.007956,0.234247,0.0,...,0.416667,0.028965,0.051137,0.053409,0.021541,0.083405,7e-06,0.083407,0.083397,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 375178 entries, 934039 to 258195
Data columns (total 80 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Dst Port           375178 non-null  float64
 1   Protocol           375178 non-null  float64
 2   Timestamp          375178 non-null  float64
 3   Flow Duration      375178 non-null  float64
 4   Tot Fwd Pkts       375178 non-null  float64
 5   Tot Bwd Pkts       375178 non-null  float64
 6   TotLen Fwd Pkts    375178 non-null  float64
 7   TotLen Bwd Pkts    375178 non-null  float64
 8   Fwd Pkt Len Max    375178 non-null  float64
 9   Fwd Pkt Len Min    375178 non-null  float64
 10  Fwd Pkt Len Mean   375178 non-null  float64
 11  Fwd Pkt Len Std    375178 non-null  float64
 12  Bwd Pkt Len Max    375178 non-null  float64
 13  Bwd Pkt Len Min    375178 non-null  float64
 14  Bwd Pkt Len Mean   375178 non-null  float64
 15  Bwd Pkt Len Std    375178 non-null  float64
 16  Fl

None

Dataframe: 'df_BruteForce_Web_XSS'



Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,...,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0,2018842.0
mean,0.131693,0.5372895,0.5484077,0.1477406,0.007344866,0.003922449,0.01777419,0.001160613,0.09123627,0.05708087,...,0.06045183,0.3533774,0.005652426,0.008268464,0.009102566,0.003779629,0.0771479,0.001990953,0.07749086,0.07651458
std,0.2706631,0.3010519,0.3241216,0.2950113,0.01476332,0.01133628,0.03661285,0.009761032,0.1373034,0.09371883,...,0.09898134,0.1290342,0.04107647,0.05554513,0.05382829,0.03986322,0.2064313,0.02176179,0.2068085,0.2062621
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0008087528,0.3529412,0.2268571,4.916667e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1818182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.00675985,0.3529412,0.6660802,0.0007580046,0.002309469,0.001203369,0.002452404,6.973217e-05,0.02083333,0.0,...,0.0,0.4166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0517144,1.0,0.8375425,0.04143242,0.009333333,0.004813478,0.0233067,0.0004419389,0.09603659,0.1516588,...,0.1052632,0.4545455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.000336,0.352941,0.609019,0.012772,0.013333,0.007388,0.076089,0.000992,0.329268,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,0.000336,0.352941,0.61101,0.014545,0.013333,0.007388,0.076089,0.000992,0.329268,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
6,0.0,0.0,0.608926,0.93868,0.002667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.469372,1e-05,0.469373,0.469372,Benign
7,0.000336,0.352941,0.613,0.013053,0.013333,0.007388,0.076089,0.000992,0.329268,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
12,0.000336,0.352941,0.614991,0.013425,0.016,0.007388,0.076089,0.000992,0.329268,0.0,...,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


<class 'pandas.core.frame.DataFrame'>
Index: 2018842 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Dst Port           float64
 1   Protocol           float64
 2   Timestamp          float64
 3   Flow Duration      float64
 4   Tot Fwd Pkts       float64
 5   Tot Bwd Pkts       float64
 6   TotLen Fwd Pkts    float64
 7   TotLen Bwd Pkts    float64
 8   Fwd Pkt Len Max    float64
 9   Fwd Pkt Len Min    float64
 10  Fwd Pkt Len Mean   float64
 11  Fwd Pkt Len Std    float64
 12  Bwd Pkt Len Max    float64
 13  Bwd Pkt Len Min    float64
 14  Bwd Pkt Len Mean   float64
 15  Bwd Pkt Len Std    float64
 16  Flow Byts/s        float64
 17  Flow Pkts/s        float64
 18  Flow IAT Mean      float64
 19  Flow IAT Std       float64
 20  Flow IAT Max       float64
 21  Flow IAT Min       float64
 22  Fwd IAT Tot        float64
 23  Fwd IAT Mean       float64
 24  Fwd IAT Std        float64
 25  Fwd IAT Max        floa

None

In [None]:
combined = pd.concat(dfs_final.values())

Merge all attacks in one dataset for future algorithm analysis. There will be some null values because the datasets don't have the same number of columns. Those values will be replaced by NaN droped

In [None]:
for column in combined.columns:
    if (combined[column] == 0).all():
        combined[column] = combined[column].replace(0,np.nan)

combined=combined.dropna(axis=1,how='all')

In [None]:
combined.describe()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,...,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0,6067376.0
mean,0.2236834,0.3914372,0.395645,0.1047105,0.06395943,0.04106954,0.06450288,0.02140946,0.2005419,0.03674262,...,0.05786123,0.4514294,0.003182855,0.004058433,0.00484778,0.002301969,0.04321522,0.001504475,0.0434847,0.04280323
std,0.351064,0.3061562,0.3596527,0.2314279,0.1464189,0.1013113,0.1644697,0.05524785,0.3036082,0.0869165,...,0.1044076,0.1806451,0.0313743,0.03873132,0.03919678,0.03046238,0.1661555,0.02130553,0.1665322,0.1659264
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0008087528,0.3529412,0.04877811,6.050001e-06,3.229682e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.4166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.00675985,0.3529412,0.2177365,0.0001423436,0.002309469,0.002183406,0.001076763,0.0001164134,0.02337398,0.0,...,0.0,0.4545455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.5576037,0.3529412,0.7947406,0.03252026,0.02078522,0.01612903,0.0210187,0.003467101,0.3683943,0.0,...,0.05102041,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
combined['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Benign,4263038
DoS attacks-Hulk,439126
DDOS attack-HOIC,360833
Bot,285763
FTP-BruteForce,193354
SSH-Bruteforce,187589
Infilteration,152861
DoS attacks-SlowHTTPTest,139890
DoS attacks-GoldenEye,39924
DoS attacks-Slowloris,2724


In [None]:
df_std = combined.select_dtypes(include=np.number).std()
display(df_std.sort_values())

Unnamed: 0,0
FIN Flag Cnt,0.000406
Idle Std,0.021306
Bwd IAT Min,0.027300
Active Min,0.030462
Active Mean,0.031374
...,...
Init Bwd Win Byts,0.368085
ECE Flag Cnt,0.394477
RST Flag Cnt,0.394477
ACK Flag Cnt,0.491794


In [None]:
combined.shape

(6067376, 70)

In [None]:
output_dir = 'cicids2018_preprocessed.parquet'
combined.to_parquet(output_dir)

## The datasets output are preprocessed and ready to use in ML modeling