In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# files under it can be accessed with ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds the input CSV files.
dataset_path = r'/content/drive/MyDrive/cic-ids2'

In [None]:
import numpy as np
import pandas as pd
import glob
import os

In [None]:
# Explicit dtype for every column that is loaded from the CSV files.
# Integer and float widths are chosen per column (presumably to keep the
# in-memory frame small -- confirm the ranges hold for any new input file,
# since a value outside the declared width makes read_csv fail).
types = {
    'Dst Port': np.int32,
    'Protocol': np.int8,
    'Flow Duration': np.int64,
    'Tot Fwd Pkts': np.int16,
    'Tot Bwd Pkts': np.int16,
    'TotLen Fwd Pkts': np.int32,
    'TotLen Bwd Pkts': np.int32,
    'Fwd Pkt Len Max': np.int32,
    'Fwd Pkt Len Min': np.int32,
    'Fwd Pkt Len Mean': np.float64,
    'Fwd Pkt Len Std': np.float64,
    'Bwd Pkt Len Max': np.int16,
    'Bwd Pkt Len Min': np.int16,
    'Bwd Pkt Len Mean': np.float64,
    'Bwd Pkt Len Std': np.float64,
    'Flow Byts/s': np.float64,
    'Flow Pkts/s': np.float64,
    'Flow IAT Mean': np.float64,
    'Flow IAT Std': np.float64,
    'Flow IAT Max': np.int64,
    'Flow IAT Min': np.int32,
    'Fwd IAT Tot': np.int32,
    'Fwd IAT Mean': np.float32,
    'Fwd IAT Std': np.float64,
    'Fwd IAT Max': np.int32,
    'Fwd IAT Min': np.int32,
    'Bwd IAT Tot': np.int32,
    'Bwd IAT Mean': np.float64,
    'Bwd IAT Std': np.float64,
    'Bwd IAT Max': np.int64,
    'Bwd IAT Min': np.int64,
    'Fwd PSH Flags': np.int8,
    'Bwd PSH Flags': np.int8,
    'Fwd URG Flags': np.int8,
    'Bwd URG Flags': np.int8,
    'Fwd Header Len': np.int32,
    'Bwd Header Len': np.int32,
    'Fwd Pkts/s': np.float64,
    'Bwd Pkts/s': np.float64,
    'Pkt Len Min': np.int16,
    'Pkt Len Max': np.int32,
    'Pkt Len Mean': np.float64,
    'Pkt Len Std': np.float64,
    'Pkt Len Var': np.float64,
    'FIN Flag Cnt': np.int8,
    'SYN Flag Cnt': np.int8,
    'RST Flag Cnt': np.int8,
    'PSH Flag Cnt': np.int8,
    'ACK Flag Cnt': np.int8,
    'URG Flag Cnt': np.int8,
    'CWE Flag Count': np.int8,
    'ECE Flag Cnt': np.int8,
    'Pkt Size Avg': np.float32,
    'Fwd Seg Size Avg': np.float32,
    'Bwd Seg Size Avg': np.float32,
    'Fwd Byts/b Avg': np.int8,
    'Fwd Pkts/b Avg': np.int8,
    'Fwd Blk Rate Avg': np.int8,
    'Bwd Byts/b Avg': np.int8,
    'Bwd Pkts/b Avg': np.int8,
    'Bwd Blk Rate Avg': np.int8,
    'Subflow Fwd Pkts': np.int16,
    'Subflow Fwd Byts': np.int32,
    'Subflow Bwd Pkts': np.int16,
    'Subflow Bwd Byts': np.int32,
    'Init Fwd Win Byts': np.int32,
    'Init Bwd Win Byts': np.int32,
    'Fwd Act Data Pkts': np.int16,
    'Fwd Seg Size Min': np.int8,
    'Active Mean': np.float64,
    'Active Std': np.float64,
    'Active Max': np.int32,
    'Active Min': np.int32,
    'Idle Mean': np.float64,
    'Idle Std': np.float64,
    'Idle Max': np.int64,
    'Idle Min': np.int64,
    'Label': object
}

# Columns to read: exactly the ones that have a declared dtype. Stored as a
# plain list (a snapshot) rather than a live dict view, so it is indexable and
# does not change if `types` is edited later.
used_cols = list(types)

In [None]:
def _read_day(file_name):
    """Read one daily CSV from `dataset_path`.

    Only the columns in `used_cols` are loaded, each with the dtype declared
    in `types`.

    Parameters
    ----------
    file_name : str
        File name (not a full path) of the CSV inside `dataset_path`.

    Returns
    -------
    pandas.DataFrame
        The loaded frame.
    """
    return pd.read_csv(os.path.join(dataset_path, file_name), dtype=types, usecols=used_cols)


# Load the three daily files. Paths are built from `dataset_path` so the
# dataset location is defined in a single place.
df1 = _read_day('02-14-2018.csv')
df2 = _read_day('02-15-2018.csv')
df9 = _read_day('03-02-2018.csv')


In [None]:
# Per-file frames to concatenate, in the order they were loaded.
merge = [df1, df2, df9]


In [None]:
# Stack the per-file frames into one DataFrame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every file keeps its own row labels,
# so the same label appears once per source file and label-based selection or
# alignment becomes ambiguous.
df = pd.concat(merge, ignore_index=True)
# Drop the list name. The per-file frames are still referenced by
# df1, df2 and df9, so this alone does not release their memory.
del merge


In [None]:
# Print the index range, column names and dtypes of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3145725 entries, 0 to 1048574
Data columns (total 78 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Dst Port           int32  
 1   Protocol           int8   
 2   Flow Duration      int64  
 3   Tot Fwd Pkts       int16  
 4   Tot Bwd Pkts       int16  
 5   TotLen Fwd Pkts    int32  
 6   TotLen Bwd Pkts    int32  
 7   Fwd Pkt Len Max    int32  
 8   Fwd Pkt Len Min    int32  
 9   Fwd Pkt Len Mean   float64
 10  Fwd Pkt Len Std    float64
 11  Bwd Pkt Len Max    int16  
 12  Bwd Pkt Len Min    int16  
 13  Bwd Pkt Len Mean   float64
 14  Bwd Pkt Len Std    float64
 15  Flow Byts/s        float64
 16  Flow Pkts/s        float64
 17  Flow IAT Mean      float64
 18  Flow IAT Std       float64
 19  Flow IAT Max       int64  
 20  Flow IAT Min       int32  
 21  Fwd IAT Tot        int32  
 22  Fwd IAT Mean       float32
 23  Fwd IAT Std        float64
 24  Fwd IAT Max        int32  
 25  Fwd IAT Min       

# Filling Missing Values

There are inf values in the data, and they cannot be used in calculations. So first we have to identify the columns that contain inf values.

In [None]:
# Total number of missing (NaN) cells across all columns.
df.isna().sum().sum()

9756

In [None]:
# Names of the columns that contain at least one infinite value.
# np.isinf flags both +inf and -inf, matching the replacement step that
# follows (which replaces both). Only float columns are tested: integer and
# object columns cannot hold inf, so checking them is wasted work.
columns = [
    col
    for col in df.columns
    if pd.api.types.is_float_dtype(df[col]) and np.isinf(df[col]).any()
]

print(columns)

['Flow Byts/s', 'Flow Pkts/s']


The inf values in these two columns are replaced with NaN, and the NaN values are then filled with the column mean.

In [None]:
# For every column that contains inf: turn +/-inf into NaN, then fill all NaN
# in that column (former inf values and pre-existing NaN alike) with the mean
# of the remaining finite values.
# The result is assigned back to df[col] instead of calling the methods with
# inplace=True on df[col]: an in-place call on a column selection acts on an
# intermediate object and does not update df under pandas Copy-on-Write.
for col in columns:
  finite_or_nan = df[col].replace([np.inf, -np.inf], np.nan)
  # Series.mean() skips NaN, so the mean is taken over finite values only.
  df[col] = finite_or_nan.fillna(finite_or_nan.mean())

In [None]:
# Re-count missing (NaN) cells after the fill step.
df.isna().sum().sum()

0

# Treating Protocol as a Categorical Feature

In [None]:
# Distinct values present in the Protocol column.
df["Protocol"].unique() 

array([ 0,  6, 17], dtype=int8)

In [None]:
# Cast the Protocol column to str; all other columns keep their dtype.
df= df.astype({"Protocol": str})

In [None]:
# Distinct Protocol values after the cast to str.
df["Protocol"].unique() 

array(['0', '6', '17'], dtype=object)

In [None]:
# One-hot encode Protocol: the column is replaced by indicator columns, and
# drop_first=True leaves out the indicator for the first category level.
# NOTE(review): df is overwritten here, so re-running this cell on the
# already-encoded frame will not find a 'Protocol' column -- rebuild df first.
df = pd.get_dummies(df, columns=['Protocol'], drop_first=True)

In [None]:
# Print the column list and dtypes again after the one-hot encoding step.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3145725 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Dst Port           int32  
 1   Flow Duration      int64  
 2   Tot Fwd Pkts       int16  
 3   Tot Bwd Pkts       int16  
 4   TotLen Fwd Pkts    int32  
 5   TotLen Bwd Pkts    int32  
 6   Fwd Pkt Len Max    int32  
 7   Fwd Pkt Len Min    int32  
 8   Fwd Pkt Len Mean   float64
 9   Fwd Pkt Len Std    float64
 10  Bwd Pkt Len Max    int16  
 11  Bwd Pkt Len Min    int16  
 12  Bwd Pkt Len Mean   float64
 13  Bwd Pkt Len Std    float64
 14  Flow Byts/s        float64
 15  Flow Pkts/s        float64
 16  Flow IAT Mean      float64
 17  Flow IAT Std       float64
 18  Flow IAT Max       int64  
 19  Flow IAT Min       int32  
 20  Fwd IAT Tot        int32  
 21  Fwd IAT Mean       float32
 22  Fwd IAT Std        float64
 23  Fwd IAT Max        int32  
 24  Fwd IAT Min        int32  
 25  Bwd IAT Tot       

In [None]:
# Distinct values present in the Label column.
df["Label"].unique()

array(['Benign', 'FTP-BruteForce', 'SSH-Bruteforce',
       'DoS attacks-GoldenEye', 'DoS attacks-Slowloris', 'Bot'],
      dtype=object)

In [None]:
# Write the processed frame to Drive as CSV, without the index column.
# NOTE(review): the target folder is 'cic-ids', while the input files were
# read from 'cic-ids2' -- confirm this is intended and that the folder exists.
df.to_csv("/content/drive/MyDrive/cic-ids/cleaned_data1.csv", index=False)