# Data Pre-Processing [Cleaning + Feature Engineering] 

## Knowing the features

### Importing Packages

In [6]:
import tensorflow as tf
import pandas as pd
import numpy as np

### Setting up paths to csv files / datasets

In [7]:
# Folders that hold the two groups of CSV files used in this notebook.
dir_01_12 = "../CICDDoS-2019/CSV-01-12/01-12/"
dir_03_11 = "../CICDDoS-2019/CSV-03-11/03-11/"

# CSV-01-12
path_DrDoS_DNS = dir_01_12 + "DrDoS_DNS.csv"
path_DrDoS_MSSQL = dir_01_12 + "DrDoS_MSSQL.csv"
path_DrDoS_LDAP = dir_01_12 + "DrDoS_LDAP.csv"
path_DrDoS_NTP = dir_01_12 + "DrDoS_NTP.csv"
path_DrDoS_NetBIOS = dir_01_12 + "DrDoS_NetBIOS.csv"
path_DrDoS_SNMP = dir_01_12 + "DrDoS_SNMP.csv"
path_DrDoS_SSDP = dir_01_12 + "DrDoS_SSDP.csv"
path_DrDoS_UDP = dir_01_12 + "DrDoS_UDP.csv"
path_Syn = dir_01_12 + "Syn.csv"
path_TFTP = dir_01_12 + "TFTP.csv"
path_UDPLag = dir_01_12 + "UDPLag.csv"

# CSV-03-11
path__LDAP = dir_03_11 + "LDAP.csv"
path__MSSQL = dir_03_11 + "MSSQL.csv"
path__NetBIOS = dir_03_11 + "NetBIOS.csv"
path__Portmap = dir_03_11 + "Portmap.csv"
path__Syn = dir_03_11 + "Syn.csv"
path__UDP = dir_03_11 + "UDP.csv"
path__UDPLag = dir_03_11 + "UDPLag.csv"

# Every file processed by the cells below, 01-12 files first, then 03-11.
paths = [
    # CSV-01-12
    path_DrDoS_DNS,
    path_DrDoS_MSSQL,
    path_DrDoS_LDAP,
    path_DrDoS_NTP,
    path_DrDoS_NetBIOS,
    path_DrDoS_SNMP,
    path_DrDoS_SSDP,
    path_DrDoS_UDP,
    path_Syn,
    path_TFTP,
    path_UDPLag,
    # CSV-03-11
    path__LDAP,
    path__MSSQL,
    path__NetBIOS,
    path__Portmap,
    path__Syn,
    path__UDP,
    path__UDPLag,
]


### Reading a large CSV file in chunks so that it is workable on our machine, and returning the result as a DataFrame

In [8]:
def readALargeCSVFileAndGetResultAsDF(path, chunksize=20000):
    """Read a CSV file chunk by chunk and return it as a single DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    chunksize : int, optional
        Number of rows parsed per chunk. Defaults to 20000, the value this
        notebook has always used, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated row-wise, in file order.
    """
    # With `chunksize` set, read_csv returns an iterator of DataFrames
    # instead of parsing the whole file in one go.
    chunks = [chunk for chunk in pd.read_csv(path, chunksize=chunksize, low_memory=False)]

    # A single concat at the end avoids growing a frame chunk by chunk.
    return pd.concat(chunks, axis=0)

### Function that reports, for one CSV file, which features are zero in more than a threshold percentage of rows, so that we can decide which features to eliminate

In [9]:
# One entry per analysed file: [path, threshold percentage, flagged column names].
featureSelectedBasedOn0Results = []


def giveInfoAboutFile(path, threshHoldPercentage, showPercentage):
    """Find the columns of one CSV file that are zero in most rows.

    Loads the file at `path`, counts the cells equal to 0 in every column,
    and flags each column whose percentage of zeros is strictly greater
    than `threshHoldPercentage`.

    Parameters
    ----------
    path : str
        CSV file to analyse. The file at `path_TFTP` is read through the
        chunked reader; every other file is read in one call.
    threshHoldPercentage : int or float
        Percentage of zero rows a column must exceed to be flagged.
    showPercentage : bool
        When true, print name, zero count and zero percentage for every
        column that contains at least one zero.

    Returns
    -------
    None
        The result `[path, threshHoldPercentage, flagged column names]` is
        appended to the module-level `featureSelectedBasedOn0Results` list,
        and a one-line summary is printed.
    """
    if path == path_TFTP:
        frame = readALargeCSVFileAndGetResultAsDF(path)
    else:
        frame = pd.read_csv(path, low_memory=False)

    columnTotal = frame.shape[1]
    rowTotal = len(frame)
    flaggedColumns = []

    for columnName in frame.columns:
        zeroCount = (frame[columnName] == 0).sum()

        # Columns without a single zero are never reported or flagged.
        if zeroCount == 0:
            continue

        zeroPercentage = ((zeroCount*100)/rowTotal)

        if showPercentage:
            print(columnName , " - ", zeroCount, " - ", zeroPercentage)

        if zeroPercentage > threshHoldPercentage:
            flaggedColumns.append(columnName)

    print()

    # Record the outcome for the cells that run after this one.
    featureSelectedBasedOn0Results.append([path, threshHoldPercentage, flaggedColumns])

    print()
    print("In", path, "Total features having more than ", threshHoldPercentage,
          "% zero are - ", len(flaggedColumns), "out of ", columnTotal)

### Running the `giveInfoAboutFile` function for all the CSV files

In [10]:
# Start from an empty result list: giveInfoAboutFile appends one entry per
# call, so re-running this cell without clearing would leave every file in
# the list twice and make the later cells process each file twice.
featureSelectedBasedOn0Results.clear()

# Flag, per file, the columns that are zero in more than 99% of the rows
# (per-column percentages are not printed).
for path in paths:
    giveInfoAboutFile(path, 99, False)
    
featureSelectedBasedOn0Results



In ../CICDDoS-2019/CSV-01-12/01-12/DrDoS_DNS.csv Total features having more than  99 % zero are -  48 out of  88


In ../CICDDoS-2019/CSV-01-12/01-12/DrDoS_MSSQL.csv Total features having more than  99 % zero are -  48 out of  88


In ../CICDDoS-2019/CSV-01-12/01-12/DrDoS_LDAP.csv Total features having more than  99 % zero are -  48 out of  88


In ../CICDDoS-2019/CSV-01-12/01-12/DrDoS_NTP.csv Total features having more than  99 % zero are -  34 out of  88


In ../CICDDoS-2019/CSV-01-12/01-12/DrDoS_NetBIOS.csv Total features having more than  99 % zero are -  48 out of  88


In ../CICDDoS-2019/CSV-01-12/01-12/DrDoS_SNMP.csv Total features having more than  99 % zero are -  48 out of  88


In ../CICDDoS-2019/CSV-01-12/01-12/DrDoS_SSDP.csv Total features having more than  99 % zero are -  43 out of  88


In ../CICDDoS-2019/CSV-01-12/01-12/DrDoS_UDP.csv Total features having more than  99 % zero are -  43 out of  88


In ../CICDDoS-2019/CSV-01-12/01-12/Syn.csv Total features having more

[['../CICDDoS-2019/CSV-01-12/01-12/DrDoS_DNS.csv',
  99,
  [' Total Backward Packets',
   ' Total Length of Bwd Packets',
   ' Fwd Packet Length Std',
   'Bwd Packet Length Max',
   ' Bwd Packet Length Min',
   ' Bwd Packet Length Mean',
   ' Bwd Packet Length Std',
   ' Flow IAT Std',
   ' Fwd IAT Std',
   'Bwd IAT Total',
   ' Bwd IAT Mean',
   ' Bwd IAT Std',
   ' Bwd IAT Max',
   ' Bwd IAT Min',
   'Fwd PSH Flags',
   ' Bwd PSH Flags',
   ' Fwd URG Flags',
   ' Bwd URG Flags',
   ' Bwd Header Length',
   ' Bwd Packets/s',
   ' Packet Length Std',
   ' Packet Length Variance',
   'FIN Flag Count',
   ' SYN Flag Count',
   ' RST Flag Count',
   ' PSH Flag Count',
   ' ACK Flag Count',
   ' URG Flag Count',
   ' CWE Flag Count',
   ' ECE Flag Count',
   ' Down/Up Ratio',
   ' Avg Bwd Segment Size',
   'Fwd Avg Bytes/Bulk',
   ' Fwd Avg Packets/Bulk',
   ' Fwd Avg Bulk Rate',
   ' Bwd Avg Bytes/Bulk',
   ' Bwd Avg Packets/Bulk',
   'Bwd Avg Bulk Rate',
   ' Subflow Bwd Packets',
   ' S

### Seeing the feature counts 

In [11]:
# Each stored entry is [path, threshold percentage, flagged column names];
# show how many columns were flagged for every file, one blank line apart.
for csvPath, thresholdUsed, zeroHeavyFeatures in featureSelectedBasedOn0Results:
    print(f"{csvPath} --- {len(zeroHeavyFeatures)}", end="\n\n")

../CICDDoS-2019/CSV-01-12/01-12/DrDoS_DNS.csv --- 48

../CICDDoS-2019/CSV-01-12/01-12/DrDoS_MSSQL.csv --- 48

../CICDDoS-2019/CSV-01-12/01-12/DrDoS_LDAP.csv --- 48

../CICDDoS-2019/CSV-01-12/01-12/DrDoS_NTP.csv --- 34

../CICDDoS-2019/CSV-01-12/01-12/DrDoS_NetBIOS.csv --- 48

../CICDDoS-2019/CSV-01-12/01-12/DrDoS_SNMP.csv --- 48

../CICDDoS-2019/CSV-01-12/01-12/DrDoS_SSDP.csv --- 43

../CICDDoS-2019/CSV-01-12/01-12/DrDoS_UDP.csv --- 43

../CICDDoS-2019/CSV-01-12/01-12/Syn.csv --- 38

../CICDDoS-2019/CSV-01-12/01-12/TFTP.csv --- 46

../CICDDoS-2019/CSV-01-12/01-12/UDPLag.csv --- 25

../CICDDoS-2019/CSV-03-11/03-11/LDAP.csv --- 48

../CICDDoS-2019/CSV-03-11/03-11/MSSQL.csv --- 48

../CICDDoS-2019/CSV-03-11/03-11/NetBIOS.csv --- 48

../CICDDoS-2019/CSV-03-11/03-11/Portmap.csv --- 29

../CICDDoS-2019/CSV-03-11/03-11/Syn.csv --- 21

../CICDDoS-2019/CSV-03-11/03-11/UDP.csv --- 43

../CICDDoS-2019/CSV-03-11/03-11/UDPLag.csv --- 18



## 
---
---
---

## Two approaches for DATA CLEANING...

**1. The sets of mostly-zero features differ from file to file, so intersect them to get the smaller subset of features that are (almost entirely) zero across all the files, and drop only those features. Every file then keeps the same columns, so merging the files will be easy.**
<br/>
<br/>
**2. Drop, from each file, the features flagged for that file. Merging all the files will then be difficult, because the files end up with different features.**

### Function that will save the new csv to a proper destination

In [41]:
def saveNewCSV(path, newPathDir, eliminatingFeatures, fileNewName):
    """Copy a CSV file to a new location with some columns removed.

    Parameters
    ----------
    path : str
        CSV file to read. The file at `path_TFTP` is read through the
        chunked reader; every other file is read in one call.
    newPathDir : str
        Destination prefix. It is concatenated with `fileNewName` as-is,
        so it should end with a path separator.
    eliminatingFeatures : list
        Column labels to remove before saving.
    fileNewName : str
        Name of the file to create.

    Raises
    ------
    KeyError
        If a label in `eliminatingFeatures` is not a column of the file
        (raised by `DataFrame.drop`).
    """
    # Imported here so the function does not rely on a file-level import of os.
    import os

    # read files
    if path == path_TFTP:
        df = readALargeCSVFileAndGetResultAsDF(path)
    else:
        df = pd.read_csv(path, low_memory=False)

    # remove the cols (returns a new frame instead of mutating in place)
    df = df.drop(eliminatingFeatures, axis=1)

    # make sure the destination folder exists; to_csv does not create it
    destination = newPathDir + fileNewName
    parentDir = os.path.dirname(destination)
    if parentDir:
        os.makedirs(parentDir, exist_ok=True)

    # save to directory; index=False because the default would add an extra
    # unnamed column holding the row index to every saved file
    df.to_csv(destination, index=False)


### Approach 01 [intersecting features]

#### Find the intersecting feature sets

**This piece of code is for getting intersecting elements between two lists**

In [13]:
def intersection(lst1, lst2):
    """Return the items of `lst1` that also occur in `lst2`.

    The result is a new list that keeps the order of `lst1`, including any
    repeated items.
    """
    shared = []
    for item in lst1:
        if item in lst2:
            shared.append(item)
    return shared

**Computing the intersection across all files: the resulting features exceed the zero-percentage threshold in every CSV file**

In [19]:
# Features flagged in every file. None means "no file seen yet"; an empty
# list cannot be used for that, because an intersection that has become
# empty would then be re-seeded with the next file's features and end up
# wrongly non-empty.
eliminatingFeaturesBasedOnIntersection = None
for featureSet in featureSelectedBasedOn0Results:
    if eliminatingFeaturesBasedOnIntersection is None:
        # Copy so the stored per-file list is never shared with the result.
        eliminatingFeaturesBasedOnIntersection = list(featureSet[2])
    else:
        eliminatingFeaturesBasedOnIntersection = intersection(
            eliminatingFeaturesBasedOnIntersection, featureSet[2])

# No files analysed at all: nothing to eliminate.
if eliminatingFeaturesBasedOnIntersection is None:
    eliminatingFeaturesBasedOnIntersection = []

print(len(eliminatingFeaturesBasedOnIntersection))
print(eliminatingFeaturesBasedOnIntersection)

17
[' Bwd Packet Length Std', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', 'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count', ' CWE Flag Count', ' ECE Flag Count', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate']


#### Dropping the features

**run code for all the files**
**saving to '../newDatasets/Intersecting Feature Elimination/' directory**

In [37]:
newDir = '../newDatasets/Intersecting Feature Elimination/'

for path in paths:
    # New file name: last folder and file name of the source path joined
    # by '__', e.g. '01-12__Syn.csv'.
    name = '__'.join(path.split('/')[-2:])

    # Drop the same feature set from every file.
    saveNewCSV(path, newDir, eliminatingFeaturesBasedOnIntersection, name)


### Approach 02 [individual features]

#### Dropping the features

**run code for all the files**
**saving to '../newDatasets/Individual Elimination/' directory**

In [42]:
newDir = '../newDatasets/Individual Elimination/'

for eliminationInfo in featureSelectedBasedOn0Results:
    # Entry layout: [path, threshold percentage, flagged column names].
    path, eliminatingFeaturesBasedOnIndividual = eliminationInfo[0], eliminationInfo[2]

    # New file name: last folder and file name of the source path joined
    # by '__', e.g. '01-12__Syn.csv'.
    name = '__'.join(path.split('/')[-2:])

    # Drop only the features that were flagged for this particular file.
    saveNewCSV(path, newDir, eliminatingFeaturesBasedOnIndividual, name)

## 
---
---
---