# Data Pre-Processing [Cleaning + Feature Engineering] 

## Knowing the features

### Importing Packages

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random

### Setting up paths to csv files / datasets

In [None]:
# CSV-01-12
path_DrDoS_DNS = "../CICDDoS-2019/CSV-01-12/01-12/DrDoS_DNS.csv"
path_DrDoS_MSSQL = "../CICDDoS-2019/CSV-01-12/01-12/DrDoS_MSSQL.csv"
path_DrDoS_LDAP = "../CICDDoS-2019/CSV-01-12/01-12/DrDoS_LDAP.csv"
path_DrDoS_NTP = "../CICDDoS-2019/CSV-01-12/01-12/DrDoS_NTP.csv"
path_DrDoS_NetBIOS = "../CICDDoS-2019/CSV-01-12/01-12/DrDoS_NetBIOS.csv"
path_DrDoS_SNMP = "../CICDDoS-2019/CSV-01-12/01-12/DrDoS_SNMP.csv"
path_DrDoS_SSDP = "../CICDDoS-2019/CSV-01-12/01-12/DrDoS_SSDP.csv"
path_DrDoS_UDP = "../CICDDoS-2019/CSV-01-12/01-12/DrDoS_UDP.csv"
path_Syn = "../CICDDoS-2019/CSV-01-12/01-12/Syn.csv"
path_TFTP = "../CICDDoS-2019/CSV-01-12/01-12/TFTP.csv"
path_UDPLag = "../CICDDoS-2019/CSV-01-12/01-12/UDPLag.csv"

# CSV-03-11
path__LDAP = "../CICDDoS-2019/CSV-03-11/03-11/LDAP.csv"
path__MSSQL = "../CICDDoS-2019/CSV-03-11/03-11/MSSQL.csv"
path__NetBIOS = "../CICDDoS-2019/CSV-03-11/03-11/NetBIOS.csv"
path__Portmap = "../CICDDoS-2019/CSV-03-11/03-11/Portmap.csv"
path__Syn = "../CICDDoS-2019/CSV-03-11/03-11/Syn.csv"
path__UDP = "../CICDDoS-2019/CSV-03-11/03-11/UDP.csv"
path__UDPLag = "../CICDDoS-2019/CSV-03-11/03-11/UDPLag.csv"

paths = [path_DrDoS_DNS, path_DrDoS_MSSQL, path_DrDoS_LDAP, path_DrDoS_NTP, path_DrDoS_NetBIOS, path_DrDoS_SNMP,
         path_DrDoS_SSDP, path_DrDoS_UDP, path_Syn, path_TFTP, path_UDPLag, path__LDAP, path__MSSQL, path__NetBIOS, path__Portmap, path__Syn, path__UDP, path__UDPLag]


### Making a large csv file workable in our machine and returning the results as df

In [None]:
def readALargeCSVFileAndGetResultAsDF(path):
    mylist = []

    for chunk in pd.read_csv(path, chunksize=20000, low_memory=False):
        mylist.append(chunk)

    big_data = pd.concat(mylist, axis=0)
    del mylist
    return big_data

### Function that gives us a complete rundown about which features are crossing the threshold mark of having zeros[0] ... So that we can come to a decision for eliminating those

In [None]:
featureSelectedBasedOn0Results = []

def giveInfoAboutFile(path, threshHoldPercentage, showPercentage):
    
    if path == path_TFTP:
        df = readALargeCSVFileAndGetResultAsDF(path)
    else:
        df = pd.read_csv(path, low_memory=False)
    
    df = df.dropna()
    
    totalCols = df.shape[1]
    totalRows = len(df)
    unnecessaryFeatureCount = 0
    unnecessaryFeatureNames = []

    for column in df:
        zerosInCol = (df[column] == 0).sum()
        if zerosInCol != 0:
            percentageOfZerosInRow = ((zerosInCol*100)/totalRows)
            
            if showPercentage:
                print(column , " - ", zerosInCol, " - ", percentageOfZerosInRow)

            if percentageOfZerosInRow > threshHoldPercentage:
                unnecessaryFeatureNames.append(column)
                unnecessaryFeatureCount = unnecessaryFeatureCount + 1
                
    print()
        
    unitResult = [path, threshHoldPercentage, unnecessaryFeatureNames]
    
    featureSelectedBasedOn0Results.append(unitResult)
    
    print()
    print("In", path, "Total features having more than ", threshHoldPercentage,
          "% zero are - ", unnecessaryFeatureCount, "out of ", totalCols)

### Running the {{giveInfoAboutFile}} function for all the csv files

In [None]:
for path in paths:
    giveInfoAboutFile(path, 99, False)
    
featureSelectedBasedOn0Results

### Seeing the feature counts 

In [None]:
for case in featureSelectedBasedOn0Results:
    print(case[0], "---", len(case[2]))
    print()

## 
---
---
---

## Two approaches for DATA CLEANING...

**1. As the feature counts are not the same so have to intersect them and after getting a small subset [features that are 0 across all the files] and dropping those features... Merging files will be easy**
<br/>
<br/>
**2. Drop the features as per the result...  Merging all the files will be difficult as files will be then with different features**

### Function that will save the new csv to a proper destination

In [None]:
def saveNewCSV(path, newPathDir, eliminatingFeatures, fileNewName):
    
    # read files
    if path == path_TFTP:
        df = readALargeCSVFileAndGetResultAsDF(path)
    else:
        df = pd.read_csv(path, low_memory=False)
        
    # remove the cols
    df.drop(eliminatingFeatures, axis=1, inplace=True)

    # save to directory
    df.to_csv(newPathDir + fileNewName)


### Approach 01 [intersecting features]

#### Find the intersecting feature sets

**This piece of code is for getting intersecting elements between two lists**

In [None]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

**Running for the intersecting elements... These elements are giving at least threshold percentage of 0 in all the csv files**

In [None]:
eliminatingFeaturesBasedOnIntersection = []
for featureSet in featureSelectedBasedOn0Results:
    if len(eliminatingFeaturesBasedOnIntersection) == 0:
        eliminatingFeaturesBasedOnIntersection = featureSet[2]
    else:
        eliminatingFeaturesBasedOnIntersection = intersection(
            eliminatingFeaturesBasedOnIntersection, featureSet[2])
print(len(eliminatingFeaturesBasedOnIntersection))
print(eliminatingFeaturesBasedOnIntersection)

#### Dropping the features

**run code for all the files**
**saving to '../FinalSmallDatasets/Intersecting Feature Elimination/' directory**

In [None]:
newDir = '../FinalSmallDatasets/Intersecting Feature Elimination/'

for path in paths:
    
    # make the new name
    name = path.split('/')
    name = name[len(name)-2] + '__' + name[len(name)-1]
    
    saveNewCSV(path, newDir, eliminatingFeaturesBasedOnIntersection, name)


### Approach 02 [individual features]

#### Dropping the features

**run code for all the files**
**saving to '../FinalSmallDatasets/Individual Elimination/' directory**

In [None]:
newDir = '../FinalSmallDatasets/Individual Elimination/'

for eliminationInfo in featureSelectedBasedOn0Results:
    
    path = eliminationInfo[0]
    eliminatingFeaturesBasedOnIndividual = eliminationInfo[2]
    
    # make the new name
    name = path.split('/')
    name = name[len(name)-2] + '__' + name[len(name)-1]
    
    saveNewCSV(path, newDir, eliminatingFeaturesBasedOnIndividual, name)

## 
---
---
---

## Taking Less Data

### Get files paths


In [None]:
# dropped by intersecting features files paths

path_dropped_intersection_01_12__DrDoS_DNS = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__DrDoS_DNS.csv'
path_dropped_intersection_01_12__DrDoS_LDAP = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__DrDoS_LDAP.csv'
path_dropped_intersection_01_12__DrDoS_MSSQL = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__DrDoS_MSSQL.csv'
path_dropped_intersection_01_12__DrDoS_NetBIOS = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__DrDoS_NetBIOS.csv'
path_dropped_intersection_01_12__DrDoS_NTP = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__DrDoS_NTP.csv'
path_dropped_intersection_01_12__DrDoS_SNMP = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__DrDoS_SNMP.csv'
path_dropped_intersection_01_12__DrDoS_SSDP = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__DrDoS_SSDP.csv'
path_dropped_intersection_01_12__DrDoS_UDP = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__DrDoS_UDP.csv'
path_dropped_intersection_01_12__Syn = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__Syn.csv'
path_dropped_intersection_01_12__TFTP = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__TFTP.csv'
path_dropped_intersection_01_12__UDPLag = '../FinalSmallDatasets/Intersecting Feature Elimination/01-12__UDPLag.csv'
path_dropped_intersection_03_11__LDAP = '../FinalSmallDatasets/Intersecting Feature Elimination/03-11__LDAP.csv'
path_dropped_intersection_03_11__MSSQL = '../FinalSmallDatasets/Intersecting Feature Elimination/03-11__MSSQL.csv'
path_dropped_intersection_03_11__NetBIOS = '../FinalSmallDatasets/Intersecting Feature Elimination/03-11__NetBIOS.csv'
path_dropped_intersection_03_11__Portmap = '../FinalSmallDatasets/Intersecting Feature Elimination/03-11__Portmap.csv'
path_dropped_intersection_03_11__Syn = '../FinalSmallDatasets/Intersecting Feature Elimination/03-11__Syn.csv'
path_dropped_intersection_03_11__UDP = '../FinalSmallDatasets/Intersecting Feature Elimination/03-11__UDP.csv'
path_dropped_intersection_03_11__UDPLag = '../FinalSmallDatasets/Intersecting Feature Elimination/03-11__UDPLag.csv'

pathsForIntersectingDroppedFiles = [path_dropped_intersection_01_12__DrDoS_DNS, path_dropped_intersection_01_12__DrDoS_LDAP, path_dropped_intersection_01_12__DrDoS_MSSQL, path_dropped_intersection_01_12__DrDoS_NetBIOS, path_dropped_intersection_01_12__DrDoS_NTP, path_dropped_intersection_01_12__DrDoS_SNMP, path_dropped_intersection_01_12__DrDoS_SSDP, path_dropped_intersection_01_12__DrDoS_UDP, path_dropped_intersection_01_12__Syn, path_dropped_intersection_01_12__TFTP, path_dropped_intersection_01_12__UDPLag, path_dropped_intersection_03_11__LDAP, path_dropped_intersection_03_11__MSSQL, path_dropped_intersection_03_11__NetBIOS, path_dropped_intersection_03_11__Portmap, path_dropped_intersection_03_11__Syn, path_dropped_intersection_03_11__UDP, path_dropped_intersection_03_11__UDPLag]



# dropped by individual features files paths

path_dropped_individual_01_12__DrDoS_DNS = '../FinalSmallDatasets/Individual Elimination/01-12__DrDoS_DNS.csv'
path_dropped_individual_01_12__DrDoS_LDAP = '../FinalSmallDatasets/Individual Elimination/01-12__DrDoS_LDAP.csv'
path_dropped_individual_01_12__DrDoS_MSSQL = '../FinalSmallDatasets/Individual Elimination/01-12__DrDoS_MSSQL.csv'
path_dropped_individual_01_12__DrDoS_NetBIOS = '../FinalSmallDatasets/Individual Elimination/01-12__DrDoS_NetBIOS.csv'
path_dropped_individual_01_12__DrDoS_NTP = '../FinalSmallDatasets/Individual Elimination/01-12__DrDoS_NTP.csv'
path_dropped_individual_01_12__DrDoS_SNMP = '../FinalSmallDatasets/Individual Elimination/01-12__DrDoS_SNMP.csv'
path_dropped_individual_01_12__DrDoS_SSDP = '../FinalSmallDatasets/Individual Elimination/01-12__DrDoS_SSDP.csv'
path_dropped_individual_01_12__DrDoS_UDP = '../FinalSmallDatasets/Individual Elimination/01-12__DrDoS_UDP.csv'
path_dropped_individual_01_12__Syn = '../FinalSmallDatasets/Individual Elimination/01-12__Syn.csv'
path_dropped_individual_01_12__TFTP = '../FinalSmallDatasets/Individual Elimination/01-12__TFTP.csv'
path_dropped_individual_01_12__UDPLag = '../FinalSmallDatasets/Individual Elimination/01-12__UDPLag.csv'
path_dropped_individual_03_11__LDAP = '../FinalSmallDatasets/Individual Elimination/03-11__LDAP.csv'
path_dropped_individual_03_11__MSSQL = '../FinalSmallDatasets/Individual Elimination/03-11__MSSQL.csv'
path_dropped_individual_03_11__NetBIOS = '../FinalSmallDatasets/Individual Elimination/03-11__NetBIOS.csv'
path_dropped_individual_03_11__Portmap = '../FinalSmallDatasets/Individual Elimination/03-11__Portmap.csv'
path_dropped_individual_03_11__Syn = '../FinalSmallDatasets/Individual Elimination/03-11__Syn.csv'
path_dropped_individual_03_11__UDP = '../FinalSmallDatasets/Individual Elimination/03-11__UDP.csv'
path_dropped_individual_03_11__UDPLag = '../FinalSmallDatasets/Individual Elimination/03-11__UDPLag.csv'

pathsForIndividualDroppedFiles = [path_dropped_individual_01_12__DrDoS_DNS, path_dropped_individual_01_12__DrDoS_LDAP, path_dropped_individual_01_12__DrDoS_MSSQL, path_dropped_individual_01_12__DrDoS_NetBIOS, path_dropped_individual_01_12__DrDoS_NTP, path_dropped_individual_01_12__DrDoS_SNMP, path_dropped_individual_01_12__DrDoS_SSDP, path_dropped_individual_01_12__DrDoS_UDP,
                                    path_dropped_individual_01_12__Syn, path_dropped_individual_01_12__TFTP, path_dropped_individual_01_12__UDPLag, path_dropped_individual_03_11__LDAP, path_dropped_individual_03_11__MSSQL, path_dropped_individual_03_11__NetBIOS, path_dropped_individual_03_11__Portmap, path_dropped_individual_03_11__Syn, path_dropped_individual_03_11__UDP, path_dropped_individual_03_11__UDPLag]


### Function for Random Selection Per File & Save as new CSV

In [None]:
def randomSelectionAndSave(path, newDir, sampleSize, name):
    
    if path == path_TFTP:
        df = readALargeCSVFileAndGetResultAsDF(path)
    else:
        df = pd.read_csv(path, low_memory=False)
        
    df.dropna(inplace=True)
    # number of records in file (excludes header)
    totalFileRowCount = sum(1 for line in open(path)) - 1
    # the 0-indexed header will not be included in the skip list
    skip = sorted(random.sample(range(1, totalFileRowCount+1), totalFileRowCount-sampleSize))
    df = pd.read_csv(path, skiprows=skip, low_memory=False)
    
    # df = df.sample(n=sampleSize)
    
    df.to_csv(newDir + name)

### 10k per file

In [None]:
sampleSize = 10000

# intersecting ones
newDirIntersecting = '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/'

for path in pathsForIntersectingDroppedFiles:

    # make the new name
    name = path.split('/')
    name = (name[len(name)-2] + '__' + name[len(name)-1]).replace(" ", "")

    randomSelectionAndSave(path, newDirIntersecting, sampleSize, name)

# individual ones
newDirIndividual = '../FinalSmallDatasets/Individual Elimination/sets-of-10k/'

for path in pathsForIndividualDroppedFiles:

    # make the new name
    name = path.split('/')
    name = (name[len(name)-2] + '__' + name[len(name)-1]).replace(" ", "")

    randomSelectionAndSave(path, newDirIndividual, sampleSize, name)


In [None]:
sampleSize = 10000

newDirIntersecting = '../FinalSmallDatasets/Individual Elimination/sets-of-10k/'
path = path_dropped_individual_01_12__TFTP
name = path.split('/')
name = (name[len(name)-2] + '__' + name[len(name)-1]).replace(" ", "")

name

df = readALargeCSVFileAndGetResultAsDF(path)
df
# randomSelectionAndSave(path, newDirIntersecting, sampleSize, name)

In [None]:
df.isna().sum()

In [None]:
df.isna().sum().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
sampleSize = 10000
df = df.sample(n=sampleSize)

In [None]:
newDirIndividual = '../FinalSmallDatasets/Individual Elimination/sets-of-10k/'
path = path_dropped_individual_01_12__TFTP
name = path.split('/')
name = (name[len(name)-2] + '__' + name[len(name)-1]).replace(" ", "")

df.to_csv(newDirIndividual + name)


In [None]:
df1 = pd.read_csv('../FinalSmallDatasets/Individual Elimination/sets-of-10k/IndividualElimination__03-11__LDAP.csv', low_memory=False)

df.isna().sum().sum()

df.shape

In [None]:
sampleSize = 10000

newDirIndividual = '../FinalSmallDatasets/Individual Elimination/sets-of-10k/'
path = path_dropped_individual_01_12__TFTP
name = path.split('/')
name = (name[len(name)-2] + '__' + name[len(name)-1]).replace(" ", "")

randomSelectionAndSave(path, newDirIndividual, sampleSize, name)


### Function for Random Selection Considering All Files & Save as new CSV

In [2]:
def randomSelectionOverAllFileAndSave(pathArray, newDir, sampleSize, name):
    
    perSampleSize = int(float(sampleSize / len(pathArray)))
    
    counter = 0
    randomIndex = random.randint(0, len(pathArray) - 1)
    
    bigDf = pd.DataFrame()

    for path in pathArray:
        
        if counter == randomIndex:
            perSampleSize = abs(sampleSize - (len(pathArray) * perSampleSize))
        
        
        df = pd.read_csv(path, low_memory=False)
            
        # # number of records in file (excludes header)
        # totalFileRowCount = sum(1 for line in open(path)) - 1
        # # the 0-indexed header will not be included in the skip list
        # skip = sorted(random.sample(range(1, totalFileRowCount+1),
        #                 totalFileRowCount-perSampleSize))
        # df = pd.read_csv(path, skiprows=skip, low_memory=False)

        df = df.sample(n=perSampleSize)

        bigDf = bigDf.append(df)
        
        counter = counter + 1
    
    bigDf.to_csv(newDir + name)

### Overall 10k file [based on intersection]

In [3]:
newDirForRandomSelectionOverAll = '../FinalSmallDatasets/Intersecting Feature Elimination/overall-10k/'
nameForRandomSelectionOverAll = 'data10k.csv'
sampleSizeForRandomSelectionOverAll = 10000

pathsForIntersectingDroppedAndReducedFiles = [
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__DrDoS_DNS.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__DrDoS_LDAP.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__DrDoS_MSSQL.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__DrDoS_NetBIOS.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__DrDoS_NTP.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__DrDoS_SNMP.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__DrDoS_SSDP.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__DrDoS_UDP.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__Syn.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__TFTP.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__01-12__UDPLag.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__03-11__LDAP.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__03-11__MSSQL.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__03-11__NetBIOS.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__03-11__Portmap.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__03-11__Syn.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__03-11__UDP.csv',
    '../FinalSmallDatasets/Intersecting Feature Elimination/sets-of-10k/IntersectingFeatureElimination__03-11__UDPLag.csv'
]

randomSelectionOverAllFileAndSave(pathsForIntersectingDroppedAndReducedFiles, newDirForRandomSelectionOverAll,
                                  sampleSizeForRandomSelectionOverAll, nameForRandomSelectionOverAll)


  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)
  bigDf = bigDf.append(df)


## 
---
---
---