In [None]:
import pandas as pd
import numpy as np
import boto3
import pickle

In [None]:
# S3 bucket and object key of the pickled input DataFrame.
# NOTE: both names are assigned again in the save section at the end of the notebook,
# where `file_name` is rebound to the output key.
MY_BUCKET = 'sagemaker-studio-8x6b1t9vueh'
file_name = 'df_base.pkl'

In [None]:
# boto3 S3 resource handle used to address objects in the bucket.
s3 = boto3.resource('s3')

In [None]:
# Download the object at MY_BUCKET/file_name and unpickle its bytes into `df`.
# NOTE(review): pickle.loads can execute arbitrary code contained in the payload;
# only load objects from a bucket whose contents you trust.
pickled_df = s3.Object(MY_BUCKET, file_name)
df = pickle.loads(pickled_df.get()['Body'].read())

Certain columns (ports, IP addresses, the timestamp and the flow ID) are too specific to individual flows to be useful for modeling, so we drop them.

In [4]:
# Flow-identifying columns that are removed before modeling.
columns_to_be_dropped = [
    'Dst Port',
    'Timestamp',
    'Src IP',
    'Src Port',
    'Dst IP',
    'Flow ID',
]

In [5]:
# Remove the identifier columns from `df` in place.
# NOTE: not idempotent - re-running this cell after the columns are gone raises KeyError
# (pandas' default errors='raise'), so reload `df` before running it again.
df.drop(columns=columns_to_be_dropped, inplace=True)

In [8]:
# Display settings for rendered frames: at most 10 rows, no limit on columns.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", None)

## Checking for outliers...

In [6]:
print('Printing cols with nan presence...')

def check_nulls(frame=None):
    """Print every column that contains missing values, with its NaN count.

    Args:
        frame: DataFrame to inspect. Defaults to the notebook-level ``df``
            (looked up at call time), so existing ``check_nulls()`` calls
            keep working unchanged.

    Returns:
        None. One ``<column> -> <count>`` line is printed per affected
        column; nothing is printed when no column has missing values.
    """
    if frame is None:
        frame = df
    for col in frame.columns:
        # Counted one column at a time so only a single boolean mask is alive at once.
        null_count = frame[col].isnull().sum()
        if null_count > 0:
            print(f'{col} -> {null_count}')

check_nulls()

Printing cols with nan presence...
Flow Byts/s -> 59721


In [9]:
# Rows whose 'Flow Byts/s' is missing, kept in their own frame for inspection.
entries_with_NAN = df[df['Flow Byts/s'].isna()]

nan_labels = entries_with_NAN["Label"].unique()
nan_pkts_per_s = entries_with_NAN["Flow Pkts/s"].unique()
print(f'Labels affected {nan_labels}')
print(f'All NaN entries have corresponding unique values in "Flow Pkts/s" {nan_pkts_per_s}')
entries_with_NAN

Labels affected ['Benign' 'Infilteration' 'FTP-BruteForce']
All NaN entries have corresponding unique values in "Flow Pkts/s" [inf]


Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
108,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,176.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
111,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,180.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
119,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,405.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
194,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,31.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
204,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,126.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16224829,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
16225597,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
16225679,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
16226223,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


### Every row with NaN in "Flow Byts/s" has inf in "Flow Pkts/s"!

In [10]:
print('Check for positive and negative inf!')

def check_inf(frame=None):
    """Print every numeric column that contains +inf or -inf, with its count.

    Args:
        frame: DataFrame to inspect. Defaults to the notebook-level ``df``
            (looked up at call time), so existing ``check_inf()`` calls
            keep working unchanged.

    Returns:
        None. One ``<column> -> <count>`` line is printed per affected
        column; nothing is printed when no column holds an infinite value.
    """
    if frame is None:
        frame = df
    for col in frame.columns:
        column = frame[col]
        # Infinity is only meaningful for numeric data; skipping by "is numeric"
        # (rather than "is not object") also avoids errors on string/category/datetime columns.
        if not pd.api.types.is_numeric_dtype(column):
            continue
        # np.isinf is True for both +inf and -inf and False for NaN.
        inf_count = np.isinf(column).sum()
        if inf_count > 0:
            print(f'{col} -> {inf_count}')

check_inf()

Check for positive and negative inf!
Flow Byts/s -> 36039
Flow Pkts/s -> 95760


In [11]:
# Rows where either rate column is +inf or -inf (np.isinf is False for NaN).
inf_mask = np.isinf(df['Flow Byts/s']) | np.isinf(df['Flow Pkts/s'])
entries_with_inf = df.loc[inf_mask]
print(f'Labels affected {entries_with_inf["Label"].unique()}')
print(f'All Inf entries have corresponding unique values in "Flow Byts/s" {entries_with_inf["Flow Byts/s"].unique()}')
print(f'Unique Flow durations during the records with inf {entries_with_inf["Flow Duration"].unique()}')
# Report 'Flow IAT Mean' here so the value matches the "Flow Means" message;
# 'Flow Duration' is already printed on the line above.
print(f'Unique Flow Means during the records with inf {entries_with_inf["Flow IAT Mean"].unique()}')
entries_with_inf

Labels affected ['Benign' 'Infilteration' 'FTP-BruteForce']
All Inf entries have corresponding unique values in "Flow Byts/s" [nan inf]
Unique Flow durations during the records with inf [0.]
Unique Flow Means during the records with inf [0.]


Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
108,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,176.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
111,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,180.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
119,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,405.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
148,6,0.0,2.0,0.0,31.0,0.0,31.0,0.0,15.5,21.920311,0.0,0.0,0.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,40.0,0.0,0.0,0.0,0.0,31.0,20.666666,17.897858,320.333344,0,1,0,0,1,0,0,0,0.0,31.0,15.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,31.0,0.0,0.0,946.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
194,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,31.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16224829,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
16225597,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
16225679,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
16226223,6,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [15]:
print('checking for negatives outliers...')

def check_for_negatives(frame=None):
    """Report numeric columns that contain negative values.

    For every affected column, prints the number of negative entries and the
    distinct negative values found.

    Args:
        frame: DataFrame to inspect. Defaults to the notebook-level ``df``
            (looked up at call time), so existing ``check_for_negatives()``
            calls keep working unchanged.

    Returns:
        list: names of the columns holding at least one negative value, in
        column order.
    """
    if frame is None:
        frame = df
    cols = []
    for col in frame.columns:
        column = frame[col]
        # Only numeric data can be compared with 0; this also skips
        # string/category/datetime columns that a "not object" test would let through.
        if not pd.api.types.is_numeric_dtype(column):
            continue
        # Build the mask once and reuse it for both the count and the value lookup.
        negative_mask = column < 0
        val = negative_mask.sum()
        if val > 0:
            unique_values = column[negative_mask].unique()
            print(f'{col} -> {val} with unique values -> ', unique_values)
            cols.append(col)

    return cols


cols = check_for_negatives()

checking for negatives outliers...
Flow Duration -> 14 with unique values ->  [-1.88505e+11 -7.48770e+10 -4.83400e+09 -8.28220e+11 -5.78768e+11
 -6.99056e+11 -5.53312e+11 -1.10116e+11 -6.42052e+11 -1.18730e+10
 -6.81402e+11 -9.19011e+11 -2.73850e+11 -5.29801e+11]
Flow Pkts/s -> 14 with unique values ->  [-4.7744092e-05 -1.0684190e-04 -8.8953245e-03 -2.4148173e-06
 -8.2934785e-05 -5.4358999e-05 -6.1448154e-05 -7.1742531e-04
 -2.5387350e-04 -1.6844941e-04 -4.4026874e-06 -8.8138229e-05
 -1.0297608e-03 -9.5318811e-04]
Flow IAT Mean -> 14 with unique values ->  [-2.35631247e+10 -1.06967142e+10 -1.15095240e+08 -8.28219982e+11
 -1.23142124e+10 -1.88934062e+10 -1.67670303e+10 -1.41174362e+09
 -3.96328397e+09 -1.18730004e+10 -3.40700987e+11 -1.14876375e+10
 -9.74555136e+08 -1.05119245e+09]
Flow IAT Max -> 3 with unique values ->  [-8.28220e+11 -1.18730e+10 -1.59438e+11]
Flow IAT Min -> 15 with unique values ->  [-2.22800e+03 -4.49709e+11 -9.17310e+11 -9.35931e+11 -8.28220e+11
 -8.81562e+11 -6.1

## Correcting Outliers

We drop every record that has a negative value in any column other than "Init Fwd Win Byts" and "Init Bwd Win Byts".

In [16]:
# Mark every row that is negative in at least one affected column, then drop them all at once.
# A single drop avoids rebuilding the whole frame once per column; the rows removed are the
# same as when dropping column by column. The two window-size columns are exempt because
# they are corrected (not dropped) in a later cell.
negative_rows = np.zeros(len(df), dtype=bool)
for col in cols:
    if col not in ["Init Fwd Win Byts", "Init Bwd Win Byts"]:
        negative_rows |= (df[col] < 0).to_numpy()

df.drop(index=df.index[negative_rows], inplace=True)

In [17]:
# Re-run the negative-value scan on the filtered frame; the two window-size columns
# were exempted from the drop above, so they are expected to be listed here.
print(f'Remaning negative columns {check_for_negatives()}')

Init Fwd Win Byts -> 4432594 with unique values ->  [-1.]
Init Bwd Win Byts -> 8255535 with unique values ->  [-1.]
Remaning negative columns ['Init Fwd Win Byts', 'Init Bwd Win Byts']


Setting remaining columns "Init Fwd Win Byts" and "Init Bwd Win Byts" to zero from -1.

In [18]:
# Overwrite negative entries in the two window-size columns with 0.
for col in ["Init Fwd Win Byts", "Init Bwd Win Byts"]:
    # Boolean-mask assignment touches exactly the negative rows and avoids
    # materializing (and then looking up) millions of row labels.
    df.loc[df[col] < 0, col] = 0

In [19]:
# Final scan: an empty list means no scanned column holds a negative value any more.
print(f'Remaning negative columns {check_for_negatives()}')

Remaning negative columns []


### Now all negative values have been dealt with. We will now remove NaNs and inf.

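We will verify the formula $\text{Flow Pkts/s} = \dfrac{\text{Tot Fwd Pkts} + \text{Tot Bwd Pkts}}{\text{Flow Duration} \times 10^{-6}}$ (the factor $10^{-6}$ converts "Flow Duration" to seconds, i.e. it is assumed to be recorded in microseconds).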

In [20]:
# Check the packets-per-second formula on the first 100 rows.
# Rows are taken by position (head) rather than by label, because labels 0..99 are not
# guaranteed to exist after rows have been dropped. Zero-duration rows are skipped:
# the formula divides by the duration and is undefined for them.
pkts_sample = df.head(100)
pkts_sample = pkts_sample.loc[pkts_sample['Flow Duration'] > 0]
expected_pkts_per_s = (
    (pkts_sample['Tot Fwd Pkts'] + pkts_sample['Tot Bwd Pkts'])
    / (pkts_sample['Flow Duration'] * 1e-6)
)
assert np.allclose(pkts_sample['Flow Pkts/s'], expected_pkts_per_s)

We will verify the formula $\text{Flow Byts/s} = \dfrac{\text{TotLen Fwd Pkts} + \text{TotLen Bwd Pkts}}{\text{Flow Duration} \times 10^{-6}}$.

In [21]:
# Check the bytes-per-second formula on the first 100 rows.
# Rows are taken by position (head) rather than by label, because labels 0..99 are not
# guaranteed to exist after rows have been dropped. Zero-duration rows are skipped:
# the formula divides by the duration and is undefined for them.
byts_sample = df.head(100)
byts_sample = byts_sample.loc[byts_sample['Flow Duration'] > 0]
expected_byts_per_s = (
    (byts_sample['TotLen Fwd Pkts'] + byts_sample['TotLen Bwd Pkts'])
    / (byts_sample['Flow Duration'] * 1e-6)
)
assert np.allclose(byts_sample['Flow Byts/s'], expected_byts_per_s)

To resolve the inf and NaN values we need to fix the 'Flow Duration' column: every zero entry is replaced with the mean of the non-zero 'Flow Duration' values of the rows that share its 'Label'.

In [22]:
# One Index of row labels per processed label: the rows whose zero 'Flow Duration'
# gets replaced. The list is read again by the recalculation cell.
indices = []

for label in entries_with_inf["Label"].unique():
    durations = df.loc[df["Label"] == label, 'Flow Duration']
    # Mean over the non-zero durations only: the sum is unaffected by zeros,
    # and the divisor counts just the non-zero entries.
    nonzero_mean = durations.sum() / np.count_nonzero(durations)

    zero_rows = durations[durations == 0].index
    indices.append(zero_rows)

    df.loc[zero_rows, 'Flow Duration'] = nonzero_mean


Now we will recalculate 'Flow Pkts/s' and 'Flow Byts/s' for the rows whose 'Flow Duration' was replaced.

In [23]:
# Recompute both rate columns for the rows whose duration was just replaced.
for row_labels in indices:
    duration_s = df.loc[row_labels, 'Flow Duration'] * 1e-6
    total_packets = df.loc[row_labels, 'Tot Fwd Pkts'] + df.loc[row_labels, 'Tot Bwd Pkts']
    total_bytes = df.loc[row_labels, 'TotLen Fwd Pkts'] + df.loc[row_labels, 'TotLen Bwd Pkts']
    df.loc[row_labels, 'Flow Pkts/s'] = total_packets / duration_s
    df.loc[row_labels, 'Flow Byts/s'] = total_bytes / duration_s

### Checking for remaining NaNs and Infs

In [24]:
# Prints one line per column that still contains NaN; no output means none remain.
check_nulls()

In [25]:
# Prints one line per column that still contains +/-inf; no output means none remain.
check_inf()

### Dropping attack labels whose count is negligible compared to the others. Labels with fewer than 1000 instances are disqualified.

In [9]:
# Number of rows per label, most frequent first.
label_counts = df['Label'].value_counts()

# Labels with fewer than 1000 rows are the ones to remove.
is_rare = label_counts < 1000
labels_to_be_dropped = label_counts.loc[is_rare].index
labels_to_be_dropped

Index(['Brute Force -Web', 'Brute Force -XSS', 'SQL Injection'], dtype='object')

In [10]:
# Remove every row that belongs to a rare label.
# One isin() mask and one drop replace the per-label scans and per-label drops,
# each of which walked or rebuilt the entire frame.
rare_label_rows = df['Label'].isin(labels_to_be_dropped).to_numpy()
df.drop(index=df.index[rare_label_rows], inplace=True)

In [11]:
# Snapshot taken BEFORE the rare labels were dropped, so it still lists them;
# shown for comparison with the fresh counts in the next cell.
label_counts

Benign                      13484693
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193360
SSH-Bruteforce                187589
Infilteration                 161934
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
DoS attacks-Slowloris          10990
DDOS attack-LOIC-UDP            1730
Brute Force -Web                 611
Brute Force -XSS                 230
SQL Injection                     87
Name: Label, dtype: int64

In [12]:
# Fresh per-label counts computed from the current `df`, i.e. after the rare labels were dropped.
df['Label'].value_counts()

Benign                      13484693
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193360
SSH-Bruteforce                187589
Infilteration                 161934
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
DoS attacks-Slowloris          10990
DDOS attack-LOIC-UDP            1730
Name: Label, dtype: int64

## Saving the processed dataframe to S3 Bucket.

In [15]:
# Destination for the processed frame: same bucket as the input, new object key.
# NOTE: this rebinds `file_name`, which held the input key at the top of the notebook.
MY_BUCKET = 'sagemaker-studio-8x6b1t9vueh'
file_name = 'df_processed.pkl'

In [14]:
# Re-creates the S3 resource handle (the same call is made near the top of the notebook).
s3 = boto3.resource('s3')

In [17]:
# Pickle the processed frame and upload the bytes to the object MY_BUCKET/file_name.
# The response of put() is bound to `_` so the cell itself has no displayed result.
# NOTE(review): pickle.dumps(df) builds the whole serialized payload in memory before the upload.
my_preprocessed_df_ref = s3.Object(MY_BUCKET, file_name)
_ = my_preprocessed_df_ref.put(Body=pickle.dumps(df))

{'ResponseMetadata': {'RequestId': '7722N76XWFRYDC4X',
  'HostId': 'lUhXUgbRBrqfJvUEQo0DOcNfKSxy0F3Y15Ei1AbUAbWCEnAJg33jp0CIsLYGQ9NdLfvtvOZla+s=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'lUhXUgbRBrqfJvUEQo0DOcNfKSxy0F3Y15Ei1AbUAbWCEnAJg33jp0CIsLYGQ9NdLfvtvOZla+s=',
   'x-amz-request-id': '7722N76XWFRYDC4X',
   'date': 'Sat, 13 Nov 2021 06:32:59 GMT',
   'etag': '"2aeacb529007e51de8a31d4a58c07a8a"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"2aeacb529007e51de8a31d4a58c07a8a"'}