In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from IPython.display import display
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Lift pandas' display truncation for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Data Loading

In [None]:
# Column name -> dtype mapping handed to pd.read_csv below.
# Loading with predefined (narrow) dtypes helps reduce RAM utilization.
# The keys also double as the list of columns to read (see `usecols` in the load cell).
dtypes = {
    # flow identity
    'Src IP': 'category',
    'Src Port': 'uint16',
    'Dst IP': 'category',
    'Dst Port': 'uint16',
    'Protocol': 'category',
    # duration and packet counts
    'Flow Duration': 'uint32',
    'Tot Fwd Pkts': 'uint32',
    'Tot Bwd Pkts': 'uint32',
    # packet length statistics per direction
    'TotLen Fwd Pkts': 'float32',
    'TotLen Bwd Pkts': 'float32',
    'Fwd Pkt Len Max': 'float32',
    'Fwd Pkt Len Min': 'float32',
    'Fwd Pkt Len Mean': 'float32',
    'Fwd Pkt Len Std': 'float32',
    'Bwd Pkt Len Max': 'float32',
    'Bwd Pkt Len Min': 'float32',
    'Bwd Pkt Len Mean': 'float32',
    'Bwd Pkt Len Std': 'float32',
    # flow rates (these two are later scrubbed for infinities)
    'Flow Byts/s': 'float32',
    'Flow Pkts/s': 'float32',
    # inter-arrival time (IAT) statistics
    'Flow IAT Mean': 'float32',
    'Flow IAT Std': 'float32',
    'Flow IAT Max': 'float32',
    'Flow IAT Min': 'float32',
    'Fwd IAT Tot': 'float32',
    'Fwd IAT Mean': 'float32',
    'Fwd IAT Std': 'float32',
    'Fwd IAT Max': 'float32',
    'Fwd IAT Min': 'float32',
    'Bwd IAT Tot': 'float32',
    'Bwd IAT Mean': 'float32',
    'Bwd IAT Std': 'float32',
    'Bwd IAT Max': 'float32',
    'Bwd IAT Min': 'float32',
    # per-direction flag indicators, loaded as categoricals
    'Fwd PSH Flags': 'category',
    'Bwd PSH Flags': 'category',
    'Fwd URG Flags': 'category',
    'Bwd URG Flags': 'category',
    'Fwd Header Len': 'uint32',
    'Bwd Header Len': 'uint32',
    'Fwd Pkts/s': 'float32',
    'Bwd Pkts/s': 'float32',
    'Pkt Len Min': 'float32',
    'Pkt Len Max': 'float32',
    'Pkt Len Mean': 'float32',
    'Pkt Len Std': 'float32',
    'Pkt Len Var': 'float32',
    # flag counters, loaded as categoricals
    'FIN Flag Cnt': 'category',
    'SYN Flag Cnt': 'category',
    'RST Flag Cnt': 'category',
    'PSH Flag Cnt': 'category',
    'ACK Flag Cnt': 'category',
    'URG Flag Cnt': 'category',
    'CWE Flag Count': 'category',
    'ECE Flag Cnt': 'category',
    'Down/Up Ratio': 'float32',
    'Pkt Size Avg': 'float32',
    'Fwd Seg Size Avg': 'float32',
    'Bwd Seg Size Avg': 'float32',
    # bulk averages (dropped later in the notebook as single-valued columns)
    'Fwd Byts/b Avg': 'uint32',
    'Fwd Pkts/b Avg': 'uint32',
    'Fwd Blk Rate Avg': 'uint32',
    'Bwd Byts/b Avg': 'uint32',
    'Bwd Pkts/b Avg': 'uint32',
    'Bwd Blk Rate Avg': 'uint32',
    'Subflow Fwd Pkts': 'uint32',
    'Subflow Fwd Byts': 'uint32',
    'Subflow Bwd Pkts': 'uint32',
    'Subflow Bwd Byts': 'uint32',
    # NOTE(review): unsigned dtype assumes these never hold negative sentinels — confirm against the CSV
    'Init Fwd Win Byts': 'uint32',
    'Init Bwd Win Byts': 'uint32',
    'Fwd Act Data Pkts': 'uint32',
    'Fwd Seg Size Min': 'uint32',
    # active / idle period statistics
    'Active Mean': 'float32',
    'Active Std': 'float32',
    'Active Max': 'float32',
    'Active Min': 'float32',
    'Idle Mean': 'float32',
    'Idle Std': 'float32',
    'Idle Max': 'float32',
    'Idle Min': 'float32',
    # target variable
    'Label': 'category'
}

In [None]:
# Read only the typed columns plus the timestamp, then release the dtype map.
csv_path = '/kaggle/input/ddos-datasets/ddos_balanced/final_dataset.csv'
wanted_cols = list(dtypes.keys()) + ['Timestamp']
df = pd.read_csv(
    csv_path,
    usecols=wanted_cols,
    dtype=dtypes,
    parse_dates=['Timestamp'],
    engine='c',
    low_memory=True,
)
del dtypes
gc.collect()

In [None]:
# (rows, columns) of the freshly loaded frame
df.shape

In [None]:
# Summary statistics for every column (numeric and non-numeric alike)
df.describe(include='all')

## Data preparation

In [None]:
def print_mem_usage(df):
    """Print the memory footprint of ``df`` in MB.

    The figure is the sum of ``df.memory_usage()`` (called with its default
    arguments) divided by 1024**2, formatted to two decimals.
    """
    total_bytes = df.memory_usage().sum()
    print(f'Memory usage of dataframe is {total_bytes / 1024**2:.2f} MB')

In [None]:
# Baseline memory footprint before any cleaning
print_mem_usage(df)

### Handling missing values
We compute the percentage of missing values for each column. If a column has 50% or more missing values, we drop the entire column. If a column has some missing values but no more than 5%, we drop only the rows where that column's value is missing.

In [None]:
# Accumulators filled by the cleaning steps below:
#   colsToDrop - columns to remove entirely
#   dropnaCols - columns whose missing values cause the row to be removed
colsToDrop, dropnaCols = np.array([]), np.array([])

In [None]:
# Per-column count and percentage of missing values, shown transposed.
na_counts = df.isna().sum()
missing = pd.DataFrame(
    {'count': na_counts, '% of total': na_counts/len(df)*100},
    index=df.columns,
)
missing.T

We observe that only Flow Byts/s has about 0.2% missing values. We therefore drop the corresponding rows.

In [None]:
# >= 50% missing: drop the column; (0%, 5%] missing: drop the affected rows.
pct_missing = missing['% of total']
mostly_missing = pct_missing >= 50
slightly_missing = (pct_missing > 0) & (pct_missing <= 5)
colsToDrop = np.union1d(colsToDrop, missing[mostly_missing].index.values)
dropnaCols = missing[slightly_missing].index.values

### Handling incorrect/corrupt data

From the data statistics computed earlier, we can see that some columns have only one value. Such columns will not provide any significant information for our classification task. We will therefore drop these columns.

In [None]:
# Columns identified from the statistics above as holding a single value.
constant_cols = [
    'Fwd Byts/b Avg',
    'Fwd Pkts/b Avg',
    'Fwd Blk Rate Avg',
    'Bwd Byts/b Avg',
    'Bwd Pkts/b Avg',
    'Bwd Blk Rate Avg',
]
colsToDrop = np.union1d(colsToDrop, constant_cols)
gc.collect()

Let us now see some statistics for the categorical variables

In [None]:
# Count unique values and measure how dominant the top category is.
def rowbuilder(col):
    """Build one summary row for categorical column ``col`` of the global ``df``."""
    shares = df[col].value_counts(normalize=True)
    return {
        'col': col,
        'unique_values': df[col].nunique(),
        'most_frequent_value': shares.index[0],
        'frequency': shares.values[0],
    }

categorical_cols = df.select_dtypes(include=['category']).columns
frequency = [rowbuilder(col) for col in categorical_cols]
stats = pd.DataFrame(frequency).sort_values(by='frequency', ascending=False)
stats

We can see that some categorical variables are heavily dominated by a single category. Such variables are of little use for a classification task. We will therefore drop the columns where the most frequent category accounts for 95% or more of the rows.

In [None]:
# Mark categorical columns whose top category covers >= 95% of rows for removal,
# then release the temporaries used to build the statistics.
skewed = stats.loc[stats['frequency'] >= 0.95]
colsToDrop = np.union1d(colsToDrop, skewed['col'].values)
del skewed, rowbuilder, frequency
gc.collect()

We also observe that some columns have infinity values. ML algorithms cannot work with infinity values. There are two ways to handle this. First, replace the infinity values with very large finite numbers. Second, drop the rows that contain infinity values. In our case, only ~2% of the data contains infinity values, so we will adopt the second strategy.

In [None]:
# Turn infinities in the two rate columns into NaN so the dropna step removes those rows.
# Assigning the result back (instead of `df[col].replace(..., inplace=True)`) guarantees
# the change reaches `df`: an in-place call on a selected column can act on a temporary
# object under pandas Copy-on-Write. Negative infinity is covered as well.
infCols = ['Flow Byts/s', 'Flow Pkts/s']
df[infCols] = df[infCols].replace([np.inf, -np.inf], np.nan)
dropnaCols = np.union1d(dropnaCols, infCols)

In [None]:
# Columns that will be removed entirely
colsToDrop

In [None]:
# Columns whose NaN values cause the row to be removed
dropnaCols

In [None]:
# Apply the accumulated decisions: remove the flagged columns first,
# then the rows with NaN in any of the `dropnaCols` columns.
df = df.drop(columns=colsToDrop).dropna(subset=dropnaCols)
gc.collect()

We also observe from the data statistics that some columns have negative values. Based on our understanding of the variables, negative values indicate incorrect/faulty data. We will therefore filter out all rows that contain a negative value in any of these columns.

In [None]:
# Keep only rows where every listed column is non-negative (NaN also fails the test).
negValCols = [
    'Flow Pkts/s',
    'Flow IAT Mean', 'Flow IAT Max', 'Flow IAT Min',
    'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Max', 'Bwd IAT Min',
]
all_non_negative = (df[negValCols] >= 0).all(axis=1)
df = df[all_non_negative]

In [None]:
# Memory footprint after dropping columns and rows
print_mem_usage(df)

## Train-test split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# Single stratified 80/20 shuffle split on the label; only the training part is
# materialised and written to train.csv.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(df, df["Label"]))
traindf = df.iloc[train_index]
traindf.to_csv('train.csv', index=False)
gc.collect();

In [None]:
# Optional shortcut: reload the training split written above instead of recomputing it.
# NOTE: `dtypes` is deleted right after the initial load, so the dtype mapping
# must be re-created before this cell can be uncommented.
# traindf = pd.read_csv(
#     'train.csv',
#     dtype=dtypes,
#     parse_dates=['Timestamp'],
#     engine='c',
#     low_memory=True
# )

## Data Visualization

In [None]:
# Target distribution: left axis in percent, twin right axis scaled to millions of rows.
labelCount = traindf['Label'].value_counts(normalize=True)*100
ax = sns.barplot(x=labelCount.index, y=labelCount.values)
ax1 = ax.twinx()
ax.set(ylabel='Frequency [%]', ylim=(0, 100))
ax1.set(ylabel="Count (in millions)", ylim=(0, len(traindf)/10**6))
plt.title('Target Variable')

Our target variable is very balanced. Dataset contains almost equal instances of ddos and benign network activity.

In [None]:
# Row counts per (Protocol, Label) pair, reshaped to long form for a grouped bar plot.
cnt = (
    pd.crosstab(traindf['Protocol'], traindf['Label'])
    .stack()
    .reset_index(name='Count')
)
sns.barplot(data=cnt, x='Protocol', y='Count', hue='Label')

In [None]:
def getNetworkClass(col):
    """Map dotted IPv4 strings to their classful network letter.

    The first octet (text before the first '.') is parsed as uint8 and binned:
    0-127 -> 'A', 128-191 -> 'B', 192-223 -> 'C', 224-239 -> 'D', 240+ -> 'E'.

    Parameters
    ----------
    col : pandas Series of IP address strings.

    Returns
    -------
    pandas Series of categorical class labels aligned with ``col``.
    """
    first_octet = col.str.split('.', n=1, expand=True)[0].astype('uint8')
    class_edges = [0, 127, 191, 223, 239, np.inf]
    class_names = ['A', 'B', 'C', 'D', 'E']
    # include_lowest=True keeps a first octet of 0 inside class A
    return pd.cut(first_octet, bins=class_edges, labels=class_names, include_lowest=True)

In [None]:
# Network class of the source and destination address of every training row.
srcNetworkClass, dstNetworkClass = (
    getNetworkClass(traindf[ipCol]) for ipCol in ('Src IP', 'Dst IP')
)

In [None]:
# Row counts per (source network class, Label) pair as a grouped bar plot.
cnt = (
    pd.crosstab(srcNetworkClass, traindf['Label'], rownames=['Class'])
    .stack()
    .reset_index(name='Count')
)
sns.barplot(data=cnt, x='Class', y='Count', hue='Label')

We can see that the source IPs of ddos attacks belong primarily to Class A and Class B networks.

In [None]:
# Row counts per (destination network class, Label) pair as a grouped bar plot.
cnt = (
    pd.crosstab(dstNetworkClass, traindf['Label'], rownames=['Class'])
    .stack()
    .reset_index(name='Count')
)
sns.barplot(data=cnt, x='Class', y='Count', hue='Label')

In [None]:
# The class series were only needed for the two plots above.
del srcNetworkClass, dstNetworkClass
gc.collect()

In [None]:
# Numeric columns, split by the direction tag that appears in their name.
num_cols = traindf.select_dtypes(exclude=['category', 'datetime64[ns]']).columns
fwd_cols = [name for name in num_cols if 'Fwd' in name]
bwd_cols = [name for name in num_cols if 'Bwd' in name]

In [None]:
# Pairwise correlation among the forward-direction numeric columns
corr = traindf[fwd_cols].corr()

In [None]:
# Hide the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy releases no longer provide.
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.subplots(figsize=(10,10))
sns.heatmap(corr, mask=mask)

In [None]:
def getCorrelatedFeatures(corr):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is flagged when its absolute correlation with any column positioned
    before it in ``corr`` is strictly greater than 0.8, so the result depends on
    the column order of ``corr``.

    Parameters
    ----------
    corr : square correlation DataFrame (as produced by ``DataFrame.corr``).

    Returns
    -------
    set of flagged column labels.
    """
    strong = np.abs(corr.to_numpy()) > 0.8
    # Only entries strictly below the diagonal pair a column with an earlier one.
    strong_vs_earlier = np.tril(strong, k=-1)
    flagged = strong_vs_earlier.any(axis=1)
    return set(corr.columns[flagged])

In [None]:
# Start the running set of redundant features with the forward-direction findings.
correlatedFeatures = set(getCorrelatedFeatures(corr))

In [None]:
# Pairwise correlation among the backward-direction numeric columns
corr = traindf[bwd_cols].corr()

In [None]:
# Hide the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy releases no longer provide.
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.subplots(figsize=(10,10))
sns.heatmap(corr, mask=mask)

In [None]:
# Add the backward-direction findings to the running set and display it.
correlatedFeatures = correlatedFeatures.union(getCorrelatedFeatures(corr))
correlatedFeatures

So our intuition was correct: there is high correlation in the data. Let's drop these columns.

In [None]:
# Rebind instead of dropping in place: `traindf` was taken from `df` via `.iloc`,
# and an in-place drop on such a frame triggers pandas' SettingWithCopyWarning.
traindf = traindf.drop(columns=list(correlatedFeatures))

In [None]:
# Reclaim memory released by the column drop
gc.collect()

Now let's check the correlation between forward- and backward-direction predictors.

In [None]:
# Keep the frame's own column order. Iterating a `set` of names yields a
# hash-dependent order, and getCorrelatedFeatures flags whichever column of a
# correlated pair comes later, so a set made the dropped features vary between runs.
num_cols = traindf.select_dtypes(exclude=['category', 'datetime64[ns]']).columns
cols = [col for col in num_cols if 'Fwd' in col or 'Bwd' in col]

In [None]:
# Pairwise correlation across the remaining forward and backward columns
corr = traindf[cols].corr()

In [None]:
# Hide the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy releases no longer provide.
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.subplots(figsize=(10,10))
sns.heatmap(corr, mask=mask)

In [None]:
# Compute the cross-direction findings once, record them in the running set, and
# drop them. Rebinding avoids an in-place drop on a frame derived from `df`
# (SettingWithCopyWarning).
crossCorrelated = getCorrelatedFeatures(corr)
correlatedFeatures = correlatedFeatures | crossCorrelated
traindf = traindf.drop(columns=list(crossCorrelated))

In [None]:
# (rows, columns) left after removing correlated features
traindf.shape

In [None]:
# Summary statistics of the remaining numeric columns
traindf.describe()

In [None]:
# Skewness of every remaining numeric column, most skewed first.
num_cols = traindf.select_dtypes(exclude=['category', 'datetime64[ns]']).columns
skew = traindf.loc[:, num_cols].skew().sort_values(ascending=False)

In [None]:
# Display the per-column skewness computed above
skew

A skew value greater than 1 indicates very high skew in the data. We will need to perform a log transformation to lower the skew.

## Data Pipeline
### Pipeline for numerical columns

In [None]:
# Discard the exploration copy of the training data; the next cell re-creates
# `traindf` from `df` so the pipeline sees every original column.
del traindf
gc.collect();

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# Same stratified 80/20 split as before (identical seed), rebuilt from `df`.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(df, df["Label"]))
traindf = df.iloc[train_index]
gc.collect();

In [None]:
def logTransformation(X):
    """Return the base-10 logarithm of ``X``, treating zeros as ones.

    Each zero cell is replaced by 1 (so it maps to log10(1) = 0) before the
    logarithm is taken. Only the zero cells are touched: the previous
    ``X.loc[X[col] == 0] = 1`` overwrote every column of any row containing a
    zero. The input frame is left unmodified.

    Parameters
    ----------
    X : pandas DataFrame of numeric columns.

    Returns
    -------
    pandas DataFrame of the same shape holding log10 values.
    """
    zero_free = X.mask(X == 0, 1)
    return np.log10(zero_free)

In [None]:
def dropCorrelatedFeatures(X):
    """Return ``X`` without the columns named in the global ``correlatedFeatures``."""
    reduced = X.drop(columns=correlatedFeatures)
    return reduced

In [None]:
# Numeric branch: remove redundant columns first, then log-scale what is left.
num_steps = [
    ('dropCorrelatedFeatures', FunctionTransformer(dropCorrelatedFeatures)),
    ('logTransformation', FunctionTransformer(logTransformation)),
]
num_pipeline = Pipeline(steps=num_steps)

In [None]:
# Preview the numeric branch on its own. `fit_transform` is used because the
# pipeline has not been fitted at this point; calling `transform` on an unfitted
# Pipeline is flagged by recent scikit-learn releases.
num_cols = list(traindf.columns[(traindf.dtypes != 'category') &  (traindf.dtypes != 'datetime64[ns]')])
X = num_pipeline.fit_transform(traindf[num_cols])
X.head()

### Pipeline for categorical data

In [None]:
def addNetworkClasses(X):
    """Replace the raw IP columns with their network-class letters.

    Adds ``SrcIPClass`` and ``DstIPClass`` (computed by ``getNetworkClass``) and
    removes ``Src IP`` / ``Dst IP``. The result is a new frame; ``X`` itself is
    not modified, which matters because callers may pass a column slice of a
    larger frame.

    Parameters
    ----------
    X : pandas DataFrame containing 'Src IP' and 'Dst IP' columns.

    Returns
    -------
    pandas DataFrame with the two class columns appended and the IP columns removed.
    """
    with_classes = X.assign(
        SrcIPClass=getNetworkClass(X['Src IP']),
        DstIPClass=getNetworkClass(X['Dst IP']),
    )
    return with_classes.drop(columns=['Src IP', 'Dst IP'])

In [None]:
# Categorical branch: derive the IP class columns, then ordinal-encode everything.
cat_steps = [
    ('AddNewCols', FunctionTransformer(addNetworkClasses)),
    ('OrdinalEncoding', OrdinalEncoder()),
]
cat_pipeline = Pipeline(steps=cat_steps)

### Full Pipeline

In [None]:
# Partition the columns by dtype: categoricals go to the categorical branch,
# everything that is neither categorical nor a timestamp goes to the numeric branch.
# NOTE(review): 'Label' is loaded as a category, so it appears to end up in `cat_cols`
# and therefore in the transformed matrix — confirm whether the target should be excluded.
col_dtypes = traindf.dtypes
is_categorical = col_dtypes == 'category'
is_datetime = col_dtypes == 'datetime64[ns]'
num_cols = list(traindf.columns[~is_categorical & ~is_datetime])
cat_cols = list(traindf.columns[is_categorical])

In [None]:
# Route each column group through its own branch and concatenate the outputs.
branches = [
    ('numColTransformer', num_pipeline, num_cols),
    ('catColTransformer', cat_pipeline, cat_cols),
]
full_pipeline = ColumnTransformer(transformers=branches)

In [None]:
# Fit the combined transformer on the training frame and keep the transformed output
X = full_pipeline.fit_transform(traindf)