In [1]:
import time

import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, average_precision_score
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the raw flow records, asking pandas to parse "Timestamp" as datetimes.
date_columns = ["Timestamp"]
df = pd.read_csv(
    "ddos_dataset.csv",
    index_col=None,
    parse_dates=date_columns,
)
df.head()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

## Feature Extraction from `Flow ID` column

In [None]:
# 'Flow ID' is split on '-' and its pieces are stored in five new columns
# (the assignment requires the split to yield exactly five parts per row).
flow_id_parts = ['Source', 'Destination', 'Source Port', 'Dest Port', 'Other']
flow_id_split = df['Flow ID'].str.split('-', expand=True)
df[flow_id_parts] = flow_id_split
df.head()

In [None]:
# Order the rows chronologically (ascending by Timestamp).
df = df.sort_values(by="Timestamp")

In [None]:
# Timestamp, both port columns and the leftover 'Other' piece are removed.
# The port data is of uncertain quality, so it is dropped to be safe.
unused_columns = ["Timestamp", "Source Port", "Dest Port", "Other"]
df = df.drop(unused_columns, axis=1)
df.head()

## Handling IP Addresses before using an ML model

In [None]:
# Break each dotted IP address into four per-octet feature columns.
source_octets = ['SourceIP_1', 'SourceIP_2', 'SourceIP_3', 'SourceIP_4']
destination_octets = ['DestinationIP_1', 'DestinationIP_2', 'DestinationIP_3', 'DestinationIP_4']
df[source_octets] = df['Source'].str.split('.', expand=True)
df[destination_octets] = df['Destination'].str.split('.', expand=True)

# str.split yields strings; convert the octets to numbers explicitly instead of
# relying on each estimator to coerce object columns. Anything that is not a
# number becomes NaN so it is visible in the missing-value check.
octet_columns = source_octets + destination_octets
df[octet_columns] = df[octet_columns].apply(pd.to_numeric, errors='coerce')

# The raw address strings and the original Flow ID are no longer needed.
df = df.drop(columns=["Source", "Destination", "Flow ID"])
df.head()

## Checking for missing values

In [None]:
# Number of missing values in each column.
df.isna().sum()

## Check for class imbalance

In [None]:
# Bar chart of how many rows carry each label. The column is passed by keyword:
# newer seaborn releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='Label', data=df);

## Encoding the target variable

In [None]:
# Replace the values in 'Label' with the integer codes produced by LabelEncoder.
# `le` is kept so the codes can be mapped back to the original labels.
le = LabelEncoder()
le.fit(df['Label'])
df['Label'] = le.transform(df['Label'])
df.head()

## Performing Random Undersampling

Since we have over 200K data points for the minority class, random undersampling is performed to get a balanced dataset.

In [None]:
# Separate the target from the features, then resample with a seeded
# RandomUnderSampler so the result is repeatable.
y = df['Label']
X = df.drop(columns=['Label'])
RUS = RandomUnderSampler(random_state=42)
X_rus, y_rus = RUS.fit_resample(X, y)

## Splitting the dataset

In [None]:
# Hold out 30% of the resampled rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_rus,
    y_rus,
    test_size=0.3,
    random_state=42,
)

## Finding our Base Model

In [None]:
# Candidate classifiers to compare, keyed by display name. The estimators that
# accept a seed get the same random_state (42) used for resampling and
# splitting, so the comparison is repeatable across runs.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "AdaBoost": AdaBoostClassifier(random_state=42),
}

# 10-fold splitter (default settings, so folds are not shuffled).
cv = KFold(n_splits=10)

def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator providing ``fit``,
        ``predict_proba`` and ``score``. The estimators are fitted in place.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and labels used for scoring.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``model_scores``: name -> ``model.score(X_test, y_test)``.
        ``model_roc_auc_scores``: name -> ROC AUC computed from column 1 of
        ``predict_proba(X_test)``.
        ``model_time``: name -> CPU seconds spent inside ``fit``.
    """
    model_scores = {}
    model_roc_auc_scores = {}
    model_time = {}
    for name, model in models.items():
        # process_time counts CPU time of this process only, not wall-clock time.
        start = time.process_time()
        model.fit(X_train, y_train)
        model_time[name] = time.process_time() - start

        # A 10-fold cross_val_score used to run here, but its result was never
        # stored or returned; it only multiplied the runtime, so it was removed.
        test_probabilities = model.predict_proba(X_test)[:, 1]
        model_roc_auc_scores[name] = roc_auc_score(y_test, test_probabilities)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores, model_roc_auc_scores, model_time

In [None]:
# Fit and evaluate every candidate, then show the test-set scores.
comparison = fit_and_score(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
model_scores, model_roc_auc_scores, model_time = comparison
print("ACCURACIES : ")
model_scores

In [None]:
# Test-set ROC AUC per model, as returned by fit_and_score.
print("ROC AUC SCORES : ")
model_roc_auc_scores

In [None]:
# Per-model fit time measured with time.process_time (CPU seconds).
print("Time : ")
model_time

In [None]:
# One row per metric, one column per model. model_scores holds accuracies
# (not ROC AUC), so each metric is labelled with what it actually contains.
model_compare = pd.DataFrame(
    [model_scores, model_roc_auc_scores],
    index=['Accuracy', 'ROC AUC Score'],
)
# Transpose so each model becomes a group of bars on the x-axis.
model_compare.T.plot.bar();

Based on the comparison above, we select the Random Forest classifier as our model.

## Training a Random Forest Classifier

In [None]:
# Train the chosen model. The forest is seeded with the same value (42) used
# for resampling and splitting so the reported metrics are reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
# Score on the held-out test split.
clf.score(X_test, y_test)

## Evaluating the model

In [None]:
# ROC curve of the fitted classifier on the test split.
# RocCurveDisplay.from_estimator replaces the plot_roc_curve helper, which was
# never imported in this notebook and is no longer part of scikit-learn.
RocCurveDisplay.from_estimator(clf, X_test, y_test);

In [None]:
# Predicted labels for the test split, reused by later evaluation cells.
y_preds = clf.predict(X_test)
conf_mat = confusion_matrix(y_test, y_preds)
# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation for large counts.
sns.heatmap(conf_mat, annot=True, fmt='d');

In [None]:
# Text report comparing the test labels with the predictions in y_preds.
print(classification_report(y_test, y_preds))

In [None]:
# Average precision is computed from column 1 of the predicted probabilities.
test_probabilities = clf.predict_proba(X_test)[:, 1]
average_precision = average_precision_score(y_test, test_probabilities)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

In [None]:
# Precision-recall curve of the fitted classifier on the test split.
# PrecisionRecallDisplay.from_estimator replaces plot_precision_recall_curve,
# which was never imported in this notebook and is no longer part of scikit-learn.
disp = PrecisionRecallDisplay.from_estimator(clf, X_test, y_test)
# Put the previously computed average precision in the title.
disp.ax_.set_title('Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision));