<a href="https://colab.research.google.com/github/r-dube/CICIDS/blob/main/cicids_classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Load the top modules that are used in multiple places
import numpy as np
import pandas as pd

In [15]:
# Global variables that drive the notebook.
# data_url is the location of the data: the dataset is read from this
# remote URL rather than from a local file, and it is an already
# preprocessed version of the data (not the raw capture files).
data_url="https://raw.githubusercontent.com/r-dube/CICIDS/main/MachineLearningCVE/processed/bal-cicids2017.csv"

In [16]:
# Mapping from the column names of the original data to a compact form.
# The N-th column name listed below becomes 'X<N>' (a feature) and the
# label column becomes 'YY'. Leading spaces in the names are significant:
# they are part of the keys exactly as written here.
feature_map = {
    column: 'X{}'.format(position)
    for position, column in enumerate((
        ' Destination Port',
        ' Flow Duration',
        ' Total Fwd Packets',
        ' Total Backward Packets',
        'Total Length of Fwd Packets',
        ' Total Length of Bwd Packets',
        ' Fwd Packet Length Max',
        ' Fwd Packet Length Min',
        ' Fwd Packet Length Mean',
        ' Fwd Packet Length Std',
        'Bwd Packet Length Max',
        ' Bwd Packet Length Min',
        ' Bwd Packet Length Mean',
        ' Bwd Packet Length Std',
        'Flow Bytes/s',
        ' Flow Packets/s',
        ' Flow IAT Mean',
        ' Flow IAT Std',
        ' Flow IAT Max',
        ' Flow IAT Min',
        'Fwd IAT Total',
        ' Fwd IAT Mean',
        ' Fwd IAT Std',
        ' Fwd IAT Max',
        ' Fwd IAT Min',
        'Bwd IAT Total',
        ' Bwd IAT Mean',
        ' Bwd IAT Std',
        ' Bwd IAT Max',
        ' Bwd IAT Min',
        'Fwd PSH Flags',
        ' Bwd PSH Flags',
        ' Fwd URG Flags',
        ' Bwd URG Flags',
        ' Fwd Header Length',
        ' Bwd Header Length',
        'Fwd Packets/s',
        ' Bwd Packets/s',
        ' Min Packet Length',
        ' Max Packet Length',
        ' Packet Length Mean',
        ' Packet Length Std',
        ' Packet Length Variance',
        'FIN Flag Count',
        ' SYN Flag Count',
        ' RST Flag Count',
        ' PSH Flag Count',
        ' ACK Flag Count',
        ' URG Flag Count',
        ' CWE Flag Count',
        ' ECE Flag Count',
        ' Down/Up Ratio',
        ' Average Packet Size',
        ' Avg Fwd Segment Size',
        ' Avg Bwd Segment Size',
        ' Fwd Header Length.1',
        'Fwd Avg Bytes/Bulk',
        ' Fwd Avg Packets/Bulk',
        ' Fwd Avg Bulk Rate',
        ' Bwd Avg Bytes/Bulk',
        ' Bwd Avg Packets/Bulk',
        'Bwd Avg Bulk Rate',
        'Subflow Fwd Packets',
        ' Subflow Fwd Bytes',
        ' Subflow Bwd Packets',
        ' Subflow Bwd Bytes',
        'Init_Win_bytes_forward',
        ' Init_Win_bytes_backward',
        ' act_data_pkt_fwd',
        ' min_seg_size_forward',
        'Active Mean',
        ' Active Std',
        ' Active Max',
        ' Active Min',
        'Idle Mean',
        ' Idle Std',
        ' Idle Max',
        ' Idle Min',
    ), start=1)
}
# The label column is the last entry and maps to 'YY'
feature_map[' Label'] = 'YY'

# Label names (YY) in the data and their mapping to numerical class ids.
# The three "Web Attack" variants are folded into a single class (id 8),
# which is why 15 label names yield only 13 classes.
# NOTE(review): the '�' in the Web Attack keys is kept exactly as found;
# presumably it mirrors how the label text decodes when the raw csv is
# read - confirm against the preprocessing code before changing it.
label_map = {
 'BENIGN' : 0,
 'FTP-Patator' : 1,
 'SSH-Patator' : 2,
 'DoS slowloris' : 3,
 'DoS Slowhttptest': 4,
 'DoS Hulk' : 5,
 'DoS GoldenEye' : 6,
 'Heartbleed' : 7,
 'Web Attack � Brute Force' : 8,
 'Web Attack � XSS' : 8,
 'Web Attack � Sql Injection' : 8,
 'Infiltration' : 9,
 'Bot' : 10,
 'PortScan' : 11,
 'DDoS' : 12,
}

# Number of feature columns fed to the classifiers
num_ids_features = 76
# Number of distinct class ids produced by label_map (0..12)
num_ids_classes = 13
# Display name of each class, indexed by class id: ids_classes[i] names the
# class that label_map maps to i. The merged web attacks share one entry so
# that the list lines up with the numerical labels.
ids_classes = [
 'BENIGN',
 'FTP-Patator',
 'SSH-Patator',
 'DoS slowloris',
 'DoS Slowhttptest',
 'DoS Hulk',
 'DoS GoldenEye',
 'Heartbleed',
 'Web Attack',
 'Infiltration',
 'Bot',
 'PortScan',
 'DDoS',
]

In [17]:
# Utility functions used by classifiers
# In particular to load and split data and output results
def ids_load_df_from_csv(url=None):
    """
    Load the data set into a dataframe from a csv file
    Input:
        url: optional path or URL of the csv file; when None (the default)
             the global data_url is used
    Returns:
        Dataframe holding the contents of the csv file
    Print: shape of the loaded dataframe
    """

    # Fall back to the notebook-wide location when no source is given
    if url is None:
        url = data_url

    df = pd.read_csv(url)

    print ("load Dataframe shape", df.shape)

    return df

def ids_split(df, test_size=0.15, val_size=0.15, random_state=42):
    """
    Split a dataframe into standardized train, validation and test sets
    Input:
        df: Dataframe that has columns of covariates and a label column
            named 'YY'
        test_size: fraction of all rows held out as the test set
        val_size: fraction of the remaining (non-test) rows held out as
            the validation set
        random_state: seed used for both splits so they are reproducible
    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test as numpy arrays
    Print: shapes of the dataframe and of each split
    """

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    print("df.shape", df.shape)

    # Pick the covariates by name rather than by position: the label is
    # looked up as 'YY', so the features must be "everything except 'YY'"
    # or the label would leak into X whenever 'YY' is not the last column.
    X = df.drop(columns=['YY'])
    y = df.loc[:, 'YY']

    # First carve out the test set, then take the validation set out of
    # what is left
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=random_state)
    print ("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
    print ("X_val.shape", X_val.shape, "y_val.shape", y_val.shape)
    print ("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)

    # The scaler is fitted on the training data only and then applied to
    # the validation and test data, so no statistics of the held-out sets
    # influence training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    y_train = y_train.values
    y_val = y_val.values
    y_test = y_test.values

    return X_train, X_val, X_test, y_train, y_val, y_test

def ids_accuracy (y_actual, y_pred):
    """
    Score predictions both as a multiclass and as a two class problem
    Input:
        Numpy arrays with actual and predicted labels
    Returns:
        Tuple of (multiclass accuracy, multiclass macro-averaged f1,
        two class accuracy, two class f1)
    """

    from sklearn.metrics import accuracy_score, f1_score

    # Two class view: label 0 stays 0, every other label becomes 1
    is_attack_actual = (y_actual > 0).astype(int)
    is_attack_pred = (y_pred > 0).astype(int)

    multiclass_scores = (
        accuracy_score (y_actual, y_pred),
        f1_score(y_actual, y_pred, average='macro'),
    )
    two_class_scores = (
        accuracy_score (is_attack_actual, is_attack_pred),
        f1_score(is_attack_actual, is_attack_pred),
    )

    return multiclass_scores + two_class_scores
    

def ids_metrics(y_actual, y_pred):
    """
    Report how well the predicted labels match the actual labels
    Input:
        Numpy arrays with actual and predicted labels
    Returns:
        None
    Print: the confusion matrix, then accuracy and f1 score for the
        multiclass problem and for the two class problem
    """

    from sklearn.metrics import confusion_matrix

    print (confusion_matrix (y_actual, y_pred))

    multi_acc, multi_f1, binary_acc, binary_f1 = ids_accuracy (y_actual, y_pred)

    multiclass_report = (
        'Classifier accuracy : {:.4f}'.format(multi_acc),
        'F1 score: {:.4f}'.format(multi_f1),
    )
    two_class_report = (
        'Two class classifier accuracy : {:.4f}'.format(binary_acc),
        'F1 score: {:.4f}'.format(binary_f1),
    )
    print(*multiclass_report)
    print(*two_class_report)


In [18]:
# Classification using Logistic regression
from sklearn.linear_model import LogisticRegression

def ids_logistic(max_iter=100):
    """
    Classify processed data set stored as csv file using logistic regression
    Print: accuracy, confusion matrix, f1 score on the validation data set
    Input:
        max_iter: maximum number of solver iterations handed to
            LogisticRegression(); defaults to 100
    Returns:
        None
    """

    df = ids_load_df_from_csv ()
    X_train, X_val, X_test, y_train, y_val, y_test = ids_split(df)

    # With max_iter=100 the solver stops at the iteration limit and
    # LogisticRegression() warns that it has not converged; pass a large
    # value (e.g. 10000) to let it converge at the cost of a longer fit
    logreg = LogisticRegression(max_iter=max_iter)
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_val)

    ids_metrics(y_val, y_pred)

In [19]:
ids_logistic()

load Dataframe shape (136000, 77)
df.shape (136000, 77)
X_train.shape (98260, 76) y_train.shape (98260,)
X_val.shape (17340, 76) y_val.shape (17340,)
X_test.shape (20400, 76) y_test.shape (20400,)
[[4514   21   73    9   24   68   11    0   45   23  239   94   57]
 [   2 1042    2    0    0    1    0    0    0    0    0    0    0]
 [   3    4 1033    0    0    1    0    0    0    0    0    0    0]
 [  36   15    2  931   14    0    0    0    1    0    1    0    0]
 [  10    0    5   54  911    0    1    0    2    0    0    0    0]
 [  16    0    0    0    1 1025    1    0    0    0    0    0    9]
 [  11    0   14    1    1    1 1005    0    4    0    0    0    0]
 [   0    0    0    0    0    0    0 1039    0    0    0    0    0]
 [   1    0   79    0    0   12    7    0  925    0    0    0    0]
 [  24   26    0    0    0    0    0    0    0  959    0    0   28]
 [  60    0    0    0    0    0    0    0    0    0  643  271    0]
 [   0    1    0    2    0    1    0    0    4    0    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
''' 
Fully connected three layer neural network using PyTorch
With utility functions based on deeplizard tutorial
https://deeplizard.com/learn/video/v5cngxo4mIg
'''

# Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import TensorDataset, DataLoader
from IPython.display import display, clear_output
import pandas as pd
import time

from itertools import product
from collections import namedtuple
from collections import OrderedDict

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

class RunBuilder():
    """Expands a grid of hyperparameter values into individual runs."""

    @staticmethod
    def get_runs(params):
        """
        Input:
            params: ordered mapping of parameter name to a list of values
        Returns:
            List of 'Run' namedtuples, one per combination of values
            (cartesian product), with the parameter names as fields
        """
        Run = namedtuple('Run', params.keys())
        return [Run(*combination) for combination in product(*params.values())]
    
class RunManager():
    """
    Bookkeeping for training runs.

    Tracks run and epoch counters and timings and, at the end of every
    epoch, records the training loss and the validation accuracy together
    with the hyperparameters of the run. The collected records are kept
    in run_data as a list of ordered dicts, one per epoch.
    """

    def __init__(self):
        # Per-epoch state
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        # Per-run state
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        # Objects handed over by begin_run(); declared here so that every
        # attribute of the instance exists from construction onwards
        self.network = None
        self.loss_fn = None
        self.train_inputs = None
        self.train_targets = None
        self.X_val = None
        self.y_val = None

    def begin_run(self, run, network, loss_fn, train_inputs, train_targets, X_val, y_val):
        """
        Start a new run
        Input:
            run: namedtuple with the hyperparameters of the run
            network: the model being trained
            loss_fn: callable computing the loss from (predictions, targets)
            train_inputs, train_targets: training data as tensors
            X_val, y_val: validation data as numpy arrays
        """
        self.run_start_time = time.time()
        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loss_fn = loss_fn
        self.train_inputs = train_inputs
        self.train_targets = train_targets
        self.X_val = X_val
        self.y_val = y_val

    def end_run(self):
        """Finish the current run; the epoch counter restarts at zero."""
        self.epoch_count = 0

    def begin_epoch(self):
        """Start a new epoch: bump the counter and reset per-epoch state."""
        self.epoch_start_time = time.time()
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """
        Finish the current epoch and append a record to run_data holding
        the run and epoch numbers, the loss over the full training data,
        the accuracy on the validation data, the epoch and run durations
        and the hyperparameters of the run.
        """
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        # Evaluate in eval mode so that layers such as dropout or batch
        # norm do not distort the reported numbers, then put the network
        # back into whatever mode it was in so training continues unchanged
        was_training = self.network.training
        self.network.eval()
        try:
            with torch.no_grad():
                loss = (self.loss_fn(self.network(self.train_inputs), self.train_targets)).item()

                val_inputs = torch.from_numpy(self.X_val).float()
                val_preds = self.network(val_inputs)
                # The class with the highest score is the predicted label;
                # hand sklearn a numpy array rather than a torch tensor
                y_pred = val_preds.argmax(dim=1).cpu().numpy()
                accuracy = accuracy_score (self.y_val, y_pred)
        finally:
            self.network.train(was_training)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results['loss'] = loss
        results["accuracy"] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        for k,v in self.run_params._asdict().items(): results[k] = v
        self.run_data.append(results)
    
def ids_nn(params=None):
    """
    Classify processed data set stored as csv file using a fully connected
    neural network (Linear -> ReLU -> Linear) trained with Adam and a
    step learning rate schedule
    Print: per-epoch loss and validation accuracy for every run, then
        accuracy, confusion matrix, f1 score on the validation data set
        for the network of the last run
    Input:
        params: optional ordered mapping of hyperparameter name to a list
            of values to try; needs the keys lr, batch_size, num_epochs,
            step_size and gamma. One run is made per combination of
            values. When None (the default) a single run is made with
            lr=.008, batch_size=256, num_epochs=10, step_size=5, gamma=0.50
    Returns:
        None
    """
    if params is None:
        params = OrderedDict(
            lr = [.008]
            ,batch_size = [256]
            ,num_epochs = [10]
            ,step_size = [5]
            ,gamma = [0.50]
        )

    rm = RunManager()

    df = ids_load_df_from_csv ()
    X_train, X_val, X_test, y_train, y_val, y_test = ids_split(df)
    train_inputs = torch.from_numpy(X_train).float()
    train_targets = torch.from_numpy(y_train).long()
    train_ds = TensorDataset(train_inputs, train_targets)

    # Stays None when the parameter grid produces no runs at all
    network = None

    # Run for each combination of params
    for run in RunBuilder.get_runs(params):
        # Same seed for every run so runs differ only in their parameters
        torch.manual_seed(42)
        print (run)

        network = nn.Sequential(
            nn.Linear(num_ids_features, num_ids_features)
            ,nn.ReLU()
            ,nn.Linear(num_ids_features, num_ids_classes)
        )

        train_dl = DataLoader(train_ds, run.batch_size, shuffle=True)
        opt = torch.optim.Adam(network.parameters(), run.lr)
        # Multiply the learning rate by gamma every step_size epochs
        sch = torch.optim.lr_scheduler.StepLR(opt, run.step_size, run.gamma)
        loss_fn = F.cross_entropy

        rm.begin_run(run, network, loss_fn, train_inputs, train_targets, X_val, y_val)
        # Training loop
        for epoch in range(run.num_epochs):
            rm.begin_epoch()

            for xb,yb in train_dl:
                pred = network(xb)
                loss = loss_fn(pred, yb)
                loss.backward()
                opt.step()
                opt.zero_grad()

            rm.end_epoch()
            sch.step()
        rm.end_run()

    print(pd.DataFrame.from_dict(rm.run_data))

    # Nothing was trained, so there is nothing to evaluate
    if network is None:
        return

    # Final evaluation of the last trained network: no gradients are
    # needed, and eval mode keeps the predictions deterministic
    val_inputs = torch.from_numpy(X_val).float()
    network.eval()
    with torch.no_grad():
        val_pred = network(val_inputs)

    # Since the model returns values for all num_ids_classes
    # the class with the maximum value is picked as the label
    val_pred = val_pred.argmax(dim=1)

    # A numpy array is needed to evaluate metrics
    y_pred = val_pred.detach().to('cpu').numpy()
    ids_metrics(y_val, y_pred)

In [21]:
ids_nn()

load Dataframe shape (136000, 77)
df.shape (136000, 77)
X_train.shape (98260, 76) y_train.shape (98260,)
X_val.shape (17340, 76) y_val.shape (17340,)
X_test.shape (20400, 76) y_test.shape (20400,)
Run(lr=0.008, batch_size=256, num_epochs=10, step_size=5, gamma=0.5)
   run  epoch      loss  accuracy  ...  batch_size  num_epochs  step_size  gamma
0    1      1  0.137854  0.955248  ...         256          10          5    0.5
1    1      2  0.134446  0.956747  ...         256          10          5    0.5
2    1      3  0.111128  0.960727  ...         256          10          5    0.5
3    1      4  0.099807  0.963668  ...         256          10          5    0.5
4    1      5  0.095310  0.965167  ...         256          10          5    0.5
5    1      6  0.079728  0.970069  ...         256          10          5    0.5
6    1      7  0.087075  0.968051  ...         256          10          5    0.5
7    1      8  0.078842  0.971569  ...         256          10          5    0.5
8    

In [22]:
# Classification using KNN
from sklearn.neighbors import KNeighborsClassifier

def ids_knn(n_neighbors=3):
    """
    Classify processed data set stored as csv file using KNN
    Print: accuracy, confusion matrix, f1 score on the validation data set
    Input:
        n_neighbors: number of neighbors handed to KNeighborsClassifier();
            defaults to 3
    Returns:
        None
    """

    df = ids_load_df_from_csv ()
    X_train, X_val, X_test, y_train, y_val, y_test = ids_split(df)

    neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
    neigh.fit(X_train, y_train)

    y_pred = neigh.predict(X_val)

    ids_metrics(y_val, y_pred)

In [23]:
ids_knn()

load Dataframe shape (136000, 77)
df.shape (136000, 77)
X_train.shape (98260, 76) y_train.shape (98260,)
X_val.shape (17340, 76) y_val.shape (17340,)
X_test.shape (20400, 76) y_test.shape (20400,)
[[5062    1    1    1    2    6    4    0    4    1   35   59    2]
 [   1 1043    1    1    0    0    0    0    0    0    1    0    0]
 [   0    0 1040    0    0    0    0    0    1    0    0    0    0]
 [   0    0    1  997    2    0    0    0    0    0    0    0    0]
 [   1    0    0    3  979    0    0    0    0    0    0    0    0]
 [   6    0    0    0    0 1043    2    0    0    0    0    0    1]
 [   1    0    0    0    2    0 1033    0    1    0    0    0    0]
 [   0    0    0    0    0    0    0 1039    0    0    0    0    0]
 [   0    0    2    0    0    0    0    0 1022    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0 1037    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0  974    0    0]
 [  20    0    0    0    0    1    0    0    0    0    