In [1]:
from sklearnex import patch_sklearn
patch_sklearn()

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import itertools
import random
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer as Imputer
from sklearn.svm import OneClassSVM, SVC
from sklearn.linear_model import SGDOneClassSVM, SGDClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_recall_fscore_support, roc_curve
import keras
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.metrics import f1_score
from tensorflow.keras import regularizers
import joblib
from sklearn.pipeline import make_pipeline

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
2023-09-14 16:39:16.405523: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
label_d = {}

def datetime_to_timestamp(dt):
    try:
        return datetime.strptime(dt, '%m/%d/%Y %H:%M').weekday()
    except:
        return datetime.strptime(dt, '%Y-%m-%d %H:%M:%S').weekday()
    
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df = df.dropna()
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

def train_test_dataset(df_train, df_test, deep=True):
    labelencoder = LabelEncoder()

    X_train = df_train.drop(columns=['Label']).copy()
    y_train = df_train.iloc[:, -1].values.reshape(-1,1).copy()
    y_train = np.ravel(y_train).copy()
    if deep:
        y_train = to_categorical(y_train)
    
    if df_test is not None:
        X_test = df_test.drop(columns=['Label']) .copy()
        y_test = df_test.iloc[:, -1].values.reshape(-1,1).copy()
        y_test = np.ravel(y_test).copy()
        if deep:
            y_test = to_categorical(y_test)

        return  X_train, X_test, y_train, y_test
    
    else:
        return train_test_split(X_train, y_train)
    
def show_cm(cm):
    f,ax=plt.subplots(figsize=(5,5))
    sns.heatmap(cm,annot=True,linewidth=0.5,linecolor="red",fmt=".0f",ax=ax)
    plt.xlabel("y_pred")
    plt.ylabel("y_true")
    plt.show()

def from_categorical(y_):
    return np.array([np.argmax(i) for i in y_])

def discretize(a):
    return 1 if a > 0.5 else 0

def labels_to_numbers(df: pd.DataFrame, name='Label'):
    labels = df[name].unique()
    d = {label: idx for idx, label in enumerate(labels)}

    return d

def number_to_label(df, number, name='Label'):
    labels = df[name].unique()
    d = {idx: label for idx, label in enumerate(labels)}
    return d[number]
        

def prepare_df(df: pd.DataFrame, dropcols=None, scaler=None, ldict=None):
    temp_df = df.copy()
    if dropcols:
        temp_df = temp_df.drop(columns=dropcols)
    if not ldict:
        ltn_dict = labels_to_numbers(temp_df)
    else:
        ltn_dict = ldict
    temp_df['Label'] = temp_df['Label'].map(ltn_dict)
        
    temp_df = clean_dataset(temp_df)
    if scaler == 'minmax':
        scaler = joblib.load('models/scaler.pkl')
        # scaler = MinMaxScaler()
    elif scaler == 'standard':
        scaler = joblib.load('models/standard_scaler.pkl')
        # scaler = StandardScaler()
        
    if scaler:
        temp_df[temp_df.columns[:-1]] = scaler.fit_transform(temp_df[temp_df.columns[:-1]])
        # joblib.dump(scaler, 'models/standard_scaler.pkl')
    
    return temp_df

def binarize_label(y, label):
    idx = y == label
    
    y[idx] = 1
    y[~idx] = 0
    
def evaluate_model(model, df, binarize=None):
    X = df.drop(columns=['Label']).copy()
    y = df.iloc[:, -1].values.reshape(-1,1).copy()
    y = np.ravel(y).copy()
    if binarize is not None:
        binarize_label(y, binarize)
    
    model.evaluate(X, y)
    
    y_predicted = model.predict(X)
    cm = confusion_matrix(y, vdiscretize(y_predicted))
    show_cm(cm)
    
def unsup_compare_results(model, X, y_true):
    y_pred = model.predict(X)
    return f1_score(y_true, y_pred, average=None)

def plot_cm(model, X, y_true):
    y_pred = model.predict(X)
    cm = confusion_matrix(y_true, y_pred)
    show_cm(cm)
    
def make_binary_svm(train, test, l):
    _X_train, _X_test, _y_train, _y_test = train_test_dataset(train, test, deep=False)
    binarize_label(_y_train, l)
    binarize_label(_y_test, l)
    # _y_train[_y_train == 0] = -1
    # _y_train[_y_train == 0] = -1
    return _X_train, _X_test, _y_train, _y_test

def check_all_labels(train, test, model_constructor, l_name='Label', constructor_kwargs=None):
    labels = train[l_name].unique()
    for label in labels:
        print(f'============= Label {number_to_label(clean_test, label)} ==================')
        X_train, X_test, y_train, y_test = make_binary_svm(clean_train, clean_test, label)
        if constructor_kwargs:
            model = model_constructor(**constructor_kwargs)
        else:
            model = model_constructor()
        model.fit(X_train, y_train)
        print(unsup_compare_results(model, X_test, y_test))
        probs = model.predict_proba(X_test)[:, 1]
        plot_roc(y_test, probs)
        
    
def plot_roc(y, probs, label='Classifier'):
    ns_probs = [0 for _ in range(len(y))]
    
    ns_fpr, ns_tpr, _ = roc_curve(y, ns_probs)
    fpr, tpr, _ = roc_curve(y, probs)
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, label=label)
    # plt.ylim([0.0, 1.05])
    # plt.xlim([0.0, 1.05])
    plt.legend()
    plt.show()
        
vdiscretize = np.vectorize(discretize)

In [3]:
training_set_path = 'Training.csv'
test_set_path = 'Testing.csv'

training_df = pd.read_csv(training_set_path)
test_df = pd.read_csv(test_set_path)


# clean_train = prepare_df(training_df, dropcols=['Flow ID', 'Src IP', 'Dst IP', 'Dst Port', 'Src Port', 'Protocol', 'Timestamp'], scaler='minmax')
# clean_test = prepare_df(test_df, dropcols=['Flow ID', 'Src IP', 'Dst IP', 'Dst Port', 'Src Port', 'Protocol', 'Timestamp'], scaler='minmax')

In [4]:
scaler = 'standard'

In [5]:
clean_train = prepare_df(training_df, dropcols=['Flow ID', 'Src IP', 'Dst IP', 'Dst Port', 'Src Port', 'Protocol', 'Timestamp'], scaler=scaler)
clean_test = prepare_df(test_df, dropcols=['Flow ID', 'Src IP', 'Dst IP', 'Dst Port', 'Src Port', 'Protocol', 'Timestamp'], scaler=scaler)
X_train_scvic, X_test_scvic, y_train_scvic, y_test_scvic = make_binary_svm(clean_train, clean_test, 0)


frames = []
for file in os.listdir("./dapt2020/csv/"):
    if file.endswith(".csv"):
        path = "./dapt2020/csv/" + file
        tmp = pd.read_csv(path)
        frames.append(tmp)
        
        
dapt2020 = pd.concat(frames)
dapt2020 = dapt2020.rename(columns={"Stage": "Label"})
# dapt2020 = pd.concat(frames)

dapt_label_d = {
    "Benign": 0,
    'BENIGN': 0,
    'Establish Foothold': 1,
    'Reconnaisance': 2,
    'Data Exfiltration': 5,
    'Lateral Movement': 4
}

clean_dapt = prepare_df(dapt2020, dropcols=['Flow ID', 'Src IP', 'Dst IP', 'Dst Port', 'Src Port', 'Protocol', 'Timestamp', 'Activity'], scaler=scaler, ldict=dapt_label_d)

X_train_dapt, X_test_dapt, y_train_dapt, y_test_dapt = make_binary_svm(clean_dapt, None, 0)


  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)


In [8]:
# SCIVC -> DAPT
svm = SGDClassifier(loss='hinge', alpha=0.0001, max_iter=1000, tol=1e-2)

svm.fit(X_train_scvic, y_train_scvic)

print(unsup_compare_results(svm, X_test_dapt, y_test_dapt))


[0.00209644 0.88658566]


In [17]:
sgdc = SGDClassifier()

sgdc_params = {
    'loss':['hinge'],
    'max_iter':[10,100,1000],
    'alpha':np.logspace(-3, 3, 10),
    'tol': np.logspace(-3, 3, 10),
}


sgdc_gs = GridSearchCV(sgdc, sgdc_params, verbose=1, n_jobs=-1)

sgdc_gs.fit(X_train_scvic, y_train_scvic)

print(sgdc_gs.best_params_)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits




{'alpha': 0.004641588833612777, 'loss': 'hinge', 'max_iter': 10, 'tol': 0.1}


In [18]:
sgdc = SGDClassifier()

sgdc_params = {
    'loss':['hinge'],
    'max_iter':[10,100,1000],
    'alpha':np.logspace(-5, 5, 10),
    'tol': np.logspace(-3, 3, 10),
}


sgdc_gs = GridSearchCV(sgdc, sgdc_params, verbose=1, n_jobs=-1)

sgdc_gs.fit(X_train_dapt, y_train_dapt)

print(sgdc_gs.best_params_)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits




{'alpha': 0.001, 'loss': 'hinge', 'max_iter': 100, 'tol': 2.154434690031882}


In [42]:
for i in np.logspace(-5, 5, 10):
    print(f"{i:.5f}", end=', ')

0.00001, 0.00013, 0.00167, 0.02154, 0.27826, 3.59381, 46.41589, 599.48425, 7742.63683, 100000.00000, 

In [32]:
# SCIVC -> DAPT
svm = SGDClassifier(alpha=0.00001, loss='hinge', max_iter=1000, tol=0.1)

svm.fit(X_train_scvic, y_train_scvic)

print(unsup_compare_results(svm, X_test_scvic, y_test_scvic))
print(unsup_compare_results(svm, X_test_dapt, y_test_dapt))

[0.80326838 0.99716133]
[0.00187135 0.87115379]


In [31]:
# DAPT -> SCIVC
svm = SGDClassifier(alpha=0.001, loss='hinge', max_iter=100, tol=2.1544)

svm.fit(X_train_dapt, y_train_dapt)

print(unsup_compare_results(svm, X_test_dapt, y_test_dapt))
print(unsup_compare_results(svm, X_test_scvic, y_test_scvic))

[0.78057209 0.96448436]
[0.01119795 0.91792295]


In [10]:
# SCIVC -> DAPT
svm = SVC(kernel='rbf', C=10, gamma=0.001)

svm.fit(X_train_scvic, y_train_scvic)

print(unsup_compare_results(svm, X_test_dapt, y_test_dapt))


[0.00271739 0.86644833]


In [13]:
# DAPT -> SCVIC
svm = SVC(kernel='rbf', C=100, gamma=1)

svm.fit(X_train_dapt, y_train_dapt)

print(unsup_compare_results(svm, X_test_dapt, y_test_dapt))


[0.88643362 0.98032694]


In [5]:
clean_train = prepare_df(training_df, dropcols=['Flow ID', 'Src IP', 'Dst IP', 'Dst Port', 'Src Port', 'Protocol', 'Timestamp'], scaler=scaler)
clean_test = prepare_df(test_df, dropcols=['Flow ID', 'Src IP', 'Dst IP', 'Dst Port', 'Src Port', 'Protocol', 'Timestamp'], scaler=scaler)

X_train, X_test, y_train, y_test = make_binary_svm(clean_train, clean_test, 0)


svm = SGDClassifier(loss='hinge', alpha=0.0001, max_iter=1000, tol=1e-2)

svm.fit(X_train, y_train)

print(unsup_compare_results(svm, X_test, y_test))

# probs = svm.predict_proba(X_test)[:, 1]
# plot_roc(y_test, probs, label='SGD SVM SCVIC')

  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)


[0.82520074 0.99743276]


In [6]:
frames = []
for file in os.listdir("./dapt2020/csv/"):
    if file.endswith(".csv"):
        path = "./dapt2020/csv/" + file
        tmp = pd.read_csv(path)
        frames.append(tmp)
        
        
dapt2020 = pd.concat(frames)
dapt2020 = dapt2020.rename(columns={"Stage": "Label"})
# dapt2020 = pd.concat(frames)

dapt_label_d = {
    "Benign": 0,
    'BENIGN': 0,
    'Establish Foothold': 1,
    'Reconnaisance': 2,
    'Data Exfiltration': 5,
    'Lateral Movement': 4
}

clean_dapt = prepare_df(dapt2020, dropcols=['Flow ID', 'Src IP', 'Dst IP', 'Dst Port', 'Src Port', 'Protocol', 'Timestamp', 'Activity'], scaler=scaler, ldict=dapt_label_d)

X_train, X_test, y_train, y_test = make_binary_svm(clean_dapt, None, 0)

svm = SGDClassifier(loss='hinge', alpha=0.0001, max_iter=100, tol=1e-2)

svm.fit(X_train, y_train)

print(unsup_compare_results(svm, X_test, y_test))

# probs = svm.predict_proba(X_test)[:, 1]
# plot_roc(y_test, probs, label='SGD SVM DAPT2020')

  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)


[0.74910394 0.96107507]


In [7]:
X_train, X_test, y_train, y_test = make_binary_svm(clean_train, clean_test, 0)


svm = SGDClassifier(loss='hinge', alpha=0.0001, max_iter=100, tol=1e-2)

svm.fit(X_train, y_train)

print(unsup_compare_results(svm, X_test, y_test))

# probs = svm.predict_proba(X_test)[:, 1]
# plot_roc(y_test, probs, label='SGD SVM SCVIC')

X_train, X_test, y_train, y_test = make_binary_svm(clean_dapt, None, 0)

print(unsup_compare_results(svm, X_test, y_test))

# probs = svm.predict_proba(X_test)[:, 1]
# plot_roc(y_test, probs, label='SGD SVM cross dataset')

[0.79639175 0.99713514]
[0.00263227 0.88720864]


In [8]:


# oc = OneClassSVM(kernel='linear', gamma=0.000001, nu=0.10)

# oc.fit(X_train)

In [32]:
X_train, X_test, y_train, y_test = make_binary_svm(clean_train, clean_test, 0)

svc = SVC(C=10, gamma=0.001, verbose=2, kernel='rbf')

svc.fit(X_train, y_train)
print(unsup_compare_results(svc, X_test, y_test))

[0.94060995 0.99899077]


In [35]:
X_train, X_test, y_train, y_test = make_binary_svm(clean_dapt, None, 0)

svcdapt = SVC(C=100, gamma=1, kernel='rbf', verbose=2)

svcdapt.fit(X_train, y_train)
print(unsup_compare_results(svcdapt, X_test, y_test))

[0.88810931 0.98104829]


In [37]:
X_train, X_test, y_train, y_test = make_binary_svm(clean_train, clean_test, 0)
print(unsup_compare_results(svcdapt, X_test, y_test))

[0.         0.99186127]


In [12]:
# 

# grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, n_jobs=-1)
# grid.fit(X_train,y_train)
# print(grid.best_estimator_)

In [13]:
# X_train, X_test, y_train, y_test = make_binary_svm(clean_train, clean_test, 0)
# param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
# grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=5, n_jobs=-1)
# grid.fit(X_train,y_train)
# print(grid.best_estimator_)

In [52]:
X_train, X_test, y_train, y_test = make_binary_svm(clean_train, clean_test, 0)
X_normal = X_train[y_train == 1]

y_test[y_test == 0] = -1

oc = SGDOneClassSVM(nu=0.10, verbose=2)
oc.fit(X_normal)

unsup_compare_results(oc, X_test, y_test)

-- Epoch 1
Norm: 0.06, NNZs: 69, Bias: 1.000268, T: 253028, Avg. loss: 0.000843
Total training time: 0.07 seconds.
-- Epoch 2
Norm: 0.05, NNZs: 69, Bias: 0.998836, T: 506056, Avg. loss: 0.000150
Total training time: 0.14 seconds.
-- Epoch 3
Norm: 0.05, NNZs: 69, Bias: 0.998421, T: 759084, Avg. loss: 0.000129
Total training time: 0.22 seconds.
-- Epoch 4
Norm: 0.05, NNZs: 69, Bias: 1.001449, T: 1012112, Avg. loss: 0.000125
Total training time: 0.29 seconds.
-- Epoch 5
Norm: 0.04, NNZs: 69, Bias: 0.998588, T: 1265140, Avg. loss: 0.000106
Total training time: 0.36 seconds.
-- Epoch 6
Norm: 0.04, NNZs: 69, Bias: 0.998734, T: 1518168, Avg. loss: 0.000103
Total training time: 0.43 seconds.
Convergence after 6 epochs took 0.43 seconds


array([0.05887712, 0.78428959])

In [53]:
X_train, X_test, y_train, y_test = make_binary_svm(clean_dapt, None, 0)
X_normal = X_train[y_train == 1]

y_test[y_test == 0] = -1

oc = SGDOneClassSVM(nu=0.10, verbose=2)
oc.fit(X_normal)

unsup_compare_results(oc, X_test, y_test)

-- Epoch 1
Norm: 0.04, NNZs: 64, Bias: 1.016153, T: 47858, Avg. loss: 0.002225
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 0.04, NNZs: 64, Bias: 1.006063, T: 95716, Avg. loss: 0.000282
Total training time: 0.03 seconds.
-- Epoch 3
Norm: 0.03, NNZs: 64, Bias: 1.007595, T: 143574, Avg. loss: 0.000171
Total training time: 0.04 seconds.
-- Epoch 4
Norm: 0.02, NNZs: 64, Bias: 1.003067, T: 191432, Avg. loss: 0.000106
Total training time: 0.06 seconds.
-- Epoch 5
Norm: 0.01, NNZs: 64, Bias: 1.001772, T: 239290, Avg. loss: 0.000083
Total training time: 0.07 seconds.
-- Epoch 6
Norm: 0.01, NNZs: 64, Bias: 1.001187, T: 287148, Avg. loss: 0.000067
Total training time: 0.08 seconds.
-- Epoch 7
Norm: 0.01, NNZs: 64, Bias: 1.001502, T: 335006, Avg. loss: 0.000060
Total training time: 0.10 seconds.
Convergence after 7 epochs took 0.10 seconds


array([0.01731245, 0.85347265])

In [42]:
np.unique(y_test)

array([-1.,  1.])

In [54]:
X_train, X_test, y_train, y_test = make_binary_svm(clean_dapt, None, 0)
X_normal = X_train[y_train == 1]

y_test[y_test == 0] = -1

oc = OneClassSVM(gamma=0.000001, nu=0.10, verbose=2)
oc.fit(X_normal)

unsup_compare_results(oc, X_test, y_test)

[LibSVM]....
*
optimization finished, #iter = 4867
obj = 11423511.355342, rho = 4779.280953
nSV = 4782, nBSV = 4781


array([0.05928587, 0.87282553])

In [None]:
X_train, X_test, y_train, y_test = make_binary_svm(clean_train, clean_test, 0)
X_normal = X_train[y_train == 1]

y_test[y_test == 0] = -1

oc = OneClassSVM(gamma=0.000001, nu=0.10, verbose=2)
oc.fit(X_normal)

unsup_compare_results(oc, X_test, y_test)

[LibSVM].......................
*
optimization finished, #iter = 23230
obj = 319931472.983748, rho = 25293.583986
nSV = 25303, nBSV = 25302
