In [1]:
import sys
import os
sys.path.insert(0, os.path.abspath('../src/'))

# Preparing dataset

In [5]:
from collections import defaultdict
from pathlib import Path
from tqdm import tqdm
import _pickle as cPickle
import argparse
import data_generator as generator
import model_codebase as cb
import numpy as np
import pandas as pd
import random
import sklearn.metrics as skmetrics 

from cicids2017_prophet import Cicids2017Preprocessor


WINDOW_OVERLAPPING = .95
FLEVEL = "MAGIK"
CONTEXT_LEN = 80 # context window length, 28 minutes with 4spm (sample per minutes) 
ACTIVITY_LEN = 40 # activity window length, 14 minutes 


def prepare_dataset(datapath):
    timeseries_data = datapath / "CICIDS2017_ntop.pkl"
    df = pd.read_pickle(timeseries_data)
    
    pr = Cicids2017Preprocessor(flevel=FLEVEL, discretize=False)
    
    monday = 3
    week_mask = df.index.get_level_values("_time").day != monday
    tserver_mask = df.index.get_level_values("host") != "192.168.10.50"

    training = df[week_mask & tserver_mask]
    training = pr.preprocessing(training, fit=True)
    normal_training_mask = (training["isanomaly"] == "none")
    training = training[normal_training_mask].drop("isanomaly", axis=1)
    training = training.reset_index().drop(["device_category", "host"], axis=1).rename(columns={"_time": "ds"})
    
    testing = df[np.bitwise_not(week_mask) & tserver_mask]
    testing = pr.preprocessing(testing)
    normal_testing_mask = (testing["isanomaly"] == "none")
    testing = testing[normal_testing_mask].drop("isanomaly", axis=1)
    testing = testing.reset_index().drop(["device_category", "host"], axis=1).rename(columns={"_time": "ds"})
    
    # Validation attacks: monday + week attacks
    testing_attacks = df[np.bitwise_not(tserver_mask)]
    testing_attacks = pr.preprocessing(testing_attacks)
    testing_attacks = cb.ts_windowing(testing_attacks, overlapping=WINDOW_OVERLAPPING, context_len=CONTEXT_LEN)
 
    datasets = { "prophet_training": training, "prophet_testing": testing,
                 "testing_attacks": testing_attacks }
    return datasets

dd = prepare_dataset(Path("../dataset/"))

# Training Prophet storm

In [7]:
from tqdm import tqdm
from collections import defaultdict
from fbprophet import Prophet

tr = dd["prophet_training"]
cols = set(tr.columns)
cols.remove("ds")
P = defaultdict(lambda: Prophet(weekly_seasonality=True))

for y in tqdm(cols):
    other_cols = set(cols)
    other_cols.remove(y)
    ts = tr.rename(columns={y: "y"})
    m = P[y]
    for c in other_cols:
        m.add_regressor(c)
    m.fit(ts)

ERROR:fbprophet.plot:Importing plotly failed. Interactive plots will not work.
  0%|          | 0/19 [00:00<?, ?it/s]INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
  5%|▌         | 1/19 [00:22<06:49, 22.74s/it]INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
 11%|█         | 2/19 [00:34<05:31, 19.51s/it]INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
 16%|█▌        | 3/19 [01:40<08:53, 33.35s/it]INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
 21%|██        | 4/19 [01:40<05:51, 23.46s/it]INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
 26%|██▋       | 5/19 [20:31<1:22:58, 355.58s/it]INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
 32%|███▏      | 6/19 [22:

# Computing ANOMALY_THRESHOLD

In [143]:
def anomalyscore(x: pd.DataFrame, verbose=True):
    attribute_scores = []
    iterator = tqdm(x.columns) if verbose else x.columns 
    
    for c in iterator:
        if c == "ds":
            continue
        
        ground_truth = x[c]
        model_input = x.drop(c, axis=1)
        pred = P[c].predict(model_input)[["trend", "yhat", "yhat_lower", "yhat_upper"]].copy()
        pred["ground_truth"] = ground_truth

        out_of_forchetta = ((pred["yhat"] < pred["yhat_lower"]) | (pred["yhat"] > pred["yhat_upper"]))
        # Percentage of outliers in validation data for column 'c'
        column_score = 1 - len(out_of_forchetta[out_of_forchetta==False]) / len(out_of_forchetta)
        attribute_scores.append(column_score)
        
    return attribute_scores

In [144]:
s = anomalyscore(dd["prophet_testing"])
ANOMALY_THRESHOLD = np.mean(s)
print(f"Threshold set to: {ANOMALY_THRESHOLD}")

100%|██████████| 20/20 [04:05<00:00, 12.29s/it]

Threshold set to: 0.0





# Prophet attack prediction capabilities

In [93]:
ctx_columns = tr.columns[1:]

def model_input_to_prophet(minput):
    def gen_ctx_idx(start, end):
        return list(map(lambda x: pd.Timestamp(x.mid, unit="s"), pd.interval_range(st, en, 80)))
    
    zipped = zip(minput["context"], minput["start_time"], minput["end_time"])
    
    res = []
    for ctx, st, en in zipped:
        ds = gen_ctx_idx(st, en)
        df = pd.DataFrame(ctx, columns=ctx_columns)
        df["ds"] = ds
        res.append(df)
    return res
        
testing_attacks = dd["testing_attacks"]
prophet_testing_attacks = model_input_to_prophet(testing_attacks)
y_attacks = testing_attacks["isanomaly"]

In [None]:
y_hat = []

for ctx_ts in tqdm(prophet_testing_attacks):
    ctx_score = anomalyscore(ctx_ts, verbose=False)
    is_anomaly = np.mean(ctx_score) > ANOMALY_THRESHOLD
    y_hat.append(is_anomaly)

y_hat = np.array(y_hat)

 99%|█████████▉| 2072/2087 [23:58:58<11:45, 47.05s/it]   

In [None]:
from sklearn import metrics

# report = metrics.classification_report(y_attacks, y_hat)
metrics_rep = [ metrics.roc_auc_score,
                metrics.precision_score, metrics.recall_score,
                metrics.accuracy_score, metrics.f1_score]
for m in metrics_rep:
    mres = m(y_attacks, y_hat)
    print(f"{m.__name__}(moday+attacks): {mres}")

tn, fp, fn, tp = metrics.confusion_matrix(y_attacks, y_hat, normalize="all").ravel()
print("\n Confusion matrix")
print(f"\ttp: {tp} \tfp: {fp} \n\tfn: {fn} \ttn: {tn}")
print(f"\n{report}")