In [18]:
import warnings
from ipywidgets import Button, Layout
# warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.patches as mpatches
from pprint import pprint
import numpy as np
from ipywidgets import HBox, VBox, interactive
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
from IPython.display import display, Markdown
import ipywidgets as widgets
import math
import torch
from ipywidgets import interact, interact_manual
from IPython.display import set_matplotlib_formats
from importlib import reload
from pathlib import Path
from tqdm import tqdm

import sys
import os
# Make the project sources under ../src importable from this notebook.
sys.path.insert(0, os.path.abspath('../src/'))

# Project-local modules (expected to be found through the sys.path entry above).
import model_codebase as cb
import cicids2017 as cicids2017
import data_generator as generator

# Re-import the modules so that edits made on disk are picked up without
# restarting the kernel.
# NOTE(review): model_codebase (cb) is not reloaded here — confirm this is intentional.
reload(cicids2017)
reload(generator)

# Display up to 500 rows and render floats with four decimal places.
pd.set_option('display.max_rows', 500)
pd.set_option('float_format', '{:.4f}'.format)

# Relative locations of the saved model state dict and of the pickled dataset.
MODELPATH = Path("../res/ts2vec.torch")
DATASETPATH = Path("../dataset/CICIDS2017_ntop.pkl")

In [19]:
def mprint(s):
    """Render the string `s` as Markdown in the notebook output area."""
    rendered = Markdown(s)
    display(rendered)
    
def ncolors(n):
    """Return a list of `n` random colors as '#RRGGBB' hex strings.

    The digits are drawn with `np.random.choice`, so the result depends on
    (and advances) NumPy's global random state.
    """
    hex_digits = list('0123456789ABCDEF')
    # One row of six random hex digits per requested color.
    digit_rows = np.random.choice(hex_digits, size=(n, 6))
    colors = []
    for row in digit_rows:
        colors.append('#' + ''.join(row))
    return colors
    
def describe_mtimeseries(key, mtimeserie, attack_meta=None):
    """Print a Markdown summary of a multivariate time series and plot its columns.

    One figure is drawn per column. Columns whose values are all zero are not
    plotted; they are listed as "Empty series" after the figures.

    Parameters
    ----------
    key : str
        Title used for the Markdown heading and for every figure.
    mtimeserie : pandas.DataFrame
        Time-indexed frame; each column is plotted separately.
    attack_meta : pandas.Series, optional
        Labels sharing the index of `mtimeserie`. Entries equal to "none" are
        ignored; for every other label the matching samples of the current
        column are drawn on top of the base line.
    """
    mprint(f"### {key}")

    # Nothing to describe or plot: min()/max() of an empty index would raise.
    if len(mtimeserie) == 0:
        mprint("**Samples shown**: 0")
        mprint(f"<br>{'-'*50}<br>{'-'*50}<br>")
        return

    # Data description ..... #
    start = mtimeserie.index.min()
    end = mtimeserie.index.max()
    mprint(f"**Time range**: {start} **/** {end}")
    mprint(f"**Total data range:** {end-start}")
    mprint(f"**Samples shown**: {len(mtimeserie)}")

    # The labelled (non-"none") samples do not depend on the column: compute once.
    attack = None
    if attack_meta is not None:
        attack = attack_meta[attack_meta != "none"]

    # Plotting clean data ..... #
    empty_cols = []
    for c in mtimeserie.columns:
        if (mtimeserie[c] == 0).all():
            empty_cols.append(c)
            continue
        legend = [c]
        fig, ax = plt.subplots(figsize=(8, 4))
        mtimeserie[c].plot(ax=ax, label=c)
        if attack is not None:
            for aname, adf in attack.groupby(attack):
                legend.append(aname)
                # Overlay only the current column, so the figure holds exactly
                # one line per legend entry.
                mtimeserie.loc[adf.index, c].plot(ax=ax, label=aname)
        fig.suptitle(f"{key}")
        ax.legend(legend)
        plt.show()
        # Release the figure: one is created per column on every call.
        plt.close(fig)
    for c in empty_cols:
        mprint(f"**<span style='color: red'>Empty series:</span> {c}**")

    mprint(f"<br>{'-'*50}<br>{'-'*50}<br>")

# Deltas

In [None]:
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted dataset files.
df = pd.read_pickle(DATASETPATH)
# Gaps between consecutive samples longer than this are reported.
min_delta = np.timedelta64(15, "s")

host_groups = df.groupby(['device_category', 'host'])
for (category, host), host_samples in host_groups:
    times = host_samples.index.get_level_values("_time").sort_values(ascending=True)
    # Element i is the distance between times[i] and times[i+1]
    # (the leading NaT produced by diff() is dropped).
    delta = times.to_series().diff()[1:]

    for i, gap in enumerate(delta):
        # Gaps that start during hour 17 are skipped
        # (presumably the end of the daily capture — TODO confirm).
        if gap > min_delta and times[i].hour != 17:
            print(f"{category}:{host} / {times[i]} to {times[i+1]}")

# Plotting

The data is preprocessed before being plotted.

In [20]:
# Load the raw dataset from the pickle file.
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
df = pd.read_pickle(DATASETPATH)
# NOTE(review): the meaning of deltas / discretize / flevel="MAGIK" is defined in
# the cicids2017 module, which is not visible from this notebook.
pr = cicids2017.Cicids2017Preprocessor(deltas=True, discretize=False, flevel="MAGIK")
# Preprocessed frame used by the plotting widgets in the next cell.
df_preproc = pr.preprocessing(df, update=True)

  coro.send(None)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


In [21]:
%matplotlib inline
set_matplotlib_formats('svg')

days = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Tursday": 3,
    "Friday": 4}

idxs = df_preproc.index.droplevel(2).unique()
devices = [f"{host} ({cat})" for cat, host in idxs]
devices.sort()
cols = list(df_preproc.columns)
cols.sort()


# ----- ----- WIDGETS ----- ----- #
# ----- ----- ------- ----- ----- #
device_w_list = widgets.Dropdown(options=devices,
                                 value="192.168.10.50 (server)")
days_w_list = widgets.Dropdown(options=list(days.keys()),
                               value="Monday")
selectedc_w_list = widgets.SelectMultiple(options=cols,
                                          value=["unreachable_flows:flows_as_client"],
                                          description='Features',
                                          layout=Layout(width='400px'))
showall_checkbox = widgets.Checkbox(value=False, description='Show all features')
timerange_slider = widgets.FloatSlider(min=.01, max=1., step=.01)
offset_slider = widgets.FloatSlider(min=.0, max=1., step=.01)
ts_selector = HBox([device_w_list, days_w_list])
col_selector = HBox([selectedc_w_list, showall_checkbox])
ts_shifting = HBox([timerange_slider, offset_slider])
wlist = VBox([ts_selector, col_selector, ts_shifting])


# ----- ----- INTERACTOR ----- ----- #
# ----- ----- ------- ----- ----- #
def whandler(device, day, column, showall, trange, offset):
    """Plot the selected features of one device for a window of one weekday.

    Parameters
    ----------
    device : str
        Dropdown label formatted as "<host> (<category>)".
    day : str
        Key of the `days` mapping selecting the day of the week.
    column : iterable of str
        Features picked in the multi-select; ignored when `showall` is true.
    showall : bool
        Plot every column of the preprocessed frame instead of `column`.
    trange : float
        Length of the window, as a fraction of the day's samples.
    offset : float
        Start of the window, as a fraction of the day's samples.
    """
    # The category may itself contain spaces, so split only on the first one.
    host, _, rest = device.partition(" ")
    host = host.strip()
    category = rest.replace("(", "").replace(")", "").strip()
    host_ts = df_preproc.loc[category, host]
    # Filtering day of week
    host_ts = host_ts[host_ts.index.dayofweek == days[day]]
    # Filtering time range
    ts_len = len(host_ts)
    start_idx = int(ts_len * offset)
    end_idx = min(start_idx + int(ts_len * trange), ts_len)
    window = host_ts.iloc[start_idx:end_idx]

    # offset == 1.0, a very small range or a day without samples all produce an
    # empty window; report it instead of failing inside the plotting helper.
    if window.empty:
        mprint(f"**No samples for {device.strip()} on {day} in the selected window.**")
        return

    # "attack" is the label column: it is passed separately as overlay metadata
    # and must not be plotted as a feature (it is compared against the string
    # "none" downstream, i.e. it is not numeric).
    requested = cols if showall else list(column)
    selected_features = [f for f in requested if f != "attack"]
    attack = window["attack"] if "attack" in cols else None
    describe_mtimeseries(device.strip(), window[selected_features], attack)



# Bind the handler to the widgets; the last child of the interactive container
# is its output area, which is displayed below the custom widget layout.
interactive_plot = widgets.interactive(
    whandler,
    device=device_w_list,
    day=days_w_list,
    column=selectedc_w_list,
    showall=showall_checkbox,
    trange=timerange_slider,
    offset=offset_slider,
)
output = interactive_plot.children[-1]
display(wlist)
display(output)

VBox(children=(HBox(children=(Dropdown(description='device', index=11, options=('192.168.10.1 (unknown device …

Output()

# Embedding

In [22]:
# NOTE: pd.read_pickle and torch.load are pickle-based and can execute arbitrary
# code; only load trusted files.
df = pd.read_pickle(DATASETPATH)
# NOTE(review): this rebinds df_preproc with the preprocessor's default options
# (no deltas=True / discretize=False), so the plotting widget above will read
# the new frame on its next interaction.
pr = cicids2017.Cicids2017Preprocessor(flevel="MAGIK")
df_preproc = pr.preprocessing(df, update=True)

ts2vec = cb.GRU2Vec()
# The model is only used for inference in this notebook: switch to evaluation
# mode so that dropout / batch-norm layers (if the model has any) behave
# deterministically. Done before loading so the cell still ends with the
# load_state_dict result.
ts2vec.eval()
ts2vec.load_state_dict(torch.load(str(MODELPATH), map_location=torch.device('cpu')))

  coro.send(None)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


<All keys matched successfully>

In [24]:
# NOTE(review): leftover inspection cell — the method is referenced but not
# called, so the cell only displays the bound-method repr (which embeds the
# MultiIndex). Pass a level name to get the actual level values.
df_preproc.index.get_level_values

<bound method MultiIndex.get_level_values of MultiIndex([(                  'pc',  '192.168.10.12', ...),
            (                  'pc',  '192.168.10.12', ...),
            (                  'pc',  '192.168.10.12', ...),
            (                  'pc',  '192.168.10.12', ...),
            (                  'pc',  '192.168.10.12', ...),
            (                  'pc',  '192.168.10.12', ...),
            (                  'pc',  '192.168.10.12', ...),
            (                  'pc',  '192.168.10.12', ...),
            (                  'pc',  '192.168.10.12', ...),
            (                  'pc',  '192.168.10.12', ...),
            ...
            ('unknown device class', '192.168.10.255', ...),
            ('unknown device class', '192.168.10.255', ...),
            ('unknown device class', '192.168.10.255', ...),
            ('unknown device class', '192.168.10.255', ...),
            ('unknown device class', '192.168.10.255', ...),
            ('unknown de

In [48]:
%matplotlib inline
set_matplotlib_formats('svg')

days_keys = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Tursday": 3,
    "Friday": 4}

idxs = df_preproc.index.droplevel(2).unique()
devices = [f"{host} ({cat})" for cat, host in idxs]
devices.sort()


# ----- ----- COLORMAPS ----- ----- #
# ----- ----- ------- ----- ----- #
hosts = df_preproc.index.get_level_values("host").unique()
host_cmap = dict(zip(hosts, ncolors(len(hosts))))
categories = df_preproc.index.get_level_values("device_category").unique()
dev_cat_cmap = dict(zip(categories, ncolors(len(categories))))
attacks = df_preproc["attack"].unique()
attack_cmap = dict(zip(attacks, ncolors(len(attacks))))


# ----- ----- WIDGETS ----- ----- #
# ----- ----- ------- ----- ----- #
device_w_list = widgets.Dropdown(options=devices,
                                 value="192.168.10.50 (server)")
days_w_list = widgets.Dropdown(options=list(days_keys.keys()),
                               value="Monday")
netmap_checkbox = widgets.Checkbox(value=False, description='Show all network')
show_ip_checkbox = widgets.Checkbox(value=False, description='Show specific IP')
ts1_selector = HBox([device_w_list, netmap_checkbox])
ts2_selector = HBox([days_w_list, show_ip_checkbox])
wlist = VBox([ts1_selector, ts2_selector])


# ----- ----- INTERACTOR ----- ----- #
# ----- ----- ------- ----- ----- #
def whandler(device, day, netmap, show_ip):
    """Scatter-plot the 2D points returned by `cb.network2D` for one weekday.

    Parameters
    ----------
    device : str
        Dropdown label formatted as "<host> (<category>)"; only used when
        `netmap` is false.
    day : str
        Key of the `days_keys` mapping selecting the day of the week.
    netmap : bool
        When true, all devices of the day are embedded; otherwise only `device`.
    show_ip : bool
        With `netmap` enabled, color points by host instead of device category.
        Ignored when `netmap` is false (points are colored by attack label).
    """
    sample_times = df_preproc.index.get_level_values("_time")
    daydf = df_preproc[sample_times.dayofweek == days_keys[day]]
    if not netmap:
        # Single-host view: restrict the day's samples to the chosen device.
        host, _, rest = device.partition(" ")
        host = host.strip()
        category = rest.replace("(", "").replace(")", "").strip()
        daydf = daydf.loc[category, host]

    pts = cb.network2D(ts2vec, daydf, overlapping=.75)

    # Pick the column that drives the coloring and its color map.
    if not netmap:
        label, cmap = "attack", attack_cmap
    elif show_ip:
        label, cmap = "host", host_cmap
    else:
        label, cmap = "device_category", dev_cat_cmap
    pts["color"] = pts[label].map(cmap)

    # Plotting ..... #
    fig = plt.gcf()
    ax = fig.gca()
    for group_name, group_pts in pts.groupby(label):
        ax.scatter(group_pts["x1"].values, group_pts["x2"].values,
                   color=group_pts["color"].values, label=group_name)
    # Legend is anchored just outside the right edge of the axes.
    ax.legend(loc=(1.04, 0))
    fig.set_size_inches(7, 7)
    plt.show()

# Bind the handler to the widgets; the last child of the interactive container
# is its output area, which is displayed below the custom widget layout.
interactive_plot = widgets.interactive(
    whandler,
    device=device_w_list,
    day=days_w_list,
    netmap=netmap_checkbox,
    show_ip=show_ip_checkbox,
)
output = interactive_plot.children[-1]
display(wlist)
display(output)

VBox(children=(HBox(children=(Dropdown(description='device', index=11, options=('192.168.10.1 (unknown device …

Output()