In [14]:
import warnings
from ipywidgets import Button, Layout
# warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.patches as mpatches
from pprint import pprint
import numpy as np
from ipywidgets import HBox, VBox, interactive
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.dates as mdates
from IPython.display import display, Markdown
import ipywidgets as widgets
import math
import torch
from ipywidgets import interact, interact_manual
from IPython.display import set_matplotlib_formats
from importlib import reload
from tqdm import tqdm

import sys
import os
# Put ../tesi_sabella at the front of the module search path so the
# project-local modules imported below can be found from this notebook.
sys.path.insert(0, os.path.abspath('../tesi_sabella'))

import cicids2017 as cicids2017
import data_generator as generator

# Re-import the local modules so that edits made to them on disk are picked up
# without restarting the kernel.
reload(cicids2017)
reload(generator)

# Display options: show up to 500 rows and render floats with 4 decimals.
pd.set_option('display.max_rows', 500)
pd.set_option('float_format', '{:.4f}'.format)
# plt.rcParams.update({'font.size': 28})
# plt.rcParams.update({'font.size': 28})

In [15]:
# Load the complete CICIDS2017 frame used by every cell below.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
df = pd.read_pickle('../dataset/CICIDS2017_complete.pkl')

In [16]:
def mprint(s):
    """Render the string `s` as Markdown in the cell output."""
    rendered = Markdown(s)
    display(rendered)
    
def ncolors(n):
    """Return a list of `n` random colors as '#RRGGBB' hex strings.

    The digits are drawn with `np.random.choice`, so the result depends on
    NumPy's global random state.
    """
    hex_digits = list('0123456789ABCDEF')
    # One row of six hex digits per requested color.
    drawn = np.random.choice(hex_digits, size=(n, 6))
    palette = []
    for digits in drawn:
        palette.append('#' + ''.join(digits))
    return palette
    
def describe_mtimeseries(key, mtimeserie, attack_meta=None):
    """Print a Markdown summary of a multivariate time series and plot each column.

    Parameters
    ----------
    key : str
        Text used for the Markdown heading and as the title of every figure.
    mtimeserie : pandas.DataFrame
        Frame to describe; one figure is drawn per column. Its index must
        support min/max and subtraction (presumably timestamps -- confirm
        with the caller).
    attack_meta : pandas.Series, optional
        Per-sample labels sharing ``mtimeserie``'s index. Samples labelled
        ``"none"`` are ignored; every other label is drawn on top of each
        plotted column, restricted to the samples carrying that label.

    Columns that are zero everywhere are not plotted; they are listed as
    empty series after the figures.
    """
    mprint(f"### {key}")

    # An empty frame has no time range to report and nothing to plot.
    if len(mtimeserie) == 0:
        mprint("**Samples shown**: 0")
        mprint(f"<br>{'-'*50}<br>{'-'*50}<br>")
        return

    # Data description ..... #
    start = mtimeserie.index.min()
    end = mtimeserie.index.max()
    mprint(f"**Time range**: {start} **/** {end}")
    mprint(f"**Total data range:** {end-start}")
    mprint(f"**Samples shown**: {len(mtimeserie)}")

    # The attack segments do not depend on the column, so resolve them once.
    attack_groups = []
    if attack_meta is not None:
        attack = attack_meta[attack_meta != "none"]
        attack_groups = [(aname, adf.index) for aname, adf in attack.groupby(attack)]

    # Plotting clean data ..... #
    empty_cols = []
    for c in mtimeserie.columns:
        if (mtimeserie[c] == 0).all():
            empty_cols.append(c)
            continue
        legend = [c]
        fig, ax = plt.subplots(figsize=(8, 4))
        mtimeserie[c].plot(ax=ax, label=c)
        for aname, aindex in attack_groups:
            legend.append(aname)
            # Overlay only the current column; selecting the whole frame here
            # would draw every feature on this figure and misalign the legend.
            mtimeserie.loc[aindex, c].plot(ax=ax, label=aname)
        fig.suptitle(f"{key}")
        ax.legend(legend)
        plt.show()
    for c in empty_cols:
        mprint(f"**<span style='color: red'>Empty series:</span> {c}**")

    mprint(f"<br>{'-'*50}<br>{'-'*50}<br>")

# Deltas

In [4]:
# Smallest spacing between two consecutive samples that is reported as a gap.
min_delta = np.timedelta64(15, "s")

host_groups = df.groupby(['device_category', 'host'])
for (category, host), host_samples in host_groups:
    times = host_samples.index.get_level_values("_time").sort_values()
    # Spacing between each sample and its predecessor; the first entry has no
    # predecessor and is dropped, so position `pos` spans times[pos]..times[pos+1].
    spacings = times.to_series().diff().iloc[1:]

    for pos, spacing in enumerate(spacings):
        if not spacing > min_delta:
            continue
        gap_start = times[pos]
        # Gaps that begin during the 17:00 hour are not reported
        # (presumably the end of the daily capture -- confirm).
        if gap_start.hour == 17:
            continue
        print(f"{category}:{host} / {gap_start} to {times[pos + 1]}")

pc:192.168.10.25 / 2017-07-03 13:55:21.868211 to 2017-07-03 16:10:36.868211
pc:192.168.10.25 / 2017-07-06 12:42:01.364662192 to 2017-07-06 13:42:06.364662192
server:205.174.165.68 / 2017-07-05 11:42:12.745452 to 2017-07-05 14:12:17.745452
server:205.174.165.68 / 2017-07-05 14:42:12.745452 to 2017-07-05 14:57:17.745452
server:205.174.165.68 / 2017-07-05 15:27:12.745452 to 2017-07-07 15:44:18.784343942
unknown device class:205.174.165.65 / 2017-07-03 09:40:21.868211 to 2017-07-06 14:27:06.364662192
unknown device class:205.174.165.73 / 2017-07-06 15:57:01.364662192 to 2017-07-06 16:57:06.364662192


# Plotting

The data is preprocessed before it is plotted.

In [17]:
# Build the project's CICIDS2017 preprocessor with the `deltas` and
# `discretize` options turned off, then run it on the raw frame.
# NOTE(review): the effect of `update=True` is defined in the cicids2017
# module and is not visible here -- confirm it is intended for plotting.
pr = cicids2017.Cicids2017Preprocessor(deltas=False, discretize=False)
df_preproc = pr.preprocessing(df, update=True)



In [18]:
%matplotlib inline
set_matplotlib_formats('svg')

days = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Tursday": 3,
    "Friday": 4}

idxs = df_preproc.index.droplevel(2).unique()
all_devices = [f"{host} ({cat})" for cat, host in idxs]
all_devices.sort()
cols = list(df_preproc.columns)
cols.sort()


# ----- ----- WIDGETS ----- ----- #
# ----- ----- ------- ----- ----- #
device_w_list = widgets.Dropdown(options=all_devices,
                                 value="192.168.10.50 (server)")
days_w_list = widgets.Dropdown(options=list(days.keys()),
                               value="Monday")
selectedc_w_list = widgets.SelectMultiple(options=cols,
                                          value=["traffic:bytes_rcvd"],
                                          description='Features',
                                          layout=Layout(width='400px'))
showall_checkbox = widgets.Checkbox(value=False, description='Show all features')
timerange_slider = widgets.FloatSlider(min=.01, max=1., step=.01)
offset_slider = widgets.FloatSlider(min=.0, max=1., step=.01)
ts_selector = HBox([device_w_list, days_w_list])
col_selector = HBox([selectedc_w_list, showall_checkbox])
ts_shifting = HBox([timerange_slider, offset_slider])
wlist = VBox([ts_selector, col_selector, ts_shifting])


# ----- ----- INTERACTOR ----- ----- #
# ----- ----- ------- ----- ----- #
def whandler(device, day, column, showall, trange, offset):
    """Plot the selected features of one device for one day of the week.

    Parameters
    ----------
    device : str
        Entry formatted as "<host> (<category>)".
    day : str
        Key of the notebook-level `days` mapping.
    column : iterable of str
        Feature names to plot when `showall` is false.
    showall : bool
        When true, plot every column of `df_preproc` instead of `column`.
    trange : float
        Window length as a fraction of the day's samples.
    offset : float
        Window start as a fraction of the day's samples.
    """
    split = device.split(" ")
    host = split[0].strip()
    # Everything after the host is the category wrapped in parentheses; the
    # category itself may contain spaces.
    category = " ".join(split[1:]).replace("(", "").replace(")", "").strip()
    host_ts = df_preproc.loc[category, host]
    # Filtering day of week
    host_ts = host_ts[host_ts.index.dayofweek == days[day]]
    # Filtering time range
    ts_len = len(host_ts)
    start_idx = int(ts_len * offset)
    end_idx = min(start_idx + int(ts_len * trange), ts_len)
    window = host_ts.iloc[start_idx:end_idx]

    # No data for that day, or a window that rounds down to zero samples:
    # report it instead of failing inside the plotting helper.
    if window.empty:
        mprint(f"### {device.strip()}")
        mprint("**No samples in the selected day / time window.**")
        return

    # "attack" holds the labels used for the overlay; it is not a feature to
    # plot, so keep it out of the selection (notably when showing all columns).
    candidates = cols if showall else list(column)
    selected_features = [name for name in candidates if name != "attack"]
    attack = window["attack"] if "attack" in cols else None
    describe_mtimeseries(device.strip(), window[selected_features], attack)



# Bind the handler to the controls created above. The controls are laid out
# manually in `wlist`, so only the last child of the interactive container
# (where the handler's output is rendered) is kept and displayed.
handler_controls = dict(device=device_w_list,
                        day=days_w_list,
                        column=selectedc_w_list,
                        showall=showall_checkbox,
                        trange=timerange_slider,
                        offset=offset_slider)
interactive_ui = widgets.interactive(whandler, **handler_controls)
output = interactive_ui.children[-1]
display(wlist)
display(output)

VBox(children=(HBox(children=(Dropdown(description='device', index=11, options=('192.168.10.1 (unknown device …

Output()

# Embedding

In [None]:
# Reload the raw dataset and keep only the samples whose timestamp falls on
# day 4 of the month (`.day` is the day of the month, not the weekday).
# NOTE(review): this rebinds `df` and `df_preproc`, which earlier cells also
# read; re-running those cells afterwards will see the filtered data.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
df = pd.read_pickle('../dataset/CICIDS2017_complete.pkl')
pr = cicids2017.Cicids2017Preprocessor(deltas=False, discretize=False)
df = df[df.index.get_level_values("_time").day == 4]

df_preproc = pr.preprocessing(df, update=True)

In [None]:

# NOTE(review): `cb` is not imported in any cell visible in this notebook, so
# this line raises NameError on a fresh kernel unless it is defined elsewhere.
ts2vec = cb.Ts2LSTM2Vec()
# Restore saved weights into the model.
# NOTE(review): unlike the other relative paths in this notebook, this one has
# no leading "../" -- confirm which working directory it is resolved from.
ts2vec.load_state_dict(torch.load("tesi_sabella/res/ts2vec.torch"))

# The cells below use the result as a DataFrame with `device_category` and
# `host` index levels.
host_point = ts2vec.to2Dmap(df_preproc) # list of <host, point>

In [None]:
# Assign one random color to every distinct host and store it per row.
host_level = host_point.index.get_level_values("host")
hosts = host_level.unique()
colormap = {name: shade for name, shade in zip(hosts, ncolors(len(hosts)))}

host_point["color"] = host_level.map(colormap)

In [None]:
%matplotlib inline 

fig = plt.figure(figsize=(10,10))
ax3D = fig.add_subplot(111)

col_coords_color = [c for c in host_point.columns if "x" in c] + ["color"]

for (dev_cat, host), hdf in host_point.groupby(level=["device_category", "host"]):
    coords_df = hdf[col_coords_color]
    x1, x2, colors = list(zip(*coords_df.values))
    plt.scatter(x1, x2, color=colors, label=f"{host} ({dev_cat})")
    
plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')