In [None]:
import os
import sys
import numpy as np
from pprint import pprint
from tqdm import tqdm
import pandas as pd
from importlib import reload
from IPython.display import display, Markdown

# Put the parent directory first on the module search path (presumably where
# the `tesi_sabella` package imported further down lives).
parent_dir = os.path.abspath('../')
sys.path.insert(0, parent_dir)
# Raise the number of rows pandas prints before it truncates the output.
pd.set_option('display.max_rows', 999)

import tesi_sabella.pyFluxClient.pyfluxclient as flux
import tesi_sabella.data_generator as generator


def mprint(s):
    """Render the string `s` as Markdown in the notebook output."""
    rendered = Markdown(s)
    display(rendered)


# Re-import the two project modules so that changes made to their source are
# picked up without restarting the kernel.
reload(flux)
reload(generator)

# Generating time series

Create an InfluxDB Flux client and a data generator that can poll InfluxDB to get new samples. Poll the data added in the last 48 hours.

In [2]:
# Client used for every InfluxDB query in this notebook.
fclient = flux.FluxClient()
fluxbucket = 'CIC_IDS_2017_Tuesday'
# NOTE(review): presumably the first and last timestamps stored in the
# bucket -- confirm against `FluxClient.bucket_timerange`.
start, end = fclient.bucket_timerange(fluxbucket)

# Generator bound to `fluxbucket` and `fclient`, initialised with `start`.
# NOTE(review): '15s' looks like the sampling resolution -- confirm against
# `ntop_Generator`.
cicids2017 = generator.ntop_Generator(fluxbucket, '15s', fclient, start)

Extract the data from InfluxDB in 5-minute intervals rather than with a single query, to avoid the kill signal.

In [3]:
# Pull the bucket in consecutive 5-minute windows. The value returned by
# `poll` is used as the start of the next window; the loop stops as soon as
# that value is falsy.
lastpoll_start = start

# The context manager closes the progress bar even if a poll raises.
with tqdm() as pbar:
    while lastpoll_start:
        lastpoll_end = lastpoll_start + pd.DateOffset(minutes=5)
        lastpoll_start = cicids2017.poll(stop=lastpoll_end)

        pbar.set_description(f'polled: {lastpoll_start!s}')
        # update() expects the increment since the previous call, not the
        # running total: passing a growing counter inflates the displayed count.
        pbar.update(1)

# Export what the generator holds as a pandas DataFrame (per the method name).
df = cicids2017.toPandas()

0it [00:00, ?it/s]

ValueError: max() arg is an empty sequence

NaN values processing

In [15]:
# `score:score` is NaN by default: replace a missing score with 0.
df_clean = df.fillna(value={"score:score": 0})
# Report how many rows still hold a NaN in any column, then discard them.
nans_count = df_clean.loc[df_clean.isna().any(axis=1)]
print(f'NaNs count: {len(nans_count)}')
df_clean = df_clean.dropna(axis=0, how='any')

NaNs count: 873


Generic information

In [17]:
# Unique index entries once the third index level is removed; each entry is
# unpacked below as a (device type, IP) pair.
devices = df_clean.index.droplevel(2).unique().tolist()

# Number of rows stored for each device.
timeseries_lens = [
    len(df_clean.loc[dev_type].loc[address])
    for dev_type, address in devices
]
mean_len = np.mean(timeseries_lens)
min_len = np.min(timeseries_lens)

# Summary rendered as Markdown headings, followed by the plain host list.
mprint(f'#### Features count: {len(df_clean.columns)}')
mprint(f'#### Timeseries length: {int(mean_len)} (min. {min_len})')
mprint('#### Hosts:')
pprint(devices)

#### Features count: 45

#### Timeseries length: 1844 (min. 1280)

#### Hosts:

[('pc', '192.168.10.12'),
 ('pc', '192.168.10.14'),
 ('pc', '192.168.10.15'),
 ('pc', '192.168.10.16'),
 ('pc', '192.168.10.17'),
 ('pc', '192.168.10.19'),
 ('pc', '192.168.10.25'),
 ('pc', '192.168.10.5'),
 ('pc', '192.168.10.8'),
 ('pc', '192.168.10.9'),
 ('server', '192.168.10.3'),
 ('server', '192.168.10.50'),
 ('server', '192.168.10.51')]


In [107]:
# Sampling-regularity check: for every host, count the gaps between
# consecutive samples and print those that differ from the expected period.
# NOTE(review): 15 s presumably mirrors the '15s' passed to the generator.
expected_delta_s = 15.

for device_type, ip in devices:
    host_ts = df_clean.loc[device_type].loc[ip]
    # Sort first: the timestamps are not guaranteed to be ordered here.
    delta = np.diff(np.sort(host_ts.index.values))

    deltas, counts = np.unique(delta, return_counts=True)
    # Express every distinct gap in seconds and map it to its occurrence count.
    sdeltas = [x / np.timedelta64(1, 's') for x in deltas]
    dcount = dict(zip(sdeltas, counts))
    # pop() with a default instead of `del`: a host without any 15 s gap
    # (or with fewer than two samples) must not raise KeyError.
    dcount.pop(expected_delta_s, None)
    # Print only when something irregular is left; this also reports a host
    # whose single gap value is not the expected one.
    if dcount:
        print(dcount)

{30.0: 33, 135.0: 1}
{30.0: 44, 75.0: 1, 165.0: 1}
{30.0: 39, 45.0: 1, 60.0: 1, 90.0: 1}
{30.0: 33, 255.0: 1}
{30.0: 33, 150.0: 1}
{30.0: 36}
{30.0: 29, 630.0: 1, 8955.0: 1}
{30.0: 33, 135.0: 1, 435.0: 1}
{30.0: 35, 60.0: 2, 165.0: 2}
{30.0: 33, 90.0: 2}
{30.0: 33, 90.0: 1}
{30.0: 37}
{30.0: 36, 285.0: 1}


# Store time series

In [56]:
# Persist the cleaned time series as a pickle named after the bucket.
output_path = f'../dataset/CIC_IDS_2017/{fluxbucket}.pkl'
df_clean.to_pickle(output_path)