In [1]:
# Drop the cached bytecode of the local `utils` package before it is imported
# (presumably so stale compiled modules are never picked up — confirm).
# Pure Python replaces the Windows-only `!del /Q` shell command, so the cell
# also runs on Linux/macOS; a missing cache directory is silently ignored.
import shutil
from pathlib import Path

shutil.rmtree(Path('utils') / '__pycache__', ignore_errors=True)

In [2]:
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from utils.datasets import GhlKasperskyDataset, TepHarvardDataset, TepKasperskyDataset, SwatItrustDataset
from utils.custom_plots import plot_stacked
from utils.metrics import time_span_metrics
from utils.watchmen import LimitWatchman, LimitPcaWatchman, SpePcaWatchman, IsolatingWatchman, LinearPredictWatchman

In [4]:
# Single seed for the whole notebook: passed as `random_state` to
# `shake_not_stir` and to the watchmen that accept one.
SEED = 127

# Preparing datasets

In [5]:
# Benchmark datasets keyed by a small integer id (0..3);
# the same ids are used as the outer keys of `watchmen`.
datasets = dict(enumerate([
    GhlKasperskyDataset(),
    TepHarvardDataset(),
    TepKasperskyDataset(),
    SwatItrustDataset(),
]))

In [6]:
# Keyword arguments forwarded unchanged to every `shake_not_stir` call below.
shake_kwargs = dict(random_state=SEED, valid_test_ratio=0.0)

# Preparing watchmen

In [7]:
# One independent set of detectors per dataset. Outer keys match `datasets`;
# inner keys (0..4) number the watchmen in the order listed.
# Each dataset keeps its own list so that settings can be tuned separately.
watchmen = {
    # GhlKasperskyDataset
    0: dict(enumerate([
        LimitWatchman(),
        LimitPcaWatchman(n_components=3),
        SpePcaWatchman(n_components=3),
        IsolatingWatchman(random_state=SEED),
        LinearPredictWatchman(random_state=SEED),
    ])),
    # TepHarvardDataset
    1: dict(enumerate([
        LimitWatchman(),  # better: ewma='3 min'
        LimitPcaWatchman(n_components=3),  # better: n_components=12
        SpePcaWatchman(n_components=3),  # better: n_components=12
        IsolatingWatchman(random_state=SEED),
        LinearPredictWatchman(random_state=SEED),
    ])),
    # TepKasperskyDataset
    2: dict(enumerate([
        LimitWatchman(),
        LimitPcaWatchman(n_components=3),
        SpePcaWatchman(n_components=3),
        IsolatingWatchman(random_state=SEED),
        LinearPredictWatchman(random_state=SEED),
    ])),
    # SwatItrustDataset
    3: dict(enumerate([
        LimitWatchman(),
        LimitPcaWatchman(n_components=3),
        SpePcaWatchman(n_components=3),
        IsolatingWatchman(random_state=SEED),
        LinearPredictWatchman(random_state=SEED),
    ])),
}

# Learning

## Prefit

In [8]:
# First pass over the training split: every watchman attached to a dataset
# gets `prefit` called on each chunk the train generator yields.
for d, dataset in datasets.items():
    dataset.shake_not_stir(**shake_kwargs)
    train_chunks = tqdm(dataset.train_generator(), desc=f'Prefit on train {d}')
    for data, _, _ in train_chunks:
        for watchman in watchmen[d].values():
            watchman.prefit(data)

Prefit on train 0: 0it [00:00, ?it/s]

Prefit on train 1: 0it [00:00, ?it/s]

Prefit on train 2: 0it [00:00, ?it/s]

Prefit on train 3: 0it [00:00, ?it/s]

## Fit

In [9]:
# Second pass over the training split: the split is re-drawn with the same
# arguments and each chunk is fed to `partial_fit` of every watchman.
for d, dataset in datasets.items():
    dataset.shake_not_stir(**shake_kwargs)
    train_chunks = tqdm(dataset.train_generator(), desc=f'Fit on train {d}')
    for data, _, _ in train_chunks:
        for watchman in watchmen[d].values():
            watchman.partial_fit(data)

Fit on train 0: 0it [00:00, ?it/s]

Fit on train 1: 0it [00:00, ?it/s]

Fit on train 2: 0it [00:00, ?it/s]

Fit on train 3: 0it [00:00, ?it/s]

# Examine

## Throw stones

In [10]:
# stones[dataset id][watchman id] -> list that collects one frame per test chunk.
stones = {d: {w: [] for w in watchmen[d]} for d in datasets}

In [11]:
# Run every watchman on every test chunk and keep, per chunk, a frame whose
# first column(s) come from `faults` and the remaining ones from `predict`.
# The chunk's `info` is stored as the frame's index name for later lookup.
for d, dataset in datasets.items():
    dataset.shake_not_stir(**shake_kwargs)
    test_chunks = tqdm(dataset.test_generator(), desc=f'Detect on test {d}')
    for data, faults, info in test_chunks:
        for w, watchman in watchmen[d].items():
            verdict = pd.concat([faults, watchman.predict(data)], axis=1)
            verdict.index.name = info
            stones[d][w].append(verdict)

Detect on testation 0: 0it [00:00, ?it/s]

Detect on testation 1: 0it [00:00, ?it/s]

Detect on testation 2: 0it [00:00, ?it/s]

Detect on testation 3: 0it [00:00, ?it/s]

## Results

In [19]:
# Column names of every score table, and one empty table per dataset
# (keyed by the dataset's string representation).
metrics = ('precision', 'recall', 'f1_score')
results = {
    str(dataset): pd.DataFrame(columns=metrics)
    for dataset in datasets.values()
}

### Individual

In [20]:
# Score each watchman on its own: compute the metrics per test chunk, then
# store the column-wise mean as that watchman's row in the dataset's table.
for d, dataset in datasets.items():
    summary = results[str(dataset)]
    for w, watchman in watchmen[d].items():
        exam_paper = pd.DataFrame(columns=metrics)
        for st in tqdm(stones[d][w], desc=f'Collect stones d{d}w{w}'):
            # Column 0 is taken as the reference, the remaining columns as detections.
            truth, alarms = st.iloc[:, 0], st.iloc[:, 1:]
            exam_paper.loc[st.index.name, metrics] = time_span_metrics(truth, alarms)
        summary.loc[str(watchman), metrics] = exam_paper.mean().values

Collect stones 0:   0%|          | 0/48 [00:00<?, ?it/s]

Collect stones 0:   0%|          | 0/48 [00:00<?, ?it/s]

Collect stones 0:   0%|          | 0/48 [00:00<?, ?it/s]

Collect stones 0:   0%|          | 0/48 [00:00<?, ?it/s]

Collect stones 0:   0%|          | 0/48 [00:00<?, ?it/s]

Collect stones 1:   0%|          | 0/10500 [00:00<?, ?it/s]

Collect stones 1:   0%|          | 0/10500 [00:00<?, ?it/s]

Collect stones 1:   0%|          | 0/10500 [00:00<?, ?it/s]

Collect stones 1:   0%|          | 0/10500 [00:00<?, ?it/s]

Collect stones 1:   0%|          | 0/10500 [00:00<?, ?it/s]

Collect stones 2:   0%|          | 0/288 [00:00<?, ?it/s]

Collect stones 2:   0%|          | 0/288 [00:00<?, ?it/s]

Collect stones 2:   0%|          | 0/288 [00:00<?, ?it/s]

Collect stones 2:   0%|          | 0/288 [00:00<?, ?it/s]

Collect stones 2:   0%|          | 0/288 [00:00<?, ?it/s]

Collect stones 3:   0%|          | 0/6 [00:00<?, ?it/s]

Collect stones 3:   0%|          | 0/6 [00:00<?, ?it/s]

Collect stones 3:   0%|          | 0/6 [00:00<?, ?it/s]

Collect stones 3:   0%|          | 0/6 [00:00<?, ?it/s]

Collect stones 3:   0%|          | 0/6 [00:00<?, ?it/s]

### Ensembling

In [23]:
# Ensemble vote: for every test chunk, sum the detection columns of each
# watchman, sum those per-watchman totals, and raise an alarm wherever the
# total reaches `threshold` (assumes detections are 0/1 flags — TODO confirm).
#
# Both thresholds shown in the results tables are computed here in one run,
# so the notebook reproduces them under "Restart & Run All" instead of
# relying on the cell being edited and executed a second time.
thresholds = (3, 2)
for threshold in thresholds:
    for d in datasets:
        exam_paper = pd.DataFrame(columns=metrics)
        # Wrap the list (not the enumerate object) so tqdm knows the total.
        for i_st, st in enumerate(tqdm(stones[d][0], desc=f'Collect stones d{d}')):
            opinions = pd.concat(
                [stones[d][w][i_st].iloc[:, 1:].sum(axis=1) for w in watchmen[d]],
                axis=1,
            )
            detect = (opinions.sum(axis=1) >= threshold).astype('uint8')
            # Column 0 of the first watchman's frame serves as the reference labels.
            exam_paper.loc[i_st, metrics] = time_span_metrics(st.iloc[:, 0], detect)
        results[str(datasets[d])].loc[f'ensemble(threshold={threshold})', metrics] = exam_paper.mean().values

Collect stones d0: 0it [00:00, ?it/s]

Collect stones d1: 0it [00:00, ?it/s]

Collect stones d2: 0it [00:00, ?it/s]

Collect stones d3: 0it [00:00, ?it/s]

### Let's see

In [24]:
# Print each dataset's label followed by its rich-rendered score table.
for dataset_name, score_table in results.items():
    print(dataset_name)
    display(score_table)

GhlKasperskyDataset(E:\Datasets\GHL)


Unnamed: 0,precision,recall,f1_score
LimitWatchman(ewma=None),0.32572,0.643899,0.378634
LimitPcaWatchman(n_components=3),0.369174,0.383978,0.355303
SpePcaWatchman(n_components=3),0.0,0.0,0.0
IsolatingWatchman(n_trees=17),0.015384,0.989583,0.030195
LinearPredictWatchman(n_features=12),0.013989,1.0,0.027504
ensemble(threshold=3),0.244337,0.649802,0.319381
ensemble(threshold=2),0.015819,0.989583,0.031032


TepHarvardDataset(E:\Datasets\TEP\dataverse)


Unnamed: 0,precision,recall,f1_score
LimitWatchman(ewma=None),0.990542,0.640096,0.679066
LimitPcaWatchman(n_components=3),0.997282,0.395399,0.44416
SpePcaWatchman(n_components=3),0.999734,0.532409,0.568998
IsolatingWatchman(n_trees=500),0.851412,0.446763,0.492408
LinearPredictWatchman(n_features=52),0.983448,0.745214,0.768758
ensemble(threshold=3),0.998295,0.659082,0.700136
ensemble(threshold=2),0.992988,0.702225,0.740372


TepKasperskyDataset(E:\Datasets\TEP\kaspersky)


Unnamed: 0,precision,recall,f1_score
LimitWatchman(ewma=None),0.790822,0.43373,0.421019
LimitPcaWatchman(n_components=3),0.880952,0.243432,0.245122
SpePcaWatchman(n_components=3),0.95625,0.245628,0.251329
IsolatingWatchman(n_trees=400),0.075873,1.0,0.116781
LinearPredictWatchman(n_features=53),0.308391,0.476066,0.228849
ensemble(threshold=3),0.766649,0.424246,0.401417
ensemble(threshold=2),0.302243,0.485086,0.226342


SwatItrustDataset(E:\Datasets\SWaT\dataset12)


Unnamed: 0,precision,recall,f1_score
LimitWatchman(ewma=None),0.3,1.0,0.461538
LimitPcaWatchman(n_components=3),0.285714,0.333333,0.307692
SpePcaWatchman(n_components=3),,0.0,0.0
IsolatingWatchman(n_trees=6),0.3,1.0,0.461538
LinearPredictWatchman(n_features=51),0.272727,0.5,0.352941
ensemble(threshold=3),0.246578,0.827778,0.337145
ensemble(threshold=2),0.194539,0.983333,0.310043
