In [1]:
# Remove the bytecode cache of the local `utils` package before it is imported
# (presumably to guarantee that freshly edited sources are used -- confirm).
# Pure Python instead of the Windows-only `del` shell command, so the cell also
# runs on Linux/macOS; a missing directory is silently ignored.
import shutil

shutil.rmtree('utils/__pycache__', ignore_errors=True)

In [2]:
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from utils.datasets import GhlKasperskyDataset, TepHarvardDataset, TepKasperskyDataset, SwatItrustDataset
from utils.custom_plots import plot_stacked
from utils.metrics import time_span_metrics
from utils.watchmen import DirectLimitWatchman, PcaLimitWatchman
from utils.watchmen import IsoForestWatchman
from utils.watchmen import LinearPredictWatchman, DeepPredictWatchman

In [4]:
# Random seed; passed as `random_state` to each dataset's `shake_not_stir` call.
SEED = 585

# Preparing datasets

In [5]:
# Registry of the benchmark datasets, keyed by a small integer id (0..3).
# The same ids are used as first-level keys of `watchhouse`.
datasets = dict(enumerate((
    GhlKasperskyDataset(),
    TepHarvardDataset(),
    TepKasperskyDataset(),
    SwatItrustDataset(),
)))

In [6]:
# Identical settings for every dataset: fixed seed plus the `valid2test_ratio`
# value (presumably the validation/test split proportion -- confirm in
# utils.datasets).
shake_kwargs = dict(random_state=SEED, valid2test_ratio=0.3)
for dataset in datasets.values():
    dataset.shake_not_stir(**shake_kwargs)

# Preparing watchhouse

In [7]:
# Detector roster; the position in this tuple is the watchman id (0..4) used
# as the second-level key everywhere below.
watchman_kinds = (
    DirectLimitWatchman,
    PcaLimitWatchman,
    IsoForestWatchman,
    LinearPredictWatchman,
    DeepPredictWatchman,
)

# watchhouse[dataset_id][watchman_id] -> a separate detector instance.
# Built from the keys of `datasets` and a single roster instead of four
# copy-pasted literal blocks, so both levels cannot drift out of sync.
watchhouse = {
    d: {w: make_watchman() for w, make_watchman in enumerate(watchman_kinds)}
    for d in datasets
}

In [8]:
# Call `load` on every watchman, passing the class name of the dataset it is
# paired with (presumably this restores previously saved state -- confirm in
# utils.watchmen).
for d, dataset in datasets.items():
    dataset_name = dataset.__class__.__name__
    for watchman in watchhouse[d].values():
        watchman.load(dataset_name)

# Examine

## Throw stones

In [9]:
# stones[dataset_id][watchman_id] -> list that will collect one frame per
# validation sample; starts out empty for every (dataset, watchman) pair.
stones = {d: {w: [] for w in watchhouse[d]} for d in datasets}

In [10]:
# Run every watchman of a dataset over each item yielded by the dataset's
# validation generator and keep the outcome next to the `faults` it came with.
for d, dataset in datasets.items():
    progress = tqdm(dataset.valid_generator(), desc=f'Detect on validation {d}')
    for data, faults, info in progress:
        for w, watchman in watchhouse[d].items():
            verdict = watchman.predict(data)
            # Column 0 is `faults`, the remaining columns are the prediction.
            stone = pd.concat([faults, verdict], axis=1)
            stones[d][w].append(stone)
            # The generator's `info` item is stored as the index name.
            stone.index.name = info

Detect on validation 0:   0%|          | 0/14 [00:00<?, ?it/s]

Detect on validation 1:   0%|          | 0/3150 [00:00<?, ?it/s]

Detect on validation 2:   0%|          | 0/146 [00:00<?, ?it/s]

Detect on validation 3:   0%|          | 0/2 [00:00<?, ?it/s]

## Individual results

In [11]:
# Column names of every score table.
metrics = ('precision', 'recall', 'f1_score')

# One initially empty score table per dataset, keyed by the dataset's str().
results = {}
for dataset in datasets.values():
    results[str(dataset)] = pd.DataFrame(columns=metrics)

In [12]:
# Score every watchman on its own: compute the metrics for each collected
# stone, then store the per-metric mean as the watchman's row.
for d, dataset in datasets.items():
    scoreboard = results[str(dataset)]
    for w in tqdm(watchhouse[d], desc=f'Collect stones d{d}'):
        exam_paper = pd.DataFrame(columns=metrics)
        for i_st, st in enumerate(stones[d][w]):
            # First column is passed as the reference, the rest as detections.
            truth, verdicts = st.iloc[:, 0], st.iloc[:, 1:]
            exam_paper.loc[i_st, metrics] = time_span_metrics(truth, verdicts)
        # Row label is the watchman's str().
        scoreboard.loc[str(watchhouse[d][w]), metrics] = exam_paper.mean().values

Collect stones d0:   0%|          | 0/5 [00:00<?, ?it/s]

Collect stones d1:   0%|          | 0/5 [00:00<?, ?it/s]

Collect stones d2:   0%|          | 0/5 [00:00<?, ?it/s]

Collect stones d3:   0%|          | 0/5 [00:00<?, ?it/s]

## Ensembling results

In [15]:
# Voting ensemble: a time step is flagged when the total number of votes over
# all watchmen of the dataset reaches `threshold`. A watchman contributes the
# row-wise sum of all its detection columns, so it can cast more than one vote
# if its prediction has several columns.
thresholds = range(1, 7)
for d in tqdm(datasets):
    # One metrics sheet per threshold; all are filled in a single pass over the stones.
    exam_papers = {threshold: pd.DataFrame(columns=metrics) for threshold in thresholds}
    for i_st, st in enumerate(stones[d][0]):
        # The vote total does not depend on the threshold, so it is computed
        # once per stone instead of once per (threshold, stone) pair.
        opinions = pd.concat([stones[d][w][i_st].iloc[:, 1:].sum(axis=1) for w in watchhouse[d]], axis=1)
        votes = opinions.sum(axis=1)
        for threshold in thresholds:
            detect = (votes >= threshold).astype('uint8')
            # Reference column is taken from watchman 0's stone.
            exam_papers[threshold].loc[i_st, metrics] = time_span_metrics(st.iloc[:, 0], detect)
    # Rows are written in ascending threshold order, as before.
    for threshold in thresholds:
        results[str(datasets[d])].loc[f'ensemble(threshold={threshold})', metrics] = exam_papers[threshold].mean().values

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

## Let's see

In [16]:
# Print each dataset's label, then show its score table with the best F1 first.
for title, table in results.items():
    print(title)
    display(table.sort_values(by='f1_score', ascending=False))

GhlKasperskyDataset(E:\Datasets\GHL)


Unnamed: 0,precision,recall,f1_score
ensemble(threshold=5),0.785714,0.52381,0.597959
ensemble(threshold=4),0.415056,0.630952,0.441955
ensemble(threshold=3),0.313453,0.672619,0.376754
PcaLimitWatchman(n_features=12),0.410256,0.386905,0.366667
DirectLimitWatchman(n_features=12),0.34305,0.630952,0.366331
ensemble(threshold=6),1.0,0.244048,0.314286
ensemble(threshold=2),0.215977,0.714286,0.29657
DeepPredictWatchman(n_features=12),0.168142,0.714286,0.259547
ensemble(threshold=1),0.025862,1.0,0.050299
IsoForestWatchman(n_features=12),0.023114,0.827381,0.044873


TepHarvardDataset(E:\Datasets\TEP\dataverse)


Unnamed: 0,precision,recall,f1_score
ensemble(threshold=1),0.917139,0.818191,0.838499
LinearPredictWatchman(n_features=52),1.0,0.793294,0.821243
ensemble(threshold=2),1.0,0.763542,0.798357
ensemble(threshold=3),1.0,0.724471,0.7639
DeepPredictWatchman(n_features=52),1.0,0.687159,0.737626
ensemble(threshold=4),1.0,0.687798,0.730151
DirectLimitWatchman(n_features=52),1.0,0.651693,0.699804
ensemble(threshold=5),1.0,0.629077,0.682971
ensemble(threshold=6),1.0,0.581779,0.634125
PcaLimitWatchman(n_features=52),1.0,0.569884,0.618054


TepKasperskyDataset(E:\Datasets\TEP\kaspersky)


Unnamed: 0,precision,recall,f1_score
LinearPredictWatchman(n_features=53),0.786575,0.486306,0.457713
ensemble(threshold=3),0.733105,0.474329,0.42869
DirectLimitWatchman(n_features=53),0.736203,0.437018,0.413159
DeepPredictWatchman(n_features=53),0.67109,0.481813,0.405539
ensemble(threshold=2),0.635369,0.503349,0.404354
ensemble(threshold=4),0.756979,0.435101,0.402375
ensemble(threshold=5),0.776044,0.356763,0.350949
ensemble(threshold=6),0.764976,0.329143,0.324713
PcaLimitWatchman(n_features=53),0.917949,0.221242,0.227533
IsoForestWatchman(n_features=53),0.08879,1.0,0.134426


SwatItrustDataset(E:\Datasets\SWaT\datasetA1)


Unnamed: 0,precision,recall,f1_score
LinearPredictWatchman(n_features=51),1.0,0.420455,0.45679
DirectLimitWatchman(n_features=51),0.257732,1.0,0.370309
IsoForestWatchman(n_features=51),0.257732,1.0,0.370309
DeepPredictWatchman(n_features=51),0.257732,1.0,0.370309
ensemble(threshold=1),0.257732,1.0,0.370309
ensemble(threshold=2),0.257732,1.0,0.370309
ensemble(threshold=3),0.257732,1.0,0.370309
ensemble(threshold=4),0.257732,1.0,0.370309
ensemble(threshold=5),0.257732,1.0,0.370309
ensemble(threshold=6),0.257732,1.0,0.370309
