In [1]:
# Remove cached bytecode of the local `utils` package before it is imported
# (presumably so that edits to the package sources are picked up).
# Pure-Python replacement for the Windows-only `!del /Q` shell command, so the
# cell also runs on Linux/macOS. The cache directory itself is removed too;
# Python recreates it on the next import.
import shutil
from pathlib import Path

# ignore_errors=True: a missing cache directory (fresh checkout) is not an error.
shutil.rmtree(Path('utils') / '__pycache__', ignore_errors=True)

In [2]:
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from utils.datasets import GhlKasperskyDataset, TepHarvardDataset, TepKasperskyDataset, SwatItrustDataset
from utils.custom_plots import plot_stacked
from utils.metrics import time_span_metrics
from utils.watchmen import Watchman
from utils.watchmen import DirectLimitWatchman, PcaLimitWatchman
from utils.watchmen import IsoForestWatchman
from utils.watchmen import LinearPredictWatchman, DeepPredictWatchman

In [4]:
# Single seed shared across the notebook: it is passed as `random_state` to
# shake_not_stir() for every dataset and to every watchman that accepts one.
SEED = 1729

# Preparing datasets

In [5]:
# Registry of the benchmark datasets. The integer key (0..3, in the order
# listed) is the dataset id used by every per-dataset loop in this notebook.
datasets = dict(enumerate((
    GhlKasperskyDataset(),
    TepHarvardDataset(),
    TepKasperskyDataset(),
    SwatItrustDataset(),
)))

In [6]:
# Keyword arguments forwarded unchanged to shake_not_stir() of every dataset,
# so all datasets are shaken with the same seed and valid-to-test ratio.
shake_kwargs = dict(random_state=SEED, valid2test_ratio=0.3)
for dataset in datasets.values():
    dataset.shake_not_stir(**shake_kwargs)

# Preparing watchhouse

In [7]:
def _new_watch():
    """Build a fresh roster of watchmen for one dataset.

    Returns:
        dict: watchman id (0..5) -> newly constructed watchman instance.
        Every call creates new objects, so rosters of different datasets
        never share state. Watchmen that accept `random_state` get SEED.
    """
    return {
        0: Watchman(),
        1: DirectLimitWatchman(),
        2: PcaLimitWatchman(),
        3: IsoForestWatchman(random_state=SEED),
        4: LinearPredictWatchman(random_state=SEED),
        5: DeepPredictWatchman(random_state=SEED),
    }


# watchhouse[dataset id][watchman id] -> watchman. Each dataset gets its own
# independent copy of the same roster; the keys mirror those of `datasets`.
watchhouse = {
    0: _new_watch(),  # GhlKasperskyDataset
    1: _new_watch(),  # TepHarvardDataset
    2: _new_watch(),  # TepKasperskyDataset
    3: _new_watch(),  # SwatItrustDataset
}

# Learning

## Prefit

In [8]:
# Prefit pass: every training chunk of a dataset is handed to prefit() of each
# watchman assigned to that dataset. Only the first element yielded by
# train_generator() is used here.
for d, dataset in datasets.items():
    train_chunks = tqdm(dataset.train_generator(), desc=f'Prefit on train {d}')
    for data, _, _ in train_chunks:
        for watchman in watchhouse[d].values():
            watchman.prefit(data)

Prefit on train 0:   0%|          | 0/1 [00:00<?, ?it/s]

Prefit on train 1:   0%|          | 0/500 [00:00<?, ?it/s]

Prefit on train 2:   0%|          | 0/400 [00:00<?, ?it/s]

Prefit on train 3:   0%|          | 0/1 [00:00<?, ?it/s]

## Fit

In [9]:
# Fit pass: every training chunk of a dataset is handed to fit() of each
# watchman assigned to that dataset. Only the first element yielded by
# train_generator() is used here.
for d, dataset in datasets.items():
    train_chunks = tqdm(dataset.train_generator(), desc=f'Fit on train {d}')
    for data, _, _ in train_chunks:
        for watchman in watchhouse[d].values():
            watchman.fit(data)

Fit on train 0:   0%|          | 0/1 [00:00<?, ?it/s]

Fit on train 1:   0%|          | 0/500 [00:00<?, ?it/s]

Fit on train 2:   0%|          | 0/400 [00:00<?, ?it/s]

Fit on train 3:   0%|          | 0/1 [00:00<?, ?it/s]

## Postfit

In [None]:
# Postfit pass: every training chunk of a dataset is handed to postfit() of
# each watchman assigned to that dataset.
# The progress-bar label used to read 'Fit on train' (copied from the Fit
# cell), which made this pass indistinguishable from the previous one.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# was not run for the outputs shown further down.
for d in datasets:
    for data, _, _ in tqdm(datasets[d].train_generator(), desc=f'Postfit on train {d}'):
        for w in watchhouse[d]:
            watchhouse[d][w].postfit(data)

## Save

In [10]:
# Persist every watchman via dump(); the class name of the dataset it belongs
# to is passed as the only argument.
for d, dataset in datasets.items():
    dataset_name = dataset.__class__.__name__
    for watchman in watchhouse[d].values():
        watchman.dump(dataset_name)

# Self-checking
Check the watchhouse on the train set. Theoretically, all metrics should be NaN.

## Load

In [8]:
# Restore every watchman via load(), passing the class name of the dataset it
# belongs to (the same argument that dump() receives in the Save section).
for d, dataset in datasets.items():
    dataset_name = dataset.__class__.__name__
    for watchman in watchhouse[d].values():
        watchman.load(dataset_name)

## Throw stones

In [9]:
# stones[dataset id][watchman id] -> list that will collect one result frame
# per training chunk; starts empty for every (dataset, watchman) pair.
stones = {d: {w: [] for w in watchhouse[d]} for d in datasets}

In [10]:
for d in datasets:
    for data, faults, info in tqdm(datasets[d].train_generator(), desc=f'Detect on train {d}'):
        for w in watchhouse[d]:
            detect = watchhouse[d][w].predict(data)
            stones[d][w].append(pd.concat([faults, detect], axis=1))
            stones[d][w][-1].index.name = info

Detect on train 0:   0%|          | 0/1 [00:00<?, ?it/s]

Detect on train 1:   0%|          | 0/500 [00:00<?, ?it/s]

Detect on train 2:   0%|          | 0/400 [00:00<?, ?it/s]

Detect on train 3:   0%|          | 0/1 [00:00<?, ?it/s]

## Checks

In [11]:
# Names of the score columns, reused as the column indexer of every results
# table below.
metrics = ('precision', 'recall', 'f1_score')

# One empty results table per dataset, keyed by the dataset's str() form.
checks = {}
for dataset in datasets.values():
    checks[str(dataset)] = pd.DataFrame(columns=metrics)

In [12]:
# alone
for d in datasets:
    for w in tqdm(watchhouse[d], desc=f'Collect stones d{d}'):
        exam_paper = pd.DataFrame(columns=metrics)
        for i_st, st in enumerate(stones[d][w]):
            exam_paper.loc[i_st, metrics] = time_span_metrics(st.iloc[:, 0], st.iloc[:, 1:])
        checks[str(datasets[d])].loc[str(watchhouse[d][w]), metrics] = exam_paper.mean().values

Collect stones d0:   0%|          | 0/6 [00:00<?, ?it/s]

Collect stones d1:   0%|          | 0/6 [00:00<?, ?it/s]

Collect stones d2:   0%|          | 0/6 [00:00<?, ?it/s]

Collect stones d3:   0%|          | 0/6 [00:00<?, ?it/s]

In [13]:
# ensemble
max_threshold = 5
for d in datasets:
    exam_papers = dict()
    for threshold in range(1, max_threshold+1):
        exam_papers[threshold] = pd.DataFrame(columns=metrics)
    for i_st, st in enumerate(tqdm(stones[d][0], desc=f'Ensembling stones d{d}')):
        opinions = pd.concat([stones[d][w][i_st].iloc[:, 1:].sum(axis=1) for w in watchhouse[d]], axis=1)
        for threshold in range(1, max_threshold+1):
            detect = (opinions.sum(axis=1) >= threshold).astype('uint8')
            exam_papers[threshold].loc[i_st, metrics] = time_span_metrics(st.iloc[:, 0], detect)
    for threshold in range(1, max_threshold+1):
        checks[str(datasets[d])].loc[f'ensemble(threshold={threshold})', metrics] = exam_papers[threshold].mean().values

Ensembling stones d0:   0%|          | 0/1 [00:00<?, ?it/s]

Ensembling stones d1:   0%|          | 0/500 [00:00<?, ?it/s]

Ensembling stones d2:   0%|          | 0/400 [00:00<?, ?it/s]

Ensembling stones d3:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
# Show the metrics table of every dataset under its name.
for dataset_name, table in checks.items():
    print(dataset_name)
    display(table)

GhlKasperskyDataset(E:\Datasets\GHL)


Unnamed: 0,precision,recall,f1_score
Watchman(n_features=12),,,
DirectLimitWatchman(n_features=12),,,
PcaLimitWatchman(n_features=12),,,
IsoForestWatchman(n_features=12),0.0,,0.0
LinearPredictWatchman(n_features=12),,,
DeepPredictWatchman(n_features=12),,,
ensemble(threshold=1),0.0,,0.0
ensemble(threshold=2),,,
ensemble(threshold=3),,,
ensemble(threshold=4),,,


TepHarvardDataset(E:\Datasets\TEP\dataverse)


Unnamed: 0,precision,recall,f1_score
Watchman(n_features=52),,,
DirectLimitWatchman(n_features=52),,,
PcaLimitWatchman(n_features=52),,,
IsoForestWatchman(n_features=52),0.0,,0.0
LinearPredictWatchman(n_features=52),0.0,,0.0
DeepPredictWatchman(n_features=52),0.0,,0.0
ensemble(threshold=1),0.0,,0.0
ensemble(threshold=2),0.0,,0.0
ensemble(threshold=3),0.0,,0.0
ensemble(threshold=4),0.0,,0.0


TepKasperskyDataset(E:\Datasets\TEP\kaspersky)


Unnamed: 0,precision,recall,f1_score
Watchman(n_features=53),,,
DirectLimitWatchman(n_features=53),,,
PcaLimitWatchman(n_features=53),,,
IsoForestWatchman(n_features=53),0.0,,0.0
LinearPredictWatchman(n_features=53),0.0,,0.0
DeepPredictWatchman(n_features=53),0.0,,0.0
ensemble(threshold=1),0.0,,0.0
ensemble(threshold=2),0.0,,0.0
ensemble(threshold=3),0.0,,0.0
ensemble(threshold=4),0.0,,0.0


SwatItrustDataset(E:\Datasets\SWaT\datasetA1)


Unnamed: 0,precision,recall,f1_score
Watchman(n_features=51),,,
DirectLimitWatchman(n_features=51),,,
PcaLimitWatchman(n_features=51),,,
IsoForestWatchman(n_features=51),0.0,,0.0
LinearPredictWatchman(n_features=51),,,
DeepPredictWatchman(n_features=51),,,
ensemble(threshold=1),0.0,,0.0
ensemble(threshold=2),,,
ensemble(threshold=3),,,
ensemble(threshold=4),,,


IsoForestWatchman, LinearPredictWatchman and DeepPredictWatchman detect anomalies on the train set, especially for the TEP datasets.

What threshold should we use?

## Let's look closer

In [67]:
# One empty statistics table per dataset, keyed by the dataset's str() form;
# the columns are the summary statistics filled in by the next cell.
count_columns = ['max', 'median', 'q75', 'q95', 'q98', 'q99']
counts = {str(dataset): pd.DataFrame(columns=count_columns) for dataset in datasets.values()}

In [68]:
# calc how much detect per watchman
for d in datasets:
    for w in tqdm(watchhouse[d], desc=f'Count stones d{d}'):
        exam_paper = pd.DataFrame(columns=['max', 'median', 'q75', 'q95', 'q98', 'q99'])
        for i_st, st in enumerate(stones[d][w]):
            exam_paper.loc[i_st, 'max'] = st.iloc[:, 1:].sum(axis=1).max()
            exam_paper.loc[i_st, 'median'] = st.iloc[:, 1:].sum(axis=1).median()
            exam_paper.loc[i_st, 'q75'] = st.iloc[:, 1:].sum(axis=1).quantile(q=0.75)
            exam_paper.loc[i_st, 'q95'] = st.iloc[:, 1:].sum(axis=1).quantile(q=0.95)
            exam_paper.loc[i_st, 'q98'] = st.iloc[:, 1:].sum(axis=1).quantile(q=0.98)
            exam_paper.loc[i_st, 'q99'] = st.iloc[:, 1:].sum(axis=1).quantile(q=0.99)
        counts[str(datasets[d])].loc[str(watchhouse[d][w]), 'max'] = exam_paper['max'].max()
        counts[str(datasets[d])].loc[str(watchhouse[d][w]), 'median'] = exam_paper['median'].median()
        counts[str(datasets[d])].loc[str(watchhouse[d][w]), 'q75'] = exam_paper['q75'].mean()
        counts[str(datasets[d])].loc[str(watchhouse[d][w]), 'q95'] = exam_paper['q95'].mean()
        counts[str(datasets[d])].loc[str(watchhouse[d][w]), 'q98'] = exam_paper['q98'].mean()
        counts[str(datasets[d])].loc[str(watchhouse[d][w]), 'q99'] = exam_paper['q99'].mean()

Count stones d0:   0%|          | 0/6 [00:00<?, ?it/s]

Count stones d1:   0%|          | 0/6 [00:00<?, ?it/s]

Count stones d2:   0%|          | 0/6 [00:00<?, ?it/s]

Count stones d3:   0%|          | 0/6 [00:00<?, ?it/s]

In [69]:
# Show the detection-count table of every dataset under its name.
# Iterates `counts` itself; the previous version looped over the keys of the
# unrelated `checks` dict, which only worked because both share the same keys.
for dataset_name, table in counts.items():
    print(dataset_name)
    display(table)

GhlKasperskyDataset(E:\Datasets\GHL)


Unnamed: 0,max,median,q75,q95,q98,q99
Watchman(n_features=12),0,0.0,0.0,0.0,0.0,0.0
DirectLimitWatchman(n_features=12),0,0.0,0.0,0.0,0.0,0.0
PcaLimitWatchman(n_features=12),0,0.0,0.0,0.0,0.0,0.0
IsoForestWatchman(n_features=12),1,0.0,0.0,1.0,1.0,1.0
LinearPredictWatchman(n_features=12),0,0.0,0.0,0.0,0.0,0.0
DeepPredictWatchman(n_features=12),0,0.0,0.0,0.0,0.0,0.0


TepHarvardDataset(E:\Datasets\TEP\dataverse)


Unnamed: 0,max,median,q75,q95,q98,q99
Watchman(n_features=52),0,0.0,0.0,0.0,0.0,0.0
DirectLimitWatchman(n_features=52),0,0.0,0.0,0.0,0.0,0.0
PcaLimitWatchman(n_features=52),0,0.0,0.0,0.0,0.0,0.0
IsoForestWatchman(n_features=52),1,0.0,0.002,0.2591,0.47044,0.71606
LinearPredictWatchman(n_features=52),2,0.0,0.0,0.0,0.0,0.0
DeepPredictWatchman(n_features=52),8,0.0,0.0,0.006,0.87492,1.76306


TepKasperskyDataset(E:\Datasets\TEP\kaspersky)


Unnamed: 0,max,median,q75,q95,q98,q99
Watchman(n_features=53),0,0.0,0.0,0.0,0.0,0.0
DirectLimitWatchman(n_features=53),0,0.0,0.0,0.0,0.0,0.0
PcaLimitWatchman(n_features=53),0,0.0,0.0,0.0,0.0,0.0
IsoForestWatchman(n_features=53),1,1.0,1.0,1.0,1.0,1.0
LinearPredictWatchman(n_features=53),5,0.0,0.0,0.1,0.105,0.1175
DeepPredictWatchman(n_features=53),24,0.0,2.575,3.430625,4.34505,4.847625


SwatItrustDataset(E:\Datasets\SWaT\datasetA1)


Unnamed: 0,max,median,q75,q95,q98,q99
Watchman(n_features=51),0,0.0,0.0,0.0,0.0,0.0
DirectLimitWatchman(n_features=51),0,0.0,0.0,0.0,0.0,0.0
PcaLimitWatchman(n_features=51),0,0.0,0.0,0.0,0.0,0.0
IsoForestWatchman(n_features=51),1,1.0,1.0,1.0,1.0,1.0
LinearPredictWatchman(n_features=51),0,0.0,0.0,0.0,0.0,0.0
DeepPredictWatchman(n_features=51),0,0.0,0.0,0.0,0.0,0.0


Deep