In [1]:
# Drop the cached bytecode of the local `utils` package before importing it.
# `shutil.rmtree` replaces the Windows-only `del /Q` shell call so the cell
# also runs on Linux/macOS kernels; `ignore_errors=True` keeps the cell quiet
# when the cache directory does not exist yet.
import shutil
shutil.rmtree('utils/__pycache__', ignore_errors=True)

In [2]:
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from utils.datasets import GhlKasperskyDataset, TepHarvardDataset, TepKasperskyDataset, SwatItrustDataset
from utils.custom_plots import plot_stacked
from utils.metrics import time_span_metrics
from utils.watchmen import LimitWatchman, LimitPcaWatchman, SpePcaWatchman, IsolatingWatchman, LinearPredictWatchman

In [4]:
# Single seed shared by the dataset split (`shake_kwargs['random_state']`)
# and by the watchmen that accept a `random_state` argument.
SEED = 1729

# Preparing datasets

In [5]:
# Benchmark datasets keyed by a small integer id (0, 1, 2, ...); the same ids
# are used as the outer keys of `watchmen` and `stones`.
datasets = dict(enumerate((
    GhlKasperskyDataset(),
    TepHarvardDataset(),
    TepKasperskyDataset(),
    SwatItrustDataset(),
)))

In [6]:
# Keyword arguments forwarded unchanged to every `shake_not_stir` call, so all
# stages (prefit, fit, detection) request the same seeded split.
shake_kwargs = dict(random_state=SEED, valid_test_ratio=0.3)

# Preparing watchmen

In [7]:
def make_watchmen():
    """Return a fresh set of baseline detectors keyed by watchman id.

    New instances are created on every call, so datasets never share a
    detector object.
    """
    return {
        0: LimitWatchman(),
        1: LimitPcaWatchman(n_components=3),
        2: SpePcaWatchman(n_components=3),
        3: IsolatingWatchman(random_state=SEED),
        4: LinearPredictWatchman(random_state=SEED),
    }


# Every dataset uses the same baseline configuration; deriving the outer keys
# from `datasets` keeps the two dicts in sync instead of repeating the literal.
# Tuning notes for dataset 1 (TepHarvardDataset): LimitWatchman does better
# with ewma='3 min', and LimitPcaWatchman / SpePcaWatchman do better with
# n_components=12.
watchmen = {d: make_watchmen() for d in datasets}

# Learning

## Prefit

In [8]:
# Prefit pass: feed every training chunk of a dataset to `prefit` of each of
# that dataset's watchmen.
for d, dataset in datasets.items():
    dataset.shake_not_stir(**shake_kwargs)
    train_chunks = tqdm(dataset.train_generator(), desc=f'Prefit on train {d}')
    for data, _, _ in train_chunks:
        for watchman in watchmen[d].values():
            watchman.prefit(data)

Prefit on train 0: 0it [00:00, ?it/s]

Prefit on train 1: 0it [00:00, ?it/s]

Prefit on train 2: 0it [00:00, ?it/s]

Prefit on train 3: 0it [00:00, ?it/s]

## Fit

In [9]:
# Fit pass: stream the training chunks again, this time through `partial_fit`
# of each of the dataset's watchmen.
for d, dataset in datasets.items():
    dataset.shake_not_stir(**shake_kwargs)
    train_chunks = tqdm(dataset.train_generator(), desc=f'Fit on train {d}')
    for data, _, _ in train_chunks:
        for watchman in watchmen[d].values():
            watchman.partial_fit(data)

Fit on train 0: 0it [00:00, ?it/s]

Fit on train 1: 0it [00:00, ?it/s]

Fit on train 2: 0it [00:00, ?it/s]

Fit on train 3: 0it [00:00, ?it/s]

# Examine

## Throw stones

In [10]:
# stones[d][w] will hold one frame per validation chunk for watchman `w` on
# dataset `d`; start with an empty list for every (dataset, watchman) pair.
stones = {d: {w: list() for w in watchmen[d]} for d in datasets}

In [11]:
# Run every watchman over the validation chunks and keep, per chunk, a frame
# whose first column(s) come from `faults` and whose remaining columns are the
# watchman's detections.
for d, dataset in datasets.items():
    dataset.shake_not_stir(**shake_kwargs)
    # Start from empty lists so that re-running this cell replaces the stored
    # frames instead of appending a second copy of each one.
    stones[d] = {w: [] for w in watchmen[d]}
    for data, faults, info in tqdm(dataset.valid_generator(), desc=f'Detect on validation {d}'):
        for w, watchman in watchmen[d].items():
            detect = watchman.predict(data)
            verdict = pd.concat([faults, detect], axis=1)
            # The chunk descriptor travels with the frame as its index name;
            # the scoring cell uses it as the row label.
            verdict.index.name = info
            stones[d][w].append(verdict)

Detect on validation 0: 0it [00:00, ?it/s]

Detect on validation 1: 0it [00:00, ?it/s]

Detect on validation 2: 0it [00:00, ?it/s]

Detect on validation 3: 0it [00:00, ?it/s]

## Individual results

In [12]:
# Score each watchman on its own: one table per dataset, one row per watchman,
# each row being the column-wise mean of the per-chunk metrics.
metrics = ('precision', 'recall', 'f1_score')
indi_results = dict()
for d, dataset in datasets.items():
    title = str(dataset)
    indi_results[title] = pd.DataFrame(columns=metrics)
    for w, watchman in watchmen[d].items():
        # One row of metrics per stored validation frame, labelled by the
        # frame's index name.
        exam_paper = pd.DataFrame(columns=metrics)
        for st in stones[d][w]:
            truth = st.iloc[:, 0]      # first column: reference fault signal
            alarms = st.iloc[:, 1:]    # remaining columns: watchman detections
            exam_paper.loc[st.index.name, metrics] = time_span_metrics(truth, alarms)
        indi_results[title].loc[str(watchman), metrics] = exam_paper.mean().values

In [13]:
# Print each dataset title followed by its per-watchman metrics table.
for title, table in indi_results.items():
    print(title)
    display(table)

GhlKasperskyDataset(E:\Datasets\GHL)


Unnamed: 0,precision,recall,f1_score
LimitWatchman(ewma=None),0.207358,0.638095,0.281676
LimitPcaWatchman(n_components=3),0.284864,0.328571,0.298718
SpePcaWatchman(n_components=3),0.0,0.0,0.0
IsolatingWatchman(n_trees=17),0.017045,1.0,0.033429
LinearPredictWatchman(n_features=12),0.014094,1.0,0.02773


TepHarvardDataset(E:\Datasets\TEP\dataverse)


Unnamed: 0,precision,recall,f1_score
LimitWatchman(ewma=None),1.0,0.6417,0.689518
LimitPcaWatchman(n_components=3),1.0,0.408214,0.463065
SpePcaWatchman(n_components=3),1.0,0.546925,0.592227
IsolatingWatchman(n_trees=500),0.845354,0.4658,0.530065
LinearPredictWatchman(n_features=52),0.9996,0.750827,0.788164


TepKasperskyDataset(E:\Datasets\TEP\kaspersky)


Unnamed: 0,precision,recall,f1_score
LimitWatchman(ewma=None),0.690657,0.423134,0.384283
LimitPcaWatchman(n_components=3),0.75,0.209302,0.2
SpePcaWatchman(n_components=3),0.991111,0.215745,0.221449
IsolatingWatchman(n_trees=400),0.079134,1.0,0.121627
LinearPredictWatchman(n_features=53),0.332735,0.460101,0.23865


SwatItrustDataset(E:\Datasets\SWaT\dataset12)


Unnamed: 0,precision,recall,f1_score
LimitWatchman(ewma=None),0.226415,1.0,0.369231
LimitPcaWatchman(n_components=3),,0.0,0.0
SpePcaWatchman(n_components=3),,0.0,0.0
IsolatingWatchman(n_trees=6),0.222222,1.0,0.363636
LinearPredictWatchman(n_features=51),0.375,0.25,0.3


## Ensembling results

### Threshold
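A watchman can flag an anomaly on several features at once. Here the flags of all watchmen are added up, and the ensemble raises an alarm when the total number of flagged features reaches the threshold.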

In [14]:
# Vote-count ensemble: sum the detection columns of all watchmen and raise an
# alarm wherever the total reaches the threshold (1 .. max_threshold).
metrics = ('precision', 'recall', 'f1_score')
union_results = dict()
max_threshold = 10
thresholds = range(1, max_threshold + 1)
for d, dataset in datasets.items():
    exam_papers = {threshold: pd.DataFrame(columns=metrics) for threshold in thresholds}
    # Watchman 0's frames are used for the chunk count and the reference column.
    for i_st, st in enumerate(stones[d][0]):
        # One column per watchman: how many of its detection columns fired.
        opinions = pd.concat(
            [stones[d][w][i_st].iloc[:, 1:].sum(axis=1) for w in watchmen[d]],
            axis=1,
        )
        votes = opinions.sum(axis=1)
        for threshold in thresholds:
            detect = (votes >= threshold).astype('uint8')
            exam_papers[threshold].loc[i_st, metrics] = time_span_metrics(st.iloc[:, 0], detect)
    summary = pd.DataFrame(columns=metrics)
    for threshold in thresholds:
        summary.loc[threshold, metrics] = exam_papers[threshold].mean().values
    union_results[str(dataset)] = summary

In [15]:
# Print each dataset title followed by its metrics-per-threshold table.
for title, table in union_results.items():
    print(title)
    display(table)

GhlKasperskyDataset(E:\Datasets\GHL)


Unnamed: 0,precision,recall,f1_score
1,0.014094,1.0,0.02773
2,0.016777,1.0,0.032914
3,0.187554,0.666667,0.275014
4,0.292491,0.3,0.278211
5,,0.0,0.0
6,,0.0,0.0
7,,0.0,0.0
8,,0.0,0.0
9,,0.0,0.0
10,,0.0,0.0


TepHarvardDataset(E:\Datasets\TEP\dataverse)


Unnamed: 0,precision,recall,f1_score
1,0.919735,0.788545,0.820395
2,1.0,0.709699,0.756291
3,1.0,0.665969,0.711919
4,1.0,0.591574,0.646704
5,1.0,0.532897,0.585395
6,1.0,0.501868,0.553796
7,1.0,0.480685,0.531019
8,1.0,0.45748,0.502698
9,1.0,0.443919,0.490371
10,1.0,0.392679,0.441956


TepKasperskyDataset(E:\Datasets\TEP\kaspersky)


Unnamed: 0,precision,recall,f1_score
1,0.079134,1.0,0.121627
2,0.317483,0.466356,0.233071
3,0.564314,0.411261,0.328799
4,0.76084,0.277988,0.269971
5,0.81312,0.221642,0.200062
6,0.768485,0.21526,0.188665
7,0.828904,0.202609,0.19479
8,0.843318,0.202253,0.194378
9,0.842949,0.200889,0.191928
10,0.846154,0.200524,0.19131


SwatItrustDataset(E:\Datasets\SWaT\dataset12)


Unnamed: 0,precision,recall,f1_score
1,0.164303,1.0,0.277972
2,0.184636,0.95,0.307903
3,0.361111,0.416667,0.383333
4,0.666667,0.275,0.373626
5,0.714286,0.225,0.324561
6,1.0,0.183333,0.309524
7,1.0,0.091667,0.167832
8,1.0,0.091667,0.167832
9,1.0,0.091667,0.167832
10,1.0,0.091667,0.167832


### Probability
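Something like a probability for every watchman: the share of its features that raise an alarm. The shares of all watchmen are summed and compared with the threshold.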

In [16]:
# Share-based ensemble: for every watchman take the fraction of its detection
# columns that fired, add the fractions up over all watchmen, and raise an
# alarm wherever that score reaches the level.
metrics = ('precision', 'recall', 'f1_score')
union_results = dict()
# Levels 0.05, 0.10, ..., 0.50. Dividing by 100 (instead of multiplying by
# 0.01) yields the correctly rounded float for each level; e.g. 35 * 0.01 is
# 0.35000000000000003, which would miss a score of exactly 0.35 and could not
# be looked up later as `.loc[0.35]`.
levels = [percent / 100 for percent in range(5, 51, 5)]
for d, dataset in datasets.items():
    exam_papers = {level: pd.DataFrame(columns=metrics) for level in levels}
    # Watchman 0's frames are used for the chunk count and the reference column.
    for i_st, st in enumerate(stones[d][0]):
        # One column per watchman: fraction of its detection columns that fired.
        opinions = pd.concat(
            [stones[d][w][i_st].iloc[:, 1:].mean(axis=1) for w in watchmen[d]],
            axis=1,
        )
        # NOTE(review): the fractions are summed, so the score ranges from 0 to
        # the number of watchmen rather than 0 to 1 — confirm that a sum (and
        # not a mean over watchmen) is intended for these levels.
        score = opinions.sum(axis=1)
        for level in levels:
            detect = (score >= level).astype('uint8')
            exam_papers[level].loc[i_st, metrics] = time_span_metrics(st.iloc[:, 0], detect)
    summary = pd.DataFrame(columns=metrics)
    for level in levels:
        summary.loc[level, metrics] = exam_papers[level].mean().values
    union_results[str(dataset)] = summary

In [17]:
# Print each dataset title followed by its metrics-per-level table.
for title, table in union_results.items():
    print(title)
    display(table)

GhlKasperskyDataset(E:\Datasets\GHL)


Unnamed: 0,precision,recall,f1_score
0.05,0.014094,1.0,0.02773
0.1,0.016777,1.0,0.032914
0.15,0.016777,1.0,0.032914
0.2,0.016954,1.0,0.033254
0.25,0.016998,1.0,0.03334
0.3,0.016998,1.0,0.03334
0.35,0.016998,1.0,0.03334
0.4,0.016998,1.0,0.03334
0.45,0.01704,1.0,0.03342
0.5,0.017045,1.0,0.033429


TepHarvardDataset(E:\Datasets\TEP\dataverse)


Unnamed: 0,precision,recall,f1_score
0.05,0.906398,0.700685,0.747219
0.1,0.89649,0.616359,0.676363
0.15,0.896072,0.61545,0.675487
0.2,0.896001,0.615347,0.675371
0.25,0.896,0.615344,0.675367
0.3,0.896,0.615344,0.675367
0.35,0.895984,0.615331,0.675353
0.4,0.89588,0.614573,0.674556
0.45,0.89581,0.613373,0.673511
0.5,0.89581,0.613363,0.673502


TepKasperskyDataset(E:\Datasets\TEP\kaspersky)


Unnamed: 0,precision,recall,f1_score
0.05,0.079134,1.0,0.121627
0.1,0.079134,1.0,0.121627
0.15,0.079134,1.0,0.121627
0.2,0.079134,1.0,0.121627
0.25,0.079134,1.0,0.121627
0.3,0.079134,1.0,0.121627
0.35,0.079134,1.0,0.121627
0.4,0.079134,1.0,0.121627
0.45,0.079134,1.0,0.121627
0.5,0.079134,1.0,0.121627


SwatItrustDataset(E:\Datasets\SWaT\dataset12)


Unnamed: 0,precision,recall,f1_score
0.05,0.164303,1.0,0.277972
0.1,0.164303,1.0,0.277972
0.15,0.164303,1.0,0.277972
0.2,0.164303,1.0,0.277972
0.25,0.164303,1.0,0.277972
0.3,0.164303,1.0,0.277972
0.35,0.164303,1.0,0.277972
0.4,0.164303,1.0,0.277972
0.45,0.164303,1.0,0.277972
0.5,0.164303,1.0,0.277972
