In [1]:
# Clear the bytecode cache of the local `utils` package before importing from it
# (presumably to make sure freshly edited helper modules are recompiled).
# Uses the standard library instead of the Windows-only `del` shell command so the
# notebook also runs on Linux/macOS. Unlike `del /Q`, this removes the directory
# itself as well; Python recreates it on the next import. A missing directory is
# silently ignored.
import shutil

shutil.rmtree("utils/__pycache__", ignore_errors=True)

In [2]:
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from utils.datasets import GhlKasperskyDataset, TepHarvardDataset, TepKasperskyDataset
from utils.custom_plots import plot_stacked

In [4]:
from utils.metrics import time_span_metrics

In [5]:
from utils.watchmen import IsolatingWatchman

## GHL

In [6]:
# Dataset wrapper for the GHL section; `ds` feeds the train/valid generators below.
ds = GhlKasperskyDataset()
# NOTE(review): presumably shuffles the cases and sets the validation/test split
# according to valid_test_ratio — confirm in utils.datasets.
ds.shake_not_stir(valid_test_ratio=0.4)

In [7]:
# Fresh, untrained detector for the GHL run. random_state is pinned, presumably so
# that repeated runs grow the same trees.
watchman = IsolatingWatchman(random_state=31)

In [8]:
# Stream the GHL training split through the detector: every item yielded by the
# generator is a 3-tuple of which only the first element (the training chunk) is
# used, and each chunk is handed to partial_fit in turn.
train_gen = ds.train_generator()
train_progress = tqdm(train_gen, total=1)  # total only sizes the progress bar
for train, _, _ in train_progress:
    watchman.partial_fit(train)

  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Display the fitted detector; its repr reports how many trees it holds after training.
watchman

IsolatingWatchman(n_trees=97)

In [10]:
# Score the fitted watchman on every GHL validation case and tabulate the metrics.
# Per-case results are collected in a plain dict keyed by the case label `info`
# and converted to a DataFrame once at the end, instead of enlarging the frame
# row by row through `.loc`, which copies the frame on each new label.
valid_gen = ds.valid_generator()
metric_names = ['precision', 'recall', 'f1_score']
case_scores = {}
for valid, faults, info in tqdm(valid_gen, total=19):  # total only sizes the progress bar
    detect = watchman.predict(valid)
    case_scores[info] = time_span_metrics(faults, detect)
    # Optional visual check of a single case, kept disabled.
    # NOTE(review): the earlier draft passed an undefined name `data` here;
    # `valid` is presumably the intended frame — confirm before enabling.
    # if max(faults) and max(detect):
    #     plot_stacked(valid, title=info, group='value_unit', faults=faults, detect=detect)

# One row per validation case, one float column per metric.
examine_list = pd.DataFrame.from_dict(
    case_scores, orient='index', columns=metric_names, dtype='float'
)

  0%|          | 0/19 [00:00<?, ?it/s]

In [11]:
# Column-wise average of each metric over all GHL validation cases.
examine_list.mean()

precision    0.013175
recall       0.828822
f1_score     0.025861
dtype: float64

## TEP Harvard

In [12]:
# Rebind `ds` to the TEP Harvard dataset; the cells below reuse the same names as
# the GHL section, so they must be run in order.
ds = TepHarvardDataset()
# NOTE(review): presumably shuffles the cases and sets the validation/test split;
# the meaning of balanced_test is not visible here — confirm in utils.datasets.
ds.shake_not_stir(valid_test_ratio=0.5, balanced_test=True)

In [13]:
# Replace the GHL detector with a fresh, untrained one for TEP Harvard (same pinned seed).
watchman = IsolatingWatchman(random_state=31)

In [14]:
# Stream the TEP Harvard training split through the detector: only the first
# element of each yielded 3-tuple (the training chunk) is passed to partial_fit.
train_gen = ds.train_generator()
train_progress = tqdm(train_gen, total=500)  # total only sizes the progress bar
for train, _, _ in train_progress:
    watchman.partial_fit(train)

  0%|          | 0/500 [00:00<?, ?it/s]

In [15]:
# Display the fitted detector; its repr reports how many trees it holds after training.
watchman

IsolatingWatchman(n_trees=500)

In [16]:
# Score the fitted watchman on every TEP Harvard validation case and tabulate the
# metrics. Per-case results are collected in a plain dict keyed by the case label
# `info` and converted to a DataFrame once at the end, instead of enlarging the
# frame row by row through `.loc`, which copies the frame on each new label.
valid_gen = ds.valid_generator()
metric_names = ['precision', 'recall', 'f1_score']
case_scores = {}
for valid, faults, info in tqdm(valid_gen, total=500):  # total only sizes the progress bar
    detect = watchman.predict(valid)
    case_scores[info] = time_span_metrics(faults, detect)
    # Optional visual check of a single case, kept disabled.
    # NOTE(review): the earlier draft passed an undefined name `data` here;
    # `valid` is presumably the intended frame — confirm before enabling.
    # if max(faults) and max(detect):
    #     plot_stacked(valid, title=info, group='value_unit', faults=faults, detect=detect)

# One row per validation case, one float column per metric.
examine_list = pd.DataFrame.from_dict(
    case_scores, orient='index', columns=metric_names, dtype='float'
)

  0%|          | 0/500 [00:00<?, ?it/s]

In [17]:
# Column-wise average of each metric over all TEP Harvard validation cases.
examine_list.mean()

precision    0.999978
recall       0.500208
f1_score     0.586213
dtype: float64

## TEP Kaspersky

In [53]:
# Rebind `ds` to the TEP Kaspersky dataset; the cells below reuse the same names
# as the earlier sections, so they must be run in order.
ds = TepKasperskyDataset()
# NOTE(review): presumably shuffles the cases and sets the validation/test split
# according to valid_test_ratio — confirm in utils.datasets.
ds.shake_not_stir(valid_test_ratio=0.4)

In [54]:
# Fresh detector for TEP Kaspersky. Both keyword arguments end up on the wrapped
# IsolationForest, as the `watchman.forest` repr in the next cell shows.
watchman = IsolatingWatchman(random_state=31, max_samples=2048)

In [55]:
# Inspect the wrapped IsolationForest before training: it starts with
# n_estimators=0 and warm_start=True, i.e. with no trees yet.
watchman.forest

IsolationForest(max_samples=2048, n_estimators=0, n_jobs=-1, random_state=31,
                warm_start=True)

In [56]:
# Stream the TEP Kaspersky training split through the detector: only the first
# element of each yielded 3-tuple (the training chunk) is passed to partial_fit,
# here with an explicit increment of 1 per call.
train_gen = ds.train_generator()
train_progress = tqdm(train_gen, total=400)  # total only sizes the progress bar
for train, _, _ in train_progress:
    watchman.partial_fit(train, increment=1)

  0%|          | 0/400 [00:00<?, ?it/s]

In [57]:
# Display the fitted detector; its repr reports how many trees it holds after training.
watchman

IsolatingWatchman(n_trees=400)

In [58]:
# Score the fitted watchman on every TEP Kaspersky validation case and tabulate
# the metrics. Per-case results are collected in a plain dict keyed by the case
# label `info` and converted to a DataFrame once at the end, instead of enlarging
# the frame row by row through `.loc`, which copies the frame on each new label.
valid_gen = ds.valid_generator()
metric_names = ['precision', 'recall', 'f1_score']
case_scores = {}
for valid, faults, info in tqdm(valid_gen, total=115):  # total only sizes the progress bar
    detect = watchman.predict(valid)
    case_scores[info] = time_span_metrics(faults, detect)
    # Optional visual check of a single case, kept disabled.
    # NOTE(review): the earlier draft passed an undefined name `data` here;
    # `valid` is presumably the intended frame — confirm before enabling.
    # if max(faults) and max(detect):
    #     plot_stacked(valid, title=info, group='value_unit', faults=faults, detect=detect)

# One row per validation case, one float column per metric.
examine_list = pd.DataFrame.from_dict(
    case_scores, orient='index', columns=metric_names, dtype='float'
)

  0%|          | 0/115 [00:00<?, ?it/s]

In [59]:
# Column-wise average of each metric over all TEP Kaspersky validation cases.
examine_list.mean()

precision    0.074594
recall       1.000000
f1_score     0.116174
dtype: float64