In [1]:
# Remove the bytecode cache of the local `utils` package, if it exists.
# Pure-Python replacement for a Windows-only `del` shell command, so this
# cell also runs on Linux/macOS. The path is relative to the notebook's
# working directory; a missing directory is silently ignored.
import shutil

shutil.rmtree('utils/__pycache__', ignore_errors=True)

In [2]:
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from utils.datasets import GhlKasperskyDataset, TepHarvardDataset, TepKasperskyDataset
from utils.watchmen import IsoForestWatchman
from utils.metrics import time_span_metrics
from utils.custom_plots import plot_stacked

## GHL

In [4]:
# Instantiate the GHL (Kaspersky) dataset wrapper and run its
# `shake_not_stir` step with a fixed seed.
# NOTE(review): `shake_not_stir` presumably shuffles/splits the data into
# train and validation parts — confirm in utils.datasets.
ds = GhlKasperskyDataset()
ds.shake_not_stir(random_state=84)

In [5]:
# Fresh detector for the GHL experiment: feature generation is switched off
# and the random seed is fixed.
# NOTE(review): presumably an Isolation Forest based detector — see utils.watchmen.
watchman = IsoForestWatchman(generate_features=False, random_state=31)

In [6]:
# First of the three training passes (prefit -> fit -> postfit): hand every
# training chunk to `watchman.prefit`. The second and third items yielded by
# the generator are not needed here.
for train_chunk, _, _ in tqdm(ds.train_generator()):
    watchman.prefit(train_chunk)

  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Second training pass: a new training generator is requested and every
# chunk is handed to `watchman.fit`.
for train_chunk, _, _ in tqdm(ds.train_generator()):
    watchman.fit(train_chunk)

  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Third training pass: iterate the training chunks once more and hand each
# one to `watchman.postfit`.
for train_chunk, _, _ in tqdm(ds.train_generator()):
    watchman.postfit(train_chunk)

  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Evaluate the watchman on the training chunks: one row per chunk (indexed
# by the chunk's `info`), holding whatever `time_span_metrics` returns for
# the true faults vs. the predictions.
# NOTE(review): the next cell rebinds `examine_list` before it is displayed,
# so these training-set metrics are never shown — confirm that is intended.
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for train_chunk, true_faults, chunk_info in tqdm(ds.train_generator()):
    predicted = watchman.predict(train_chunk)
    examine_list.loc[chunk_info] = time_span_metrics(true_faults, predicted)

  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Evaluate the watchman on the validation chunks: one row per chunk (indexed
# by the chunk's `info`), holding whatever `time_span_metrics` returns for
# the true faults vs. the predictions.
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid_chunk, true_faults, chunk_info in tqdm(ds.valid_generator()):
    predicted = watchman.predict(valid_chunk)
    examine_list.loc[chunk_info] = time_span_metrics(true_faults, predicted)

    # Disabled debugging aid: for the first chunk where `max(true_faults)` is
    # truthy and `sum(predicted) > 10`, dump the raw forest scores and stop.
    # if max(true_faults) and sum(predicted) > 10:
    #     print(watchman.forest.score_samples(valid_chunk))
    #     print(min(watchman.forest.score_samples(valid_chunk)))
    #     print(predicted.values)
    #     break

    # Disabled plotting aid.
    # NOTE(review): the scored chunk is assumed to be the data to plot — confirm.
    # plot_stacked(valid_chunk,
    #              title=chunk_info,
    #              group='value_unit',
    #              faults=true_faults,
    #              detect=predicted,
    #              )

  0%|          | 0/14 [00:00<?, ?it/s]

In [11]:
# Column-wise mean of the per-chunk metrics currently held in `examine_list`.
examine_list.mean()

precision    0.166667
recall       0.023810
f1_score     0.028571
dtype: float64

## TEP Harvard

In [12]:
# Switch `ds` to the TEP Harvard dataset wrapper and run its `shake_not_stir` step.
# NOTE(review): unlike the GHL section, no `random_state` is passed here, so
# the method's default applies — confirm the result is reproducible.
ds = TepHarvardDataset()
ds.shake_not_stir()

In [13]:
# Fresh detector for the TEP Harvard experiment (replaces the GHL one):
# feature generation is switched off and the random seed is fixed.
watchman = IsoForestWatchman(generate_features=False, random_state=31)

In [14]:
# First of the three training passes (prefit -> fit -> postfit): hand every
# training chunk to `watchman.prefit`. The second and third items yielded by
# the generator are not needed here.
for train_chunk, _, _ in tqdm(ds.train_generator()):
    watchman.prefit(train_chunk)

  0%|          | 0/500 [00:00<?, ?it/s]

In [15]:
# Second training pass: a new training generator is requested and every
# chunk is handed to `watchman.fit`.
for train_chunk, _, _ in tqdm(ds.train_generator()):
    watchman.fit(train_chunk)

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Third training pass: iterate the training chunks once more and hand each
# one to `watchman.postfit`.
for train_chunk, _, _ in tqdm(ds.train_generator()):
    watchman.postfit(train_chunk)

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Evaluate the watchman on the validation chunks: one row per chunk (indexed
# by the chunk's `info`), holding whatever `time_span_metrics` returns for
# the true faults vs. the predictions.
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid_chunk, true_faults, chunk_info in tqdm(ds.valid_generator()):
    predicted = watchman.predict(valid_chunk)
    examine_list.loc[chunk_info] = time_span_metrics(true_faults, predicted)

    # Disabled debugging aid: for the first chunk where `max(true_faults)` is
    # truthy and `sum(predicted) > 10`, dump the raw forest scores (and which
    # of them fall below -0.5) and stop.
    # if max(true_faults) and sum(predicted) > 10:
    #     print(watchman.forest.score_samples(valid_chunk) < -0.5)
    #     print(min(watchman.forest.score_samples(valid_chunk)))
    #     print(predicted.values)
    #     break

    # Disabled plotting aid for chunks where both `max(true_faults)` and
    # `max(predicted)` are truthy.
    # NOTE(review): the scored chunk is assumed to be the data to plot — confirm.
    # if max(true_faults) and max(predicted):
    #     plot_stacked(valid_chunk,
    #                  title=chunk_info,
    #                  group='value_unit',
    #                  faults=true_faults,
    #                  detect=predicted,
    #                  )

In [None]:
# Column-wise mean of the per-chunk metrics currently held in `examine_list`.
examine_list.mean()

## TEP Kaspersky

In [None]:
# Switch `ds` to the TEP Kaspersky dataset wrapper and run its `shake_not_stir` step.
# NOTE(review): unlike the GHL section, no `random_state` is passed here, so
# the method's default applies — confirm the result is reproducible.
ds = TepKasperskyDataset()
ds.shake_not_stir()

In [None]:
# Fresh detector for the TEP Kaspersky experiment with a fixed random seed.
# NOTE(review): the GHL and TEP Harvard sections pass `generate_features=False`;
# here the constructor default is used instead — confirm this is intended.
watchman = IsoForestWatchman(random_state=31)

In [None]:
# First of the three training passes (prefit -> fit -> postfit): hand every
# training chunk to `watchman.prefit`. The second and third items yielded by
# the generator are not needed here.
for train_chunk, _, _ in tqdm(ds.train_generator()):
    watchman.prefit(train_chunk)

In [None]:
# Second training pass: a new training generator is requested and every
# chunk is handed to `watchman.fit`.
for train_chunk, _, _ in tqdm(ds.train_generator()):
    watchman.fit(train_chunk)

In [None]:
# Third training pass: iterate the training chunks once more and hand each
# one to `watchman.postfit`.
for train_chunk, _, _ in tqdm(ds.train_generator()):
    watchman.postfit(train_chunk)

In [None]:
# Evaluate the watchman on the validation chunks: one row per chunk (indexed
# by the chunk's `info`), holding whatever `time_span_metrics` returns for
# the true faults vs. the predictions.
# The postfit pass is performed by the preceding cell, so this cell only
# evaluates, mirroring the validation cells of the GHL and TEP Harvard sections.
valid_gen = ds.valid_generator()
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid, faults, info in tqdm(valid_gen):
    detect = watchman.predict(valid)
    examine_list.loc[info] = time_span_metrics(faults, detect)

    # Disabled plotting aid for chunks where both `max(faults)` and
    # `max(detect)` are truthy.
    # NOTE(review): the scored chunk is assumed to be the data to plot — confirm.
    # if max(faults) and max(detect):
    #     plot_stacked(valid,
    #                  title=info,
    #                  group='value_unit',
    #                  faults=faults,
    #                  detect=detect,
    #                  )

In [None]:
# Column-wise mean of the per-chunk metrics currently held in `examine_list`.
examine_list.mean()