In [1]:
!del /Q utils\__pycache__

In [2]:
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from utils.datasets import GhlKasperskyDataset, TepHarvardDataset, TepKasperskyDataset
from utils.watchmen import IsolatingWatchman
from utils.metrics import time_span_metrics
from utils.custom_plots import plot_stacked

## GHL

In [4]:
ds = GhlKasperskyDataset()
ds.shake_not_stir(valid_test_ratio=0.4)

In [5]:
watchman = IsolatingWatchman(random_state=31, contamination=0.01)

In [6]:
train_gen = ds.train_generator()
for train, _, _ in tqdm(train_gen, total=1):
    watchman.partial_fit(train)

  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# pd.Series(watchman.forest.predict(train)).replace({1: 0, -1: 1}).value_counts()

In [8]:
# min(watchman.forest.score_samples(train))

In [9]:
valid_gen = ds.valid_generator()
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid, faults, info in tqdm(valid_gen, total=19):
    detect = watchman.predict(valid)
    examine_list.loc[info] = time_span_metrics(faults, detect)
#     if max(faults) and sum(detect)>10:
#         print(watchman.forest.score_samples(valid))
#         print(min(watchman.forest.score_samples(valid)))
#         print(detect.values)
#         break

#         plot_stacked(data,
#                      title=info,
#                      group='value_unit',
#                      faults=faults,
#                      detect=detect,
#                     )

  0%|          | 0/19 [00:00<?, ?it/s]

In [10]:
examine_list.mean()

precision    0.104444
recall       0.140351
f1_score     0.103509
dtype: float64

In [11]:
stop

NameError: name 'stop' is not defined

## TEP Harvard

In [None]:
ds = TepHarvardDataset()
ds.shake_not_stir(valid_test_ratio=0.5, balanced_test=True)

In [None]:
watchman = IsolatingWatchman(random_state=31)

In [None]:
train_gen = ds.train_generator()
for train, _, _ in tqdm(train_gen, total=500):
    watchman.partial_fit(train)

In [None]:
pd.Series(watchman.forest.predict(train)).replace({1: 0, -1: 1}).value_counts()

In [None]:
valid_gen = ds.valid_generator()
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid, faults, info in tqdm(valid_gen, total=500):
    detect = watchman.predict(valid)
    examine_list.loc[info] = time_span_metrics(faults, detect)
#     if max(faults) and sum(detect)>10:
#         print(watchman.forest.score_samples(valid)<-0.5)
#         print(min(watchman.forest.score_samples(valid)))
#         print(detect.values)
#         break
#     if max(faults) and max(detect):
#         plot_stacked(data,
#                      title=info,
#                      group='value_unit',
#                      faults=faults,
#                      detect=detect,
#                     )

In [None]:
examine_list.mean()

## TEP Kaspersky

In [None]:
ds = TepKasperskyDataset()
ds.shake_not_stir(valid_test_ratio=0.4)

In [None]:
watchman = IsolatingWatchman(random_state=31, max_samples=1024)

In [None]:
watchman.forest

In [None]:
train_gen = ds.train_generator()
for train, _, _ in tqdm(train_gen, total=400):
    watchman.partial_fit(train, increment=1)

In [None]:
watchman

In [None]:
valid_gen = ds.valid_generator()
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid, faults, info in tqdm(valid_gen, total=115):
    detect = watchman.predict(valid)
    examine_list.loc[info] = time_span_metrics(faults, detect)
#     if max(faults) and max(detect):
#         plot_stacked(data,
#                      title=info,
#                      group='value_unit',
#                      faults=faults,
#                      detect=detect,
#                     )

In [None]:
examine_list.mean()