In [1]:
!del /Q utils\__pycache__

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from utils.datasets import GhlKasperskyDataset, TepHarvardDataset, TepKasperskyDataset
from utils.watchmen import IsolatingWatchman
from utils.metrics import time_span_metrics
from utils.custom_plots import plot_stacked

## GHL

In [4]:
ds = GhlKasperskyDataset()
ds.shake_not_stir(valid_test_ratio=0.4)

### Without

In [5]:
watchman = IsolatingWatchman(random_state=31)

for train, _, _ in tqdm(ds.train_generator()):
    watchman.partial_fit(train)

exam_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'])
for valid, faults, info in tqdm(ds.valid_generator()):
    detect = watchman.predict(valid)
    exam_list.loc[info] = time_span_metrics(faults, detect)
    
exam_list.mean()

0it [00:00, ?it/s]

0it [00:00, ?it/s]

precision    0.012276
recall       1.000000
f1_score     0.024230
dtype: float64

In [6]:
# plot_stacked(test, faults=faults)

### First and second derivatives

In [7]:
def add_derivatives(data: pd.DataFrame) -> pd.DataFrame:
    diff1 = data.diff()
    diff1.columns = data.columns + '_d1'
    diff2 = diff1.diff()
    diff2.columns = data.columns + '_d2'
    return pd.concat([data, diff1, diff2], axis=1).fillna(0)

In [8]:
watchman = IsolatingWatchman(random_state=31)

for train, _, _ in tqdm(ds.train_generator()):
    train = add_derivatives(train)
    watchman.partial_fit(train)

exam_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'])
for valid, faults, info in tqdm(ds.valid_generator()):
    valid = add_derivatives(valid)
    detect = watchman.predict(valid)
    exam_list.loc[info] = time_span_metrics(faults, detect)
    
exam_list.mean()

0it [00:00, ?it/s]

0it [00:00, ?it/s]

precision    0.010944
recall       0.815789
f1_score     0.021574
dtype: float64

### EWMA

In [9]:
ti_ewma = '4 min'

In [10]:
watchman = IsolatingWatchman(random_state=31)

for train, _, _ in tqdm(ds.train_generator()):
    train = train.ewm(halflife=ti_ewma, times=train.index).mean()
    watchman.partial_fit(train)

exam_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'])
for valid, faults, info in tqdm(ds.valid_generator()):
    valid = valid.ewm(halflife=ti_ewma, times=valid.index).mean()
    detect = watchman.predict(valid)
    exam_list.loc[info] = time_span_metrics(faults, detect)
    
exam_list.mean()

0it [00:00, ?it/s]

0it [00:00, ?it/s]

precision    0.014156
recall       0.973684
f1_score     0.027874
dtype: float64

### Add Fourier coefs

In [11]:
from numpy.fft import rfft, hfft

In [12]:
def add_fft_coef(data: pd.DataFrame, window: int, n_coef: int) -> pd.DataFrame:
    for c in data.select_dtypes(include='float').columns:
        fft_coefs = pd.concat(((data[c]
                                .rolling(window, min_periods=1)
                                .apply(lambda x: hfft(x, n_coef)[i])) for i in range(n_coef)), 
                              axis=1
                             )
        fft_coefs.columns = [f'{c}_fft_coef_{i}' for i in range(n_coef)]
        data = pd.concat([data, fft_coefs], axis=1)
    return data

In [15]:
watchman = IsolatingWatchman(random_state=31)

w = 15
n = 3

for train, _, _ in tqdm(ds.train_generator()):
    train = add_fft_coef(train, w, n)
    watchman.partial_fit(train)

exam_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'])
for valid, faults, info in tqdm(ds.valid_generator()):
    valid = add_fft_coef(valid, w, n)
    detect = watchman.predict(valid)
    exam_list.loc[info] = time_span_metrics(faults, detect)
    
exam_list.mean()

0it [00:00, ?it/s]

0it [00:00, ?it/s]

precision    0.011112
recall       0.907895
f1_score     0.021934
dtype: float64

## Add mean ans std

In [60]:
def add_stat(data: pd.DataFrame, window: int) -> pd.DataFrame:
    float_data = data.select_dtypes(include='float')
    data_mean = float_data.rolling(window, min_periods=1).mean()
    data_mean.columns += '_mean'
    data_std = float_data.rolling(window, min_periods=1).std().fillna(0)
    data_std.columns += '_std'
    data_median = float_data.rolling(window, min_periods=1).median()
    data_median.columns += '_median'
    data_kurt = float_data.rolling(window, min_periods=1).kurt().fillna(0)
    data_kurt.columns += '_kurt'
    data = pd.concat([data, data_mean, data_std, data_median, data_kurt], axis=1)
    return data

In [61]:
watchman = IsolatingWatchman(random_state=31)

w = 15

for train, _, _ in tqdm(ds.train_generator()):
    train = add_stat(train, w)
    watchman.partial_fit(train)

exam_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'])
for valid, faults, info in tqdm(ds.valid_generator()):
    valid = add_stat(valid, w)
    detect = watchman.predict(valid)
    exam_list.loc[info] = time_span_metrics(faults, detect)
    
exam_list.mean()

0it [00:00, ?it/s]

0it [00:00, ?it/s]

precision    0.015225
recall       0.973684
f1_score     0.029946
dtype: float64

In [14]:
stop

NameError: name 'stop' is not defined

## TEP Harvard

In [None]:
ds = TepHarvardDataset()
ds.shake_not_stir(valid_test_ratio=0.5, balanced_test=True)

In [None]:
watchman = IsolatingWatchman(random_state=31)

In [None]:
train_gen = ds.train_generator()
for train, _, _ in tqdm(train_gen, total=500):
    watchman.partial_fit(train)

In [None]:
watchman

In [None]:
valid_gen = ds.valid_generator()
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid, faults, info in tqdm(valid_gen, total=500):
    detect = watchman.predict(valid)
    examine_list.loc[info] = time_span_metrics(faults, detect)
#     if max(faults) and max(detect):
#         plot_stacked(data,
#                      title=info,
#                      group='value_unit',
#                      faults=faults,
#                      detect=detect,
#                     )

In [None]:
examine_list.mean()

## TEP Kaspersky

In [None]:
ds = TepKasperskyDataset()
ds.shake_not_stir(valid_test_ratio=0.4)

In [None]:
watchman = IsolatingWatchman(random_state=31, max_samples=1024)

In [None]:
watchman.forest

In [None]:
train_gen = ds.train_generator()
for train, _, _ in tqdm(train_gen, total=400):
    watchman.partial_fit(train, increment=1)

In [None]:
watchman

In [None]:
valid_gen = ds.valid_generator()
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid, faults, info in tqdm(valid_gen, total=115):
    detect = watchman.predict(valid)
    examine_list.loc[info] = time_span_metrics(faults, detect)
#     if max(faults) and max(detect):
#         plot_stacked(data,
#                      title=info,
#                      group='value_unit',
#                      faults=faults,
#                      detect=detect,
#                     )

In [None]:
examine_list.mean()

## Extended Isolation Forest

### sahandha implementation

!pip install Cython

!python --version

!pip install eif

Can't install on current version of python. Implementation doesn't support partial_fit.

### H2O

https://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/intro.html#installing-h2o-3

!pip install requests

!pip install tabulate

!pip install future

!pip uninstall h2o

!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

In [15]:
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.341-b10, mixed mode)
  Starting server from C:\Users\Stepan\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Stepan\AppData\Local\Temp\tmpvytro_u5
  JVM stdout: C:\Users\Stepan\AppData\Local\Temp\tmpvytro_u5\h2o_Stepan_started_from_python.out
  JVM stderr: C:\Users\Stepan\AppData\Local\Temp\tmpvytro_u5\h2o_Stepan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,07 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.4
H2O_cluster_version_age:,"28 days, 19 hours and 35 minutes"
H2O_cluster_name:,H2O_from_python_Stepan_su9tl2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,5.316 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [16]:
from h2o.estimators import H2OExtendedIsolationForestEstimator

In [17]:
dir(H2OExtendedIsolationForestEstimator)

['_H2OEstimator__default_params',
 '_ModelBase__generate_partial_plots',
 '_ModelBase__generate_user_splits',
 '_ModelBase__grab_values',
 '_ModelBase__plot_1d_pdp',
 '_ModelBase__plot_1d_pdp_multinomial',
 '_ModelBase__plot_2d_pdp',
 '_ModelBase__pred_for_3d',
 '_ModelBase__set_axs_1d',
 '_ModelBase__set_axs_1d_multinomial',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_bc',
 '_check_and_save_parm',
 '_check_targets',
 '_default_param_value',
 '_fillMultinomialDict',
 '_get_metrics',
 '_get_rest_version',
 '_keyify',
 '_make_parms',
 '_metrics_class',
 '_options_',
 '_print_model_scoring_history',
 '_resolve_model',
 '_train',
 '_verify_training_fr

Implementation doesn't support partial_fit.