In [27]:
import logging
import os

import numpy as np
import pandas as pd
from btb.tuning import Tunable
from mlblocks import MLPipeline
from sklearn.model_selection import KFold, StratifiedKFold

from orion import MLBLOCKS_PIPELINES
from orion.data import load_anomalies, load_signal
from orion.evaluation import CONTEXTUAL_METRICS as METRICS

In [2]:
# Name of the signal to analyse; passed to both loaders below so the
# values and the known anomalies refer to the same series.
signal = 'S-1'

# `load_signal` / `load_anomalies` come from `orion.data` (imported in the
# top import cell so a fresh "Restart & Run All" resolves them first).
data = load_signal(signal)
anomalies = load_anomalies(signal)

In [12]:
# Quick sanity check of the loaded signal's dimensions.
data.shape

(10149, 2)

In [22]:
def _expand(data, anomalies):
    time_column = 'timestamp'
    X = data.copy()

    X = X.set_index(time_column)
    X['label'] = [0] * len(X)
    for i, anom in anomalies.iterrows():
        X.loc[anom[0]: anom[1], 'label'] = 1

    return X.reset_index()

def _compress(data):
    time_column = 'timestamp'
    X = data.copy()
    
    X = X.set_index(time_column)
    y = list()
    
    return anomalies

def _get_split(data, index):
    """Select rows of ``data`` by position and derive their targets.

    Args:
        data: frame to take rows from.
        index: positional row indices, passed to ``data.iloc``.

    Returns:
        Tuple ``(X, y)``: the selected rows and the result of running
        ``_compress`` on them.
    """
    subset = data.iloc[index]
    return subset, _compress(subset)

# Number of cross-validation folds.
n_splits = 2
cv = KFold(n_splits=n_splits, shuffle=False, random_state=None)

# Label every row first so each fold can recover its own anomaly intervals.
X = _expand(data, anomalies)
for train_index, test_index in cv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Use the indices produced by this iteration (not leftover kernel
    # variables) and split the labelled frame; _get_split returns (X, y).
    X_train, y_train = _get_split(X, train_index)
    X_test, y_test = _get_split(X, test_index)

TRAIN: [ 5075  5076  5077 ... 10146 10147 10148] TEST: [   0    1    2 ... 5072 5073 5074]
TRAIN: [   0    1    2 ... 5072 5073 5074] TEST: [ 5075  5076  5077 ... 10146 10147 10148]


In [18]:
# Labelled copy of the signal: `label` is 1 inside an anomaly interval, else 0.
X = _expand(data, anomalies)

In [24]:
# Small hand-made frame with two separate labelled runs (rows 2-3 and 7-9),
# used to try out ways of splitting on label changes.
X = pd.DataFrame(dict(
    timestamp=list(range(10)),
    value=[-9] * 10,
    label=[0, 0, 1, 1, 0, 0, 0, 1, 1, 1],
))

In [25]:
# Grouping by label value alone merges all rows sharing a label, so
# non-adjacent runs end up in the same frame.
list_of_df = [group for _label, group in X.groupby('label')]
print("\n\n".join(map(str, list_of_df)))

   timestamp  value  label
0          0     -9      0
1          1     -9      0
4          4     -9      0
5          5     -9      0
6          6     -9      0

   timestamp  value  label
2          2     -9      1
3          3     -9      1
7          7     -9      1
8          8     -9      1
9          9     -9      1


In [29]:
# Split into contiguous runs of equal label. A new run starts wherever the
# label differs from the previous row (the first row's diff is NaN, which
# also counts as a change), and the cumulative sum of those change flags
# numbers the runs in order. This replaces np.split on a DataFrame, which
# goes through DataFrame.swapaxes (deprecated in pandas 2.1).
run_id = X['label'].diff().ne(0).cumsum()
list_of_df = [run for _, run in X.groupby(run_id)]
print(*list_of_df, sep="\n\n")

   timestamp  value  label
0          0     -9      0
1          1     -9      0

   timestamp  value  label
2          2     -9      1
3          3     -9      1

   timestamp  value  label
4          4     -9      0
5          5     -9      0
6          6     -9      0

   timestamp  value  label
7          7     -9      1
8          8     -9      1
9          9     -9      1
