In [1]:
# Exploring predict next row by current

In [2]:
!del /Q utils\__pycache__

In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [4]:
from utils.metrics import time_span_metrics
from utils.datasets import GhlKasperskyDataset, TepHarvardDataset, SwatItrustDataset
from utils.custom_plots import plot_stacked

In [5]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDRegressor
from sklearn.preprocessing import StandardScaler

# GHL

In [6]:
ds = GhlKasperskyDataset()
ds.shake_not_stir(valid_test_ratio=0.3)

In [7]:
train, _, _ = next(ds.train_generator())
valid, _, _ = next(ds.valid_generator())

In [8]:
train.columns

Index(['input_temp_gc', 'rt_temp_gc', 'ht_temp_gc', 'ct_temp_gc', 'rt_level_m',
       'ht_level_m', 'ct_level_m', 'input_flow_state', 'supply_flow_state',
       'return_flow_state', 'output_flow_state', 'heater_state'],
      dtype='object')

In [9]:
y = train['rt_level_m'].shift(-1).iloc[:-1]
X = train.iloc[:-1]

In [10]:
lr_1 = Lasso(random_state=31)
lr_1.fit(X, y)
print('Score:', lr_1.score(X, y))
# print(pd.Series(index=train.columns, data=lr_1.coef_))
# print('icpt:', lr_1.intercept_)

Score: 0.41978540804238307


In [11]:
lr_2 = Ridge(random_state=31)
lr_2.fit(X, y)
print('Score:', lr_2.score(X, y))
# print(pd.Series(index=train.columns, data=lr_1.coef_))
# print('icpt:', lr_1.intercept_)

Score: 0.9101176038761056


In [12]:
def fit_and_score(lr, df):
    scores = pd.Series(index=df.columns, name='score', dtype='float64')
    X = df.iloc[:-1].copy()  # without last row
    for c in df.columns:
        y = df[c].shift(-1).iloc[:-1]  # predict next value
        lr.fit(X, y)
        scores[c] = lr.score(X, y)
    return scores

In [13]:
fit_and_score(LinearRegression(), train).median()

0.9032384629696463

In [14]:
fit_and_score(Lasso(random_state=31), train).median()

0.2523845320112769

In [15]:
fit_and_score(Ridge(random_state=31), train).median()

0.902914994718925

In [16]:
train_ = pd.DataFrame(index=train.index, columns=train.columns, data=StandardScaler().fit_transform(train))
fit_and_score(SGDRegressor(random_state=31), train_)

input_temp_gc        0.983786
rt_temp_gc           0.989903
ht_temp_gc           0.956837
ct_temp_gc           0.997965
rt_level_m           0.909093
ht_level_m           0.584622
ct_level_m           0.999970
input_flow_state     0.895690
supply_flow_state    0.125663
return_flow_state    0.052921
output_flow_state    0.532552
heater_state         0.562575
Name: score, dtype: float64

In [17]:
# Let's using SGDRegressor, because it have warm_start

# TEP Harvard

In [18]:
ds = TepHarvardDataset()
ds.shake_not_stir(valid_test_ratio=0.3)

In [19]:
train, _, _ = next(ds.train_generator())
valid, _, _ = next(ds.valid_generator())

In [20]:
# add derivatives
def add_derivatives(data: pd.DataFrame) -> pd.DataFrame:
    diff1 = data.diff()
    diff1.columns = data.columns + '_d1'
    diff2 = diff1.diff()
    diff2.columns = data.columns + '_d2'
    return pd.concat([data, diff1, diff2], axis=1).fillna(0)
#     return pd.concat([data, diff1], axis=1).fillna(0)

train = add_derivatives(train)

In [21]:
train.columns

Index(['inp_a_flow_ksm3h', 'inp_d_flow_kgh', 'inp_e_flow_kgh',
       'inp_c_flow_ksm3h', 'recyl_flow_ksm3h', 'react_flow_ksm3h',
       'react_press_kpa', 'react_level_pc', 'react_temp_gc',
       'purge_flow_ksm3h',
       ...
       'inp_e_feed_pc_d2', 'inp_a_feed_pc_d2', 'inp_c_feed_pc_d2',
       'compr_valv_pc_d2', 'purge_feed_pc_d2', 'seprt_feed_pc_d2',
       'strip_feed_pc_d2', 'steam_feed_pc_d2', 're_cl_feed_pc_d2',
       'co_cl_feed_pc_d2'],
      dtype='object', length=156)

In [22]:
fit_and_score(LinearRegression(), train).median()

0.7572341479788365

In [23]:
fit_and_score(Lasso(random_state=31), train).median()

0.003862012138952875

In [24]:
fit_and_score(Ridge(random_state=31), train).median()

0.7055406521339254

In [25]:
train_ = pd.DataFrame(index=train.index, columns=train.columns, data=StandardScaler().fit_transform(train))
fit_and_score(SGDRegressor(random_state=31), train_).median()

0.7460486482643751

# SWaT

In [26]:
ds = SwatItrustDataset()
ds.shake_not_stir(valid_test_ratio=0.3)

In [27]:
train, _, _ = next(ds.train_generator())
valid, _, _ = next(ds.valid_generator())

In [28]:
fit_and_score(LinearRegression(), train).median()

0.9825456632676254

In [29]:
# fit_and_score(Lasso(random_state=31), train).median()

In [30]:
# fit_and_score(Ridge(random_state=31), train).median()

In [47]:
regr = SGDRegressor(random_state=31, 
                    penalty='l2', 
                    alpha=1.0, 
                    l1_ratio=0.15, 
                    max_iter=1000, 
                    tol=0.001, 
                    learning_rate='invscaling', 
                    eta0=0.01, 
                    power_t=0.25, 
                    validation_fraction=0.1, 
                    n_iter_no_change=10, 
                    warm_start=False, 
                    average=False)
train_ = pd.DataFrame(index=train.index, columns=train.columns, data=StandardScaler().fit_transform(train))
fit_and_score(regr, train_)

FIT101_flow_m3h             0.807538
LIT101_level_mm             0.847500
MV101_feed_state            0.746730
P101_pump_state             0.835405
P102_pump_state             1.000000
AIT201_conductivity_uScm    0.868183
AIT202_acidity_pH           0.580579
AIT203_oxidation_mV         0.849374
FIT201_flow_m3h             0.845837
MV201_feed_state            0.818286
P201_pump_state             1.000000
P202_pump_state             1.000000
P203_pump_state             0.846164
P204_pump_state             1.000000
P205_pump_state             0.636739
P206_pump_state             1.000000
DPIT301_diffpressure_kPa    0.690489
FIT301_flow_m3h             0.660621
LIT301_level_mm             0.901437
MV301_feed_state            0.017796
MV302_feed_state            0.734218
MV303_feed_state            0.447263
MV304_feed_state            0.673567
P301_pump_state             0.731346
P302_pump_state             0.660601
AIT401_hardness_ppm         0.776939
AIT402_oxidation_mV         0.942895
F

In [32]:
stop

NameError: name 'stop' is not defined

# Watchman

In [None]:
from utils.watchmen import LinearPredictWatchman

## GHL

In [None]:
ds = GhlKasperskyDataset()
ds.shake_not_stir(valid_test_ratio=0.3)

In [None]:
watchman = LinearPredictWatchman(random_state=31)

In [None]:
for train, _, _ in tqdm(ds.train_generator()):
    watchman.partial_fit_scaler(train)
for train, _, _ in tqdm(ds.train_generator()):
    watchman.partial_fit(train)

In [None]:
watchman.limits

In [None]:
# main_columns = ['rt_temp_gc', 'ht_temp_gc', 'rt_level_m', 'ht_level_m']

In [None]:
valid_gen = ds.valid_generator()
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid, faults, info in tqdm(valid_gen):
    detect = watchman.predict(valid)
#     detect = detect[main_columns]
    examine_list.loc[info] = time_span_metrics(faults, detect)
#     if max(faults) and max(detect):
#         plot_stacked(valid, faults=faults, detect=detect)
#         break

In [None]:
examine_list.mean()

## TEP

In [None]:
ds = TepHarvardDataset()
ds.shake_not_stir(valid_test_ratio=1.0, balanced_test=True)

In [None]:
watchman = LinearPredictWatchman(random_state=31)

In [None]:
for train, _, _ in tqdm(ds.train_generator()):
    watchman.partial_fit_scaler(train)
for train, _, _ in tqdm(ds.train_generator()):
    watchman.partial_fit(train)

In [None]:
watchman.limits

In [None]:
valid_gen = ds.valid_generator()
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid, faults, info in tqdm(valid_gen):
    detect = watchman.predict(valid)
    examine_list.loc[info] = time_span_metrics(faults, detect)

In [None]:
examine_list.mean()

## SWaT

In [None]:
ds = SwatItrustDataset()
ds.shake_not_stir(valid_test_ratio=0.5)

In [None]:
watchman = LinearPredictWatchman(random_state=31)

In [None]:
for train, _, _ in tqdm(ds.train_generator()):
    watchman.partial_fit_scaler(train)
for train, _, _ in tqdm(ds.train_generator()):
    watchman.partial_fit(train)

In [None]:
watchman.limits

In [None]:
valid_gen = ds.valid_generator()
examine_list = pd.DataFrame(columns=['precision', 'recall', 'f1_score'], dtype='float')
for valid, faults, info in tqdm(valid_gen):
    detect = watchman.predict(valid)
    examine_list.loc[info] = time_span_metrics(faults, detect)

In [None]:
examine_list.mean()