In [5]:
# One-time setup: uncomment to install the dependencies into the kernel's environment.
#%pip install psycopg2-binary
#%pip install -U tsfresh
#%pip install alibi-detect

In [2]:
import os
import time

import alibi_detect
import numpy as np
import pandas as pd
import psycopg2
import psycopg2.extras
import sqlalchemy
from alibi_detect.od import SpectralResidual
from IPython.display import display, clear_output

# Display the names exported by alibi_detect.od (the outlier detectors this installation offers).
alibi_detect.od.__all__

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


['OutlierAEGMM',
 'IForest',
 'Mahalanobis',
 'OutlierAE',
 'OutlierVAE',
 'OutlierVAEGMM',
 'OutlierSeq2Seq',
 'SpectralResidual',
 'LLR',
 'OutlierProphet']

In [3]:
# Database connection. The URL (including credentials) can be supplied through the
# TSDB_URL environment variable so secrets do not have to live in the notebook; when
# the variable is unset the original demo URL is used, so existing setups keep working.
engine = sqlalchemy.create_engine(
    os.environ.get(
        "TSDB_URL",
        "postgresql://admin:adminadmin@postgresql-timescaledb.default.svc.cluster.local:5432/tsdb",
    )
)

# Polling offset: the timestamp of the last row already processed.
# "1970-01-01" is the sentinel the polling query treats as "no offset yet" (fetch from the start).
last_time = "1970-01-01"

# Reading columns selected from the `conditions` table; each one is scored independently
# and stored as `<column>_score`.
cols = ['r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14']

In [None]:
# Polling loop: repeatedly fetch the rows added to `conditions` since the last batch,
# compute a Spectral Residual outlier score per reading column, and append the scores
# (keyed by the row timestamp) to the `scores_sr` table. Runs until interrupted.
i = 1

# The detector configuration never changes between polls, so build it once and reuse
# it for every batch (it was already shared across all columns within a batch).
od = SpectralResidual(
    threshold=1.,
    window_amp=20,
    window_local=20,
    padding_amp_method='reflect',
    padding_local_method='reflect',
    padding_amp_side='bilateral',
    n_est_points=10,
    n_grad_points=5
)

while True:
    clear_output(wait=True)
    display('Iteration: '+str(i))

    # Select rows strictly newer than the stored offset (or from the beginning while the
    # offset is still the 1970-01-01 sentinel), never ahead of now(), at most 1000 per batch.
    # last_time and cols are set by this notebook, not by external input.
    sql = f"""\
            SELECT time, {', '.join(cols)}
            from conditions 
            where (timestamp '{last_time}' = timestamp '1970-01-01' or time > timestamp '{last_time}') and time < now()
            order by time asc 
            limit 1000
    """

    df = pd.read_sql_query(sql, engine)

    if df.empty:
        # No new rows since the last poll: there is nothing to score or persist, and
        # reading the last timestamp of an empty frame would raise IndexError.
        # Keep the current offset and try again after the sleep.
        print(f'Next offset: {last_time}, fetched: 0')
    else:
        df_scores = df[['time']].copy()

        # One univariate scoring pass per reading column; all scores share the batch's timestamps.
        for col in cols:
            result = od.predict(
                df[col].to_numpy(),
                t=None,
                return_instance_score=True
            )
            df_scores[col+'_score'] = result['data']['instance_score'].tolist()

        df_scores.to_sql('scores_sr', engine, index=False, if_exists='append')

        # get last timestamp to use for next offset
        # '%Z' renders as an empty string for a timezone-naive timestamp, which is why the
        # printed offset ends in a trailing space.
        # NOTE(review): the offset is interpolated as a `timestamp` literal without zone
        # information - confirm this matches the type/time zone of conditions.time.
        last_time = pd.to_datetime(df['time'].values[-1]).strftime('%Y-%m-%d %H:%M:%S.%f %Z')
        print(f'Next offset: {last_time}, fetched: {len(df_scores)}')

    time.sleep(12)
    i += 1

'Iteration: 2'

Next offset: 2023-06-23 11:53:20.519999 , fetched: 39


In [None]:
# TODOs
# -----
# *DONE* - the SELECT should get all rows since the last call: i.e. the last ts has to be cached and used as a baseline for the next call. Initially set to zero, NaN etc.
# *DONE* - this means we have no overlap and the outlier scores will not be persisted multiple times. alternatively we could average over the whole window.
# *DONE* - combine result_r1['data']#['instance_score'] with the original data frame's timestamp and write out to a new table. We can combine models in the same table.
# *DONE* - move scores_sr table into the demo yaml to avoid the permissions issue
# *DONE* - remove timescaledb script
# *DONE* - replace the unused notebook with the one from this demo
# *DONE* - update dashboard yaml in the branch
# *DONE* - add < now() to query (so we don't get ahead and can see how many records are needed for the threshold)
# *DONE* - for each batch, run multiple models - one per univariate reading - and combine them into a single data frame and persist this.
# *DONE* - profile timescaledb & grafana queries (add WHERE filter on time interval)

# - fix permissions on create extension timescaledb_toolkit
# *DONE* - fix permissions for service creation (postgresql-timescaledb)
# *DONE* - try out some sort of multivariate-score averaging for an overall measurement (separate dashboard)
# - add in an initial "training" window for setting the threshold (needed? would allow use of trained algos such as Prophet, and also for setting a threshold for SRs)
# *DONE* - add lttb algorithm to reduce data points
# - rename demo/branch to signal-processing

# - add libraries to requirements at install/image?
# - (low prio) add this script to a spark-streaming job (only when things are really finished?)
