In [None]:
#!pip install psycopg2-binary
#!pip install -U tsfresh
#!pip install alibi-detect

In [2]:
import psycopg2
import psycopg2.extras
import pandas as pd
import numpy as np
import sqlalchemy
import time
from IPython.display import display, clear_output

import alibi_detect
from alibi_detect.od import SpectralResidual

# Show the names exported by alibi_detect.od (the available outlier detectors).
alibi_detect.od.__all__

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


['OutlierAEGMM',
 'IForest',
 'Mahalanobis',
 'OutlierAE',
 'OutlierVAE',
 'OutlierVAEGMM',
 'OutlierSeq2Seq',
 'SpectralResidual',
 'LLR',
 'OutlierProphet']

In [12]:
import os

# Connection URL for the TimescaleDB instance. Set the TSDB_URL environment
# variable to point at another database or to supply real credentials without
# editing the notebook; the fallback is the URL this demo has always used.
engine = sqlalchemy.create_engine(
    os.environ.get(
        "TSDB_URL",
        "postgresql://admin:adminadmin@postgresql-timescaledb.default.svc.cluster.local:5432/tsdb",
    )
)

# Starting offset for the polling query: only rows with time > last_time are read.
# Use "1970-01-01" instead to read from the very beginning of the table.
#last_time = "1970-01-01"
last_time = "2023-06-22 11:20:11.191262"

# Names of the univariate reading columns to score: r1 .. r14.
cols = [f'r{n}' for n in range(1, 15)]

In [None]:
# Spectral Residual detector shared by every batch and every column.
# Its configuration never changes, so it is built once rather than per iteration.
od = SpectralResidual(
    threshold=1.,
    window_amp=20,
    window_local=20,
    padding_amp_method='reflect',
    padding_local_method='reflect',
    padding_amp_side='bilateral',
    n_est_points=10,
    n_grad_points=5
)

i = 1

# Poll forever: read the next batch of rows after `last_time`, score each
# reading column, append the scores to the `scores_sr` table, then advance
# `last_time` so the next batch does not overlap this one.
while True:
    clear_output(wait=True)
    display('Iteration: '+str(i))

    # At most 50 rows newer than last_time (or from the start when last_time is
    # the 1970-01-01 sentinel), never ahead of the database clock.
    sql = f"""\
            SELECT time, {', '.join(cols)}
            from conditions 
            where (timestamp '{last_time}' = timestamp '1970-01-01' or time > timestamp '{last_time}') and time < now()
            order by time asc 
            limit 50
    """

    df = pd.read_sql_query(sql, engine)

    if df.empty:
        # No new rows yet: there is nothing to score or persist and no newer
        # timestamp to move to, so keep the current offset and retry after the sleep.
        print(f'No new rows after {last_time}; waiting for more data')
    else:
        # NOTE(review): very short batches may still be rejected by the detector
        # given window_amp=20 - confirm against the alibi-detect version in use.

        # dataframe for SR scores, keyed by the batch timestamps
        df_scores = df[['time']].copy()

        # one univariate scoring pass per reading column
        for col in cols:
            result = od.predict(
                df[col].to_numpy(),
                t=None,
                return_instance_score=True
            )
            df_scores[col+'_score'] = result['data']['instance_score'].tolist()

        df_scores.to_sql('scores_sr', engine, index=False, if_exists='append')
        print(df_scores.head(60))

        # get last timestamp to use for next offset (rows are ordered by time asc)
        last_time = pd.to_datetime(df['time'].values[-1]).strftime('%Y-%m-%d %H:%M:%S.%f %Z')
        print(f'Next offset: {last_time}')

    time.sleep(12)
    i += 1

'Iteration: 210'

                               time  r1_score  r2_score  r3_score  r4_score  \
0  2023-06-22 15:16:50.570788+00:00  3.397636  2.936362  1.925105  1.673785   
1  2023-06-22 15:16:50.874788+00:00 -0.709516 -0.645430 -0.454737 -0.414969   
2  2023-06-22 15:16:51.179788+00:00 -0.186116 -0.190446 -0.232788 -0.946187   
3  2023-06-22 15:16:51.484788+00:00 -0.002229  0.008801  0.032608 -0.697614   
4  2023-06-22 15:16:51.789788+00:00 -0.095644 -0.052742  0.036914 -0.519849   
5  2023-06-22 15:16:52.094788+00:00 -0.075932 -0.066634 -0.048844 -0.373927   
6  2023-06-22 15:16:52.399788+00:00 -0.146979 -0.107529 -0.065093 -0.197735   
7  2023-06-22 15:16:52.704788+00:00 -0.097941 -0.080210 -0.057443 -0.054716   
8  2023-06-22 15:16:53.009788+00:00 -0.079882 -0.067165 -0.072861  0.181568   
9  2023-06-22 15:16:53.314788+00:00 -0.147949 -0.123581 -0.080354  0.101413   
10 2023-06-22 15:16:53.618788+00:00 -0.122784 -0.098292 -0.059005  0.408290   
11 2023-06-22 15:16:53.923788+00:00 -0.095935 -0.095

In [21]:
# Load the 50 most recent rows (newest first) that are not ahead of the database clock.
df = pd.read_sql_query(
    "SELECT * from conditions where time < now() order by time desc limit 50",
    engine,
)

In [7]:
# Open question: can this step be omitted?
# No timestep array is passed (t=None), so dt=1 between observations is assumed.
od.infer_threshold(df['r1'].to_numpy(), t=None, threshold_perc=95)

In [22]:
# Score the r1 readings; no timestep array is passed (t=None), so dt=1
# between observations is assumed. Instance scores are requested in the result.
result_r1 = od.predict(df['r1'].to_numpy(), t=None, return_instance_score=True)

In [23]:
# Inspect the raw r1 readings that were passed to the detector.
df['r1'].to_numpy()

array([ 0.1765,  0.0952,  0.0944,  0.0936,  0.0929,  0.092 ,  0.0909,
        0.0897,  0.0884,  0.0873,  0.0855,  0.0835,  0.0812,  0.0783,
        0.0736,  0.0669,  0.0628,  0.5607,  0.5832,  0.6081,  0.6365,
        0.6677,  0.7033,  0.7438,  0.7913,  0.845 ,  0.9085,  0.984 ,
        1.0722,  1.1807,  1.3104,  1.472 ,  1.6707,  1.9291,  2.2479,
        2.6636,  3.1959,  3.8935,  4.794 ,  5.9545,  7.4062,  9.2636,
       11.4509, 14.1592, 15.3372, 20.8079, 24.0669, 27.1078, 30.9681,
       33.0204])

In [25]:
# TODOs
# -----
# *DONE* - the SELECT should get all rows since the last call, i.e. the last timestamp has to be cached and used as the baseline for the next call. Initially set to zero, NaN etc.
# *DONE* - this means we have no overlap and the outlier scores will not be persisted multiple times. alternatively we could average over the whole window.
# *DONE* - combine result_r1['data']#['instance_score'] with the original data frame's timestamp and write out to a new table. We can combine models in the same table.
# *DONE* - move scores_sr table into the demo yaml to avoid the permissions issue
# *DONE* - remove timescaledb script
# *DONE* - replace the unused notebook with the one from this demo
# *DONE* - update dashboard yaml in the branch
# *DONE* - add < now() to query (so we don't get ahead and can see how many records are needed for the threshold)
# *DONE* - for each batch, run multiple models - one per univariate reading - and combine them into a single data frame and persist this.
# *DONE* - profile timescaledb & grafana queries (add WHERE filter on time interval)

# - try out some sort of multivariate-score averaging for an overall measurement
# - add in an initial "training" window for setting the threshold (needed?)
# - add this script to a spark-streaming job (only when things are really finished)
# - add libraries to requirements at install/image?
# - add lttb algorithm to reduce data points
