In [None]:
# Required packages for processing
import pandas as pd
import numpy as np
import pytz
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
def preprocess_dataframe(df):
    """Clean a raw metric frame and index it by local (Amsterdam) time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds Unix timestamps in seconds. Cells equal to
        ``-1`` are treated as missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) indexed by a timezone-naive
        ``dt`` index in Europe/Amsterdam wall-clock time, with an extra
        ``Total`` column holding the row-wise sum of all other columns.
    """
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    df = df.replace(-1, np.nan)
    utc_times = pd.to_datetime(df.index, utc=True, unit="s")
    # pandas resolves the zone name itself, so no pytz object is required.
    # Dropping the tz afterwards leaves plain local wall-clock timestamps.
    df["dt"] = utc_times.tz_convert("Europe/Amsterdam").tz_localize(None)
    df = df.set_index("dt")
    # sum() skips NaN cells, so missing values do not poison the row total.
    df["Total"] = df.sum(axis=1)
    return df

def resample_dataframe(df, window):
    """Downsample ``df`` into time bins of size ``window`` using the bin mean.

    ``window`` is handed straight to ``DataFrame.resample`` (for example
    ``"5min"``); every column is averaged within each bin.
    """
    binned = df.resample(window)
    return binned.mean()
    
def zscore(x, window):
    """Rolling z-score of a series against its own recent history.

    Each value is compared with the mean and population standard deviation
    (``ddof=0``) of a rolling window; both statistics are shifted by one
    position so the current value is excluded from its own baseline.

    Parameters
    ----------
    x : pandas.Series
        Values to score.
    window : int or offset
        Rolling window size, passed to ``Series.rolling``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame of z-scores with NaN rows removed (e.g. the
        leading rows that have no complete baseline). Note that a baseline
        with zero spread yields +/-inf, which is *not* removed.
    """
    trailing = x.rolling(window=window)
    baseline_mean = trailing.mean().shift(1)
    baseline_std = trailing.std(ddof=0).shift(1)
    scores = (x - baseline_mean) / baseline_std
    return scores.to_frame().dropna()

def moving_average(x, window):
    """Return the rolling mean of ``x`` over ``window`` observations.

    The result has the same index as ``x``; positions without a full window
    are NaN.
    """
    rolling_view = x.rolling(window=window)
    return rolling_view.mean()

def top_anomalies(df, threshold):
    """Flag the rows whose first-column magnitude ranks at or above a percentile.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the scores to rank (sign is ignored).
    threshold : float
        Percentile rank, as a fraction between 0 and 1, at or above which a
        row is flagged (``0.97`` flags roughly the top 3%).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an integer ``over_threshold`` column: 1 for
        flagged rows, 0 otherwise. The input frame is not modified.
    """
    # Rank on absolute value so large negative scores count as anomalies too.
    magnitude = df.iloc[:, 0].abs()
    pct_rank = magnitude.rank(pct=True)
    flagged = (pct_rank >= threshold).astype("int64")
    # assign() builds a new frame, so the caller's frame gains no stray
    # helper columns as a side effect.
    return df.assign(over_threshold=flagged)

In [None]:
# Load in sample data - load1 metric
node_load1_raw = pd.read_parquet("node_load1")
# Preprocess data - resample to 5 minute time bins
node_load1 = preprocess_dataframe(node_load1_raw)
# "5min" replaces the "5T" alias, which pandas deprecated in 2.2.
node_load1_5min = resample_dataframe(node_load1, "5min")
# Trailing semicolon hides the Axes repr; the figure itself still renders.
node_load1_5min.plot(y="Total", figsize=(15,5));

In [None]:
# Visualize subset (explicit label-based row slice on the datetime index)
subset_5min = node_load1_5min.loc['2020-01-15 00:00:00':'2020-01-28 00:00:00']
# Trailing semicolon hides the Axes repr; the figure itself still renders.
subset_5min.plot(y="Total", figsize=(15,5));

In [None]:
# Generate z-scores based on a 50 minute time window
zscore = zscore((node_load1_5min['Total']),10)

In [None]:
# Visualize results
print(zscore['2020-01-15 00:00:00':'2020-01-28 00:00:00'].plot(figsize=(15,5)))

In [None]:
# Calculate top z-score 'anomalies' (top 3%)
top_anomalies = top_anomalies(zscore, 0.97)

In [None]:
# Print/visualize results
print(top_anomalies)

In [None]:
print(top_anomalies.plot(y='Total', figsize=(15,5)))

In [None]:
# Filter out only those z-scores and their corresponding timestamps which are over the defined threshold
selection = top_anomalies[top_anomalies['over_threshold']==1]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(selection)