This notebook is exploring using Facebook Prophet while also trying to determine when to retrain the model using the volatility measurement inspired by the DRAIN service

In [1]:
from prophet import Prophet
import pandas as pd
from datetime import datetime, timedelta
import altair as alt
from collections import deque
import numpy as np
import math
import time

fit_predict_model is a function which will take in a dataframe, fit the Prophet model to it and then give the forecasted results back for the next N minutes

In [2]:
def fit_predict_model(dataframe,periods=15,changepoint_prior_scale=0.10,daily_seasonality=False, weekly_seasonality=False, yearly_seasonality=False):
    prophet_model = Prophet(growth="logistic",changepoint_prior_scale=changepoint_prior_scale,daily_seasonality=daily_seasonality, weekly_seasonality=weekly_seasonality, yearly_seasonality=yearly_seasonality)
    prophet_model.fit(dataframe)
    future = prophet_model.make_future_dataframe(periods=periods,freq="1MIN",include_history=False,)
    future["cap"] = 1.0
    future_forecast = prophet_model.predict(future)
    future_forecast["timestamp"] = future_forecast["ds"]
    future_forecast = future_forecast.set_index("timestamp")
    return future_forecast

detect_anomalies determines if each of the actual values is within the lower and upper bound.

In [3]:
def detect_anomalies(forecast):
    forecasted = forecast[['ds','trend', 'yhat', 'yhat_lower', 'yhat_upper', 'fact', 'upper_std', 'lower_std']].copy()
    #forecast['fact'] = df['y']
    forecasted.loc[forecasted['fact'] > forecasted['upper_std'], 'anomaly'] = 1
    forecasted.loc[forecasted['fact'] < forecasted['lower_std'], 'anomaly'] = 1

    #anomaly importances
    forecasted['importance'] = 0
    forecasted.loc[forecasted['anomaly'] ==1, 'importance'] = \
        (forecasted['fact'] - forecasted['yhat_upper'])/forecast['fact']
    forecasted.loc[forecasted['anomaly'] ==-1, 'importance'] = \
        (forecasted['yhat_lower'] - forecasted['fact'])/forecast['fact']
    
    return forecasted

plot_anomalies is a function that will plot the lower and upper bounds for each data point in addition to the actual value. Anomalies are marked by red meaning the data point is outside of the lower and upper bound range.

In [4]:
def plot_anomalies(forecasted):
    interval = alt.Chart(forecasted).mark_area(interpolate="basis", color = '#7FC97F').encode(
    x=alt.X('ds:T', axis=alt.Axis(format='%H:%M'), title ='date'),
    y='upper_std',
    y2='lower_std',
    tooltip=['ds', 'fact', 'lower_std', 'upper_std']
    ).interactive().properties(
        title='Anomaly Detection'
    )

    fact = alt.Chart(forecasted[forecasted.anomaly==0]).mark_circle(size=15, opacity=0.7, color = 'Black').encode(
        x='ds:T',
        y=alt.Y('fact', title='CPU Utilization Percentage'),    
        tooltip=['ds', 'fact', 'lower_std', 'upper_std']
    ).interactive()

    anomalies = alt.Chart(forecasted[forecasted.anomaly!=0]).mark_circle(size=30, color = 'Red').encode(
        x='ds:T',
        y=alt.Y('fact', title='CPU Utilization Percentage'),    
        tooltip=['ds', 'fact', 'lower_std', 'upper_std']
    ).interactive()

    return alt.layer(interval, fact, anomalies)\
              .properties(width=870, height=450)\
              .configure_title(fontSize=20)

Read in 600 data points from clusterwide_cpu_usage_percentage.txt which contains 600 minutes worth of CPU usage within a cluster

In [5]:
dataset_values = []
date_time_vals = []
with open('clusterwide_cpu_usage_percentage.txt','r') as cmd:
    for line in cmd:
        line = line.rstrip().split(",")
        date_time = datetime.fromtimestamp(float(line[0]))
        date_time_vals.append(float(line[0]))
        metric_dict = {"ds": date_time.strftime("%Y-%m-%d %H:%M:%S"), "y": float(line[1]) / 100.0}
        dataset_values.append(metric_dict)

Inject some outlier data at the end of this dataset to see whether or not these data points are marked as an anomaly or if model can adjust its forecasts over time

In [6]:
last_one = date_time_vals[-1] + 60
icr = 0.70
for i in range(500):
    if i % 50 == 0:
        icr += 0.01
    new_dict = {"ds": (datetime.fromtimestamp(last_one)).strftime("%Y-%m-%d %H:%M:%S"), "y": icr}
    last_one += 60
    dataset_values.append(new_dict)
    

Define data_range to determine how much of the dataset you would want to work with

In [7]:
data_range = 1200

Define time_interval for how often FB Prophet model will be retrained.

In [8]:
time_interval = 15

Define the maximum amount of training data which the FB prophet model will be trained on

In [9]:
training_metric_limit = 1440

Define the changepoint_range for the Prophet model

In [10]:
changepoint_prior_scale = 0.05

Retrieve the first data_range values of the dataset

In [11]:
data_values = dataset_values[:data_range]

For the initial training of the FB Prophet model, retrieve the first time_interval data points from the dataset

In [12]:
training_dataset = data_values[:time_interval]

Turn the training_dataset list into a Pandas dataframe so it can be taken as input into Prophet model

In [13]:
training_dataset_df = pd.DataFrame(training_dataset)

weighted_avg_and_std function which computes the weighted mean and standard deviation of a numpy array.

In [14]:
def weighted_avg_and_std(values, weights):
    average = np.average(values, weights=weights)
    # Fast and numerically precise:
    variance = np.average((values - average) ** 2, weights=weights)
    return average, math.sqrt(variance)

Initial variables declaration

In [15]:
metrics_data_queue = deque([],training_metric_limit)
normal_periods = []
num_anomalies = 0
metric_data_obtained = dict()
column_names =  ['trend', 'yhat', 'yhat_lower', 'yhat_upper']
train_on_next_chance = True
stable = False
training_start_ts_ns = 0
very_first_ts_ns = 0
forecasting_future = 1000
future_forecast = None
metrics_db = dict()
prediction_results = []
weighted_vols = []

In [None]:
%%time
for i in range(0,len(data_values)):
    metric_data_obtained = dict()
    current_metric_datetime = data_values[i]["ds"]
    element = datetime.strptime(current_metric_datetime,"%Y-%m-%d %H:%M:%S")
    element_datetime_tuple = element.timetuple()
    current_ts = int(time.mktime(element_datetime_tuple))
    if training_start_ts_ns == 0:
        training_start_ts_ns = current_ts
        very_first_ts_ns = current_ts
    current_metric_value = data_values[i]["y"]
    metrics_db[current_ts] = data_values[i]
    metrics_data_queue.appendleft(current_metric_value)
    cumulative_metrics_data = np.array(metrics_data_queue)
    vol = np.std(cumulative_metrics_data) / np.mean(cumulative_metrics_data[:10])
    time_steps = cumulative_metrics_data.shape[0]
    weights = np.flip(np.true_divide(np.arange(1, time_steps + 1), time_steps))
    weighted_mean, weighted_std = weighted_avg_and_std(cumulative_metrics_data, weights)
    weighted_vol = weighted_std / np.mean(cumulative_metrics_data[:10])
    weighted_vols.append(weighted_vol)
    if future_forecast is not None:
        nearest_index = future_forecast.index.get_loc(current_metric_datetime, method="nearest")
        nearest_index_value = future_forecast.iloc[[nearest_index]]
        for c in column_names:
            metric_data_obtained[c] = nearest_index_value[c].values[0]
        yhat_lower = nearest_index_value['yhat_lower'].values[0]
        yhat_upper = nearest_index_value['yhat_upper'].values[0]
        yhat_value = nearest_index_value['yhat'].values[0]
        yhat_lower = max(0, yhat_lower)
        upper_std = abs(yhat_upper - yhat_value)
        lower_std = abs(yhat_value - yhat_lower)
        metric_data_obtained = {"fact": current_metric_value, "ds": current_metric_datetime}
        metric_data_obtained['lower_std'] = max(0, yhat_lower - 2 * lower_std)
        metric_data_obtained['upper_std'] = yhat_upper + (2 * upper_std)
        metric_data_obtained['yhat_lower'] = yhat_lower
        metric_data_obtained['yhat_upper'] = yhat_upper
        metric_data_obtained['anomaly'] = 0
        if (current_metric_value < yhat_lower - (2 * lower_std)) or (current_metric_value > yhat_upper + (2 * upper_std)):
            metric_data_obtained['anomaly'] = 1
            num_anomalies += 1
        prediction_results.append(metric_data_obtained)
        
    if len(metrics_data_queue) > 15:
        if weighted_vol >= 0.199:
            train_on_next_chance = True

        if (weighted_vol < 0.199 and training_start_ts_ns != very_first_ts_ns and train_on_next_chance):
            training_start_ts_ns = current_ts

        if weighted_vol > 0.155 and not train_on_next_chance and stable:
            training_end_ts_ns = current_ts
            normal_periods.append({"start_ts": training_start_ts_ns, "end_ts": training_end_ts_ns})
            stable = False
            training_start_ts_ns = -1.0

        if weighted_vol <= 0.15 and train_on_next_chance:
            print(f"SENDING TRAIN SIGNAL on iteration {i}")
            if training_start_ts_ns != -1.0:
                training_end_ts_ns = current_ts
                normal_periods.append({"start_ts": training_start_ts_ns, "end_ts": training_end_ts_ns})
            train_on_next_chance = False
            stable = True
            training_start_ts_ns = current_ts
            training_ds = []
            for normal in normal_periods:
                start_ts, end_ts = normal["start_ts"], normal["end_ts"]
                print(start_ts)
                print(end_ts)
                print(metrics_db)
                for ts in range(start_ts,end_ts,60):
                    if ts in metrics_db:
                        training_ds.append(metrics_db[ts])
            training_ds_df = pd.DataFrame(training_ds)
            training_ds_df['cap'] = 1.0
            print(training_ds_df)
            future_forecast = fit_predict_model(training_ds_df,periods=training_metric_limit)


In [None]:
weighted_vols

Create the first Prophet model which will be trained on the initial time_interval data points

In [18]:
pred_result_df = pd.DataFrame(prediction_results)

In [19]:
pred_result_df.head(50)

Unnamed: 0,fact,ds,lower_std,upper_std,yhat_lower,yhat_upper,anomaly
0,0.153004,2021-07-17 03:33:03,0.056939,0.269987,0.119643,0.190659,0
1,0.15851,2021-07-17 03:34:03,0.041048,0.291438,0.114415,0.197879,0
2,0.155938,2021-07-17 03:35:03,0.029713,0.316456,0.110706,0.206287,0
3,0.155479,2021-07-17 03:36:03,0.013573,0.346036,0.105395,0.216216,0
4,0.158823,2021-07-17 03:37:03,0.0,0.376967,0.100012,0.226596,0
5,0.15675,2021-07-17 03:38:03,0.0,0.412771,0.093799,0.2386,0
6,0.156813,2021-07-17 03:39:03,0.0,0.443658,0.091148,0.248965,0
7,0.155792,2021-07-17 03:40:03,0.0,0.477858,0.085799,0.260435,0
8,0.154792,2021-07-17 03:41:04,0.0,0.528832,0.082223,0.277496,0
9,0.160082,2021-07-17 03:42:04,0.0,0.568082,0.078082,0.290649,0


See the number of anomalies predicted for each time_interval. A cold start is expected initially.

In [20]:
print(anomaly_count)

NameError: name 'anomaly_count' is not defined

In [None]:
plot_anomalies(pred_result_df)

In [None]:
import matplotlib.pyplot as plt
plotting_values = []
for d in data_values:
    plotting_values.append(d['y'])
plt.plot(range(len(plotting_values)), plotting_values)