In [20]:
from prophet import Prophet
import pandas as pd
from datetime import datetime, timedelta
import altair as alt
from collections import deque
import numpy as np
import math
import time

def fit_predict_model(dataframe,periods=15,changepoint_prior_scale=0.10,daily_seasonality=False, weekly_seasonality=False, yearly_seasonality=False, cap=1.0):
    """Fit a logistic-growth Prophet model and forecast the next ``periods`` minutes.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Training history with ``ds`` and ``y`` columns. If it has no ``cap``
        column, a copy with ``cap`` filled in is used for fitting; the
        caller's frame is never modified.
    periods : int
        Number of one-minute steps to forecast past the end of the history.
    changepoint_prior_scale : float
        Forwarded unchanged to ``Prophet``.
    daily_seasonality, weekly_seasonality, yearly_seasonality : bool
        Forwarded unchanged to ``Prophet``.
    cap : float
        Saturating capacity used for the forecast horizon (and for the
        training frame when it carries no ``cap`` column). Defaults to 1.0,
        the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``Prophet.predict`` for the future timestamps
        only (history excluded), indexed by ``timestamp`` (a copy of ``ds``).
    """
    # Logistic growth needs a capacity column on the training data as well;
    # supply it on a copy rather than failing inside Prophet.fit.
    if "cap" not in dataframe.columns:
        dataframe = dataframe.assign(cap=cap)

    prophet_model = Prophet(
        growth="logistic",
        changepoint_prior_scale=changepoint_prior_scale,
        daily_seasonality=daily_seasonality,
        weekly_seasonality=weekly_seasonality,
        yearly_seasonality=yearly_seasonality,
    )
    prophet_model.fit(dataframe)

    future = prophet_model.make_future_dataframe(periods=periods, freq="1MIN", include_history=False)
    future["cap"] = cap

    future_forecast = prophet_model.predict(future)
    # Keep ``ds`` as a regular column and expose a copy of it as the index so
    # callers can do nearest-timestamp lookups on the forecast.
    future_forecast["timestamp"] = future_forecast["ds"]
    return future_forecast.set_index("timestamp")

def detect_anomalies(forecast):
    """Label each observation as normal, high anomaly or low anomaly.

    Parameters
    ----------
    forecast : pandas.DataFrame
        Must contain the columns ``ds``, ``trend``, ``yhat``, ``yhat_lower``,
        ``yhat_upper``, ``fact`` (the observed value), ``upper_std`` and
        ``lower_std`` (the anomaly thresholds). The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of those columns plus:

        * ``anomaly``: ``1`` when ``fact > upper_std``, ``-1`` when
          ``fact < lower_std``, ``0`` otherwise.
        * ``importance``: relative distance of ``fact`` outside the
          ``yhat_upper`` / ``yhat_lower`` band for anomalous rows, ``0.0``
          for normal rows. The distance is divided by ``fact``, so a
          ``fact`` of exactly 0 yields an infinite/NaN importance.
    """
    forecasted = forecast[['ds','trend', 'yhat', 'yhat_lower', 'yhat_upper', 'fact', 'upper_std', 'lower_std']].copy()

    above = forecasted['fact'] > forecasted['upper_std']
    below = forecasted['fact'] < forecasted['lower_std']

    # Start from an explicit 0 so normal rows are not left as NaN
    # (NaN != 0 would make them look anomalous to downstream filters).
    forecasted['anomaly'] = 0
    forecasted.loc[above, 'anomaly'] = 1
    # Low-side anomalies get -1 so the lower-bound importance branch applies.
    forecasted.loc[below, 'anomaly'] = -1

    # Anomaly importances; float column so the fractional values fit the dtype.
    forecasted['importance'] = 0.0
    forecasted.loc[above, 'importance'] = \
        (forecasted['fact'] - forecasted['yhat_upper'])/forecasted['fact']
    forecasted.loc[below, 'importance'] = \
        (forecasted['yhat_lower'] - forecasted['fact'])/forecasted['fact']

    return forecasted

def plot_anomalies(forecasted):
    """Build the layered Altair chart of thresholds, normal points and anomalies.

    ``forecasted`` needs the columns ``ds``, ``fact``, ``lower_std``,
    ``upper_std`` and ``anomaly``. Three layers are drawn:

    * a green area between ``lower_std`` and ``upper_std``,
    * small black circles for rows where ``anomaly == 0``,
    * larger red circles for rows where ``anomaly != 0``.

    Returns the combined 870x450 chart.
    """
    hover_fields = ['ds', 'fact', 'lower_std', 'upper_std']

    def _scatter(subset, **mark_options):
        # One circle per observation, plotted at its actual value.
        return alt.Chart(subset).mark_circle(**mark_options).encode(
            x='ds:T',
            y=alt.Y('fact', title='CPU Utilization Percentage'),
            tooltip=list(hover_fields),
        ).interactive()

    # Shaded band spanning the lower and upper anomaly thresholds.
    band = (
        alt.Chart(forecasted)
        .mark_area(interpolate="basis", color='#7FC97F')
        .encode(
            x=alt.X('ds:T', axis=alt.Axis(format='%H:%M'), title='date'),
            y='upper_std',
            y2='lower_std',
            tooltip=list(hover_fields),
        )
        .interactive()
        .properties(title='Anomaly Detection')
    )

    is_normal = forecasted.anomaly == 0
    normal_points = _scatter(forecasted[is_normal], size=15, opacity=0.7, color='Black')
    anomaly_points = _scatter(forecasted[forecasted.anomaly != 0], size=30, color='Red')

    layered = alt.layer(band, normal_points, anomaly_points)
    return layered.properties(width=870, height=450).configure_title(fontSize=20)

`fit_predict_model` takes a dataframe, fits a Prophet model to it, and returns the forecast for the next N minutes.

detect_anomalies determines if each of the actual values is within the lower and upper bound.

`plot_anomalies` plots the lower and upper bounds for each data point together with the actual value. Anomalies are drawn in red: these are the data points that fall outside the lower/upper bound range.

In [2]:
# Load "<timestamp>,<value>" rows from the metrics file.
# dataset_values: one {"ds": <formatted time>, "y": <value / 100>} dict per row.
# date_time_vals: the raw numeric timestamps, in the same order.
dataset_values = []
date_time_vals = []
with open('clusterwide_cpu_usage_percentage.txt','r') as metrics_file:
    for line in metrics_file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # sample; skip them instead of failing on float('').
            continue
        fields = line.split(",")
        epoch_seconds = float(fields[0])
        # NOTE: fromtimestamp() converts to the machine's local timezone.
        date_time = datetime.fromtimestamp(epoch_seconds)
        date_time_vals.append(epoch_seconds)
        # The value is divided by 100 (presumably percent -> fraction, so it
        # is comparable with the capacity of 1.0 used for the model).
        metric_dict = {"ds": date_time.strftime("%Y-%m-%d %H:%M:%S"), "y": float(fields[1]) / 100.0}
        dataset_values.append(metric_dict)

Inject outlier data into dataset to see if Prophet is able to adjust its forecasts over time.

In [3]:
# Build 500 synthetic one-minute samples that continue after the last loaded
# timestamp. The level starts at 0.71 and rises by 0.01 every 50 samples.
last_one = date_time_vals[-1] + 60
icr = 0.70
injected_values = []
for i in range(500):
    if i % 50 == 0:
        icr += 0.01
    new_dict = {"ds": (datetime.fromtimestamp(last_one)).strftime("%Y-%m-%d %H:%M:%S"), "y": icr}
    last_one += 60
    injected_values.append(new_dict)

# Rebuild from the loaded prefix (one date_time_vals entry per loaded row)
# rather than appending in place, so re-running this cell does not inject a
# second batch of samples with duplicate timestamps.
dataset_values = dataset_values[:len(date_time_vals)] + injected_values
    

Configure the experiment and fit the initial Prophet model on the first `time_interval` samples of the values read from the CSV file.

In [19]:
# --- Experiment settings -------------------------------------------------
data_range = 1200                # upper bound on how many samples are replayed
time_interval = 10               # initial training size, forecast horizon and refit cadence
training_metric_limit = 1440     # size threshold used when trimming the training window
changepoint_prior_scale = 0.05   # forwarded to Prophet on every fit

# --- Initial fit ---------------------------------------------------------
# Train on the first `time_interval` samples and forecast the following
# `time_interval` minutes.
data_values = dataset_values[:data_range]
training_dataset = data_values[:time_interval]
training_dataset_df = pd.DataFrame(training_dataset).assign(cap=1.0)
future_forecast = fit_predict_model(
    training_dataset_df,
    periods=time_interval,
    changepoint_prior_scale=changepoint_prior_scale,
)

In [None]:
%%time
counter = 0
num_anomalies = 0
prediction_results = []
interval_metrics = []
column_names =  ['trend', 'yhat', 'yhat_lower', 'yhat_upper']
anomaly_count = []
metric_values = []
for i in range(time_interval,len(data_values)):
    interval_metrics.append(data_values[i])
    current_metric_datetime = data_values[i]["ds"]
    current_metric_value = data_values[i]["y"]
    nearest_index = future_forecast.index.get_loc(current_metric_datetime, method="nearest")
    nearest_index_value = future_forecast.iloc[[nearest_index]]
    metric_data_obtained = {"fact": current_metric_value, "ds": current_metric_datetime}
    for c in column_names:
        metric_data_obtained[c] = nearest_index_value[c].values[0]
    yhat_lower = nearest_index_value['yhat_lower'].values[0]
    yhat_upper = nearest_index_value['yhat_upper'].values[0]
    yhat_value = nearest_index_value['yhat'].values[0]
    yhat_lower = max(0, yhat_lower)
    upper_std = abs(yhat_upper - yhat_value)
    lower_std = abs(yhat_value - yhat_lower)
    metric_data_obtained['lower_std'] = max(0, yhat_lower - 2 * lower_std)
    metric_data_obtained['upper_std'] = min(1.0,yhat_upper + (2 * upper_std))
    metric_data_obtained['yhat_lower'] = yhat_lower
    metric_data_obtained['yhat_upper'] = yhat_upper
    metric_data_obtained['anomaly'] = 0
    interval_metrics.append(data_values[i])
    if (current_metric_value < yhat_lower - (2 * lower_std)) or (current_metric_value > yhat_upper + (2 * upper_std)):
        metric_data_obtained['anomaly'] = 1
        num_anomalies += 1
    prediction_results.append(metric_data_obtained)
    counter += 1
    if counter == time_interval:
        counter = 0
        if len(training_dataset) + len(interval_metrics) > training_metric_limit:
            training_dataset = training_dataset[time_interval:]
        training_dataset += interval_metrics
        anomaly_count.append(num_anomalies)
        num_anomalies = 0
        training_dataset_df = pd.DataFrame(training_dataset)
        training_dataset_df['cap'] = 1.0
        future_forecast = fit_predict_model(training_dataset_df,periods=time_interval,changepoint_prior_scale=changepoint_prior_scale)
        interval_metrics = []

In [13]:
# Collect the per-sample prediction records into one DataFrame
# (one row per evaluated sample, one column per record key).
pred_result_df = pd.DataFrame(prediction_results)

In [14]:
# Display the prediction table: actual value, forecast, thresholds and anomaly flag.
pred_result_df

Unnamed: 0,fact,ds,trend,yhat,yhat_lower,yhat_upper,lower_std,upper_std,anomaly
0,0.156010,2021-07-17 03:27:03,0.143062,0.143062,0.135726,0.150514,0.121053,0.165419,0
1,0.146288,2021-07-17 03:28:03,0.141458,0.141458,0.134118,0.148449,0.119440,0.162431,0
2,0.156358,2021-07-17 03:29:03,0.139868,0.139868,0.132666,0.147075,0.118262,0.161490,0
3,0.152594,2021-07-17 03:30:03,0.138294,0.138294,0.130035,0.145485,0.113517,0.159867,0
4,0.155208,2021-07-17 03:31:03,0.136734,0.136734,0.129200,0.144184,0.114133,0.159083,0
...,...,...,...,...,...,...,...,...,...
1085,0.800000,2021-07-17 21:32:40,0.799683,0.799683,0.720097,0.884861,0.560925,1.000000,0
1086,0.800000,2021-07-17 21:33:40,0.799828,0.799828,0.719890,0.876919,0.560013,1.000000,0
1087,0.800000,2021-07-17 21:34:40,0.799974,0.799974,0.718402,0.879433,0.555257,1.000000,0
1088,0.800000,2021-07-17 21:35:40,0.800119,0.800119,0.722313,0.882242,0.566699,1.000000,0


See the number of anomalies predicted for each time_interval. A cold start is expected initially.

In [15]:
# One entry per completed refit window: how many samples in it were flagged.
print(anomaly_count)

[4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [16]:
# Build the layered chart (threshold band, normal points, anomalies) from the results.
alt_plot = plot_anomalies(pred_result_df)

In [17]:
# Render the chart as the cell output.
alt_plot