In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
kaggle_inputs = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in kaggle_inputs:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Importing relevant libraries

In [None]:
import altair as alt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# 2. Understanding the Data

In [None]:
# Location of the EC2 CPU utilization series inside the Kaggle "nab" input dataset.
EC2_CPU_CSV = "/kaggle/input/nab/realAWSCloudwatch/realAWSCloudwatch/ec2_cpu_utilization_53ea38.csv"

# Load the series and preview its first rows.
cloudwatch_df = pd.read_csv(EC2_CPU_CSV)
cloudwatch_df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
cloudwatch_df.shape

In [None]:
# Column dtypes and non-null counts of the frame as loaded from the CSV.
cloudwatch_df.info()

In [None]:
# Summary statistics of the frame before any preprocessing.
cloudwatch_df.describe()

# 3. Preprocessing/ feature engineering

In [None]:
# Convert the timestamp column to datetimes, then re-check the dtypes.
parsed_timestamps = pd.to_datetime(cloudwatch_df['timestamp'])
cloudwatch_df['timestamp'] = parsed_timestamps
cloudwatch_df.info()

In [None]:
# Derive calendar features from the timestamp with the vectorized `.dt`
# accessor rather than calling a Python lambda once per row.
timestamp_parts = cloudwatch_df['timestamp'].dt
cloudwatch_df = cloudwatch_df.assign(
    year=timestamp_parts.year,
    month=timestamp_parts.month,
    day=timestamp_parts.day,
    weekday=timestamp_parts.weekday,  # Monday == 0, same convention as datetime.weekday()
    hour=timestamp_parts.hour,
)

# Reorder so the derived features sit between the timestamp and the reading.
cloudwatch_df = cloudwatch_df[['timestamp', 'year', 'month', 'day', 'weekday', 'hour', 'value']]

# Weekday starts from Monday
# Positional access (.iloc) so the sanity check does not depend on the index labels.
first_timestamp = cloudwatch_df['timestamp'].iloc[0]
first_weekday = cloudwatch_df['weekday'].iloc[0]
print(f'{first_timestamp} with weekday {first_weekday} is {first_timestamp.strftime("%A")}.\n')

cloudwatch_df.head()

In [None]:
# Summary statistics again, now that the derived calendar columns exist.
cloudwatch_df.describe()

# 4. Exploratory Data Analysis

In [None]:
# Line chart of the whole series, with a range slider for zooming along the time axis.
fig = px.line(
    cloudwatch_df,
    x='timestamp',
    y='value',
    title='Overview of time series data',
).update_xaxes(rangeslider_visible=True)
fig

In [None]:
# Re-display the first rows as a reference for the hour/weekday charts that follow.
cloudwatch_df.head()

In [None]:
# Heatmap of CPU usage with hour of day on the x axis and weekday on the y axis.
hour_axis = alt.X('hour:O', title='hour of day')
weekday_axis = alt.Y('weekday:O', title='weekday')
usage_color = alt.Color('value:Q', title='CPU usage')

alt.Chart(cloudwatch_df).mark_rect().encode(
    hour_axis,
    weekday_axis,
    usage_color,
).properties(width=800, height=300)

In [None]:
# Bar chart of the readings grouped by weekday.
# NOTE(review): no aggregate is given for `value`, so the bars appear to be
# stacked row by row — confirm whether a mean per weekday was intended.
weekday_bars = alt.Chart(cloudwatch_df).mark_bar()
weekday_bars.encode(x='weekday:O', y='value:Q').properties(width=600)

# 5. Unsupervised Models

## 5.1 Isolation Forests

In [None]:
# Feature matrix shared by the detectors: one row per reading, with `value`
# as the only column. Selecting with a list of columns keeps the data 2-D
# without wrapping every scalar in its own Python list.
x = cloudwatch_df[['value']].to_numpy()

iso_forest = IsolationForest(
    n_estimators=100,
    max_samples="auto",
    contamination=0.01,  # expected share of outliers in the data
    random_state=42,     # fixed seed so the flagged points are reproducible
)
iso_forest.fit(x)

# predict() labels outliers as -1 and inliers as 1; remap to 1 = outlier, 0 = inlier.
y_pred = np.where(iso_forest.predict(x) == -1, 1, 0)
y_pred[:10]
# Points that are 1 are outliers

In [None]:
# Store the Isolation Forest flags (1 = outlier) on the shared frame.
# Note: the Local Outlier Factor section later writes to this same "anomaly"
# column, so it only holds the Isolation Forest result until that cell runs.
cloudwatch_df["anomaly"] = y_pred
cloudwatch_df.head()

In [None]:
# Keep only the rows currently flagged as anomalies (Isolation Forest at this point).
flagged_by_iso = cloudwatch_df['anomaly'] == 1
iso_anomaly_df = cloudwatch_df.loc[flagged_by_iso]
iso_anomaly_df.head()

In [None]:
# Overlay the Isolation Forest anomalies as markers on the CPU utilization line.
iso_markers = go.Scatter(
    x=iso_anomaly_df["timestamp"].to_list(),
    y=iso_anomaly_df["value"].to_list(),
    mode='markers',
    name='anomalies',
)
fig = px.line(
    cloudwatch_df,
    x='timestamp',
    y='value',
    title='Unsupervised anomaly detection in CPU utilization',
)
fig.add_trace(iso_markers)
fig.update_xaxes(rangeslider_visible=True)
fig

Unfortunately, we do not have labelled data, so we cannot measure how many of the true anomalies we managed to capture.

## 5.2 Local Outlier Factor

In [None]:
# Fit Local Outlier Factor on the same single-feature matrix `x` used for
# Isolation Forest. LocalOutlierFactor is imported in the imports cell.
# NOTE(review): n_neighbors=2 is far below scikit-learn's default of 20, and
# `contamination` is left at its default while the Isolation Forest run uses
# 0.01 — confirm both choices before comparing the two models.
lof = LocalOutlierFactor(n_neighbors=2)

# fit_predict() labels outliers as -1 and inliers as 1; remap to 1 = outlier, 0 = inlier.
y_pred = np.where(lof.fit_predict(x) == -1, 1, 0)
y_pred[:10]

In [None]:
# Replace the "anomaly" column with the Local Outlier Factor flags (1 = outlier).
# This overwrites the Isolation Forest flags written earlier; when cells run
# top to bottom, `iso_anomaly_df` was already filtered out and is unaffected.
cloudwatch_df["anomaly"] = y_pred
cloudwatch_df.head()

In [None]:
# Keep only the rows currently flagged as anomalies (Local Outlier Factor at this point).
flagged_by_lof = cloudwatch_df['anomaly'] == 1
lof_anomaly_df = cloudwatch_df.loc[flagged_by_lof]
lof_anomaly_df.head()

In [None]:
# Overlay the Local Outlier Factor anomalies as markers on the CPU utilization line.
lof_markers = go.Scatter(
    x=lof_anomaly_df["timestamp"].to_list(),
    y=lof_anomaly_df["value"].to_list(),
    mode='markers',
    name='anomalies',
)
fig = px.line(
    cloudwatch_df,
    x='timestamp',
    y='value',
    title='Unsupervised anomaly detection in CPU utilization',
)
fig.add_trace(lof_markers)
fig.update_xaxes(rangeslider_visible=True)
fig

# 6. Model comparison

In [None]:
# Draw both detectors' anomalies on one chart so they can be compared directly.
fig = px.line(
    cloudwatch_df,
    x='timestamp',
    y='value',
    title='Unsupervised anomaly detection in CPU utilization',
)

# Traces are added in this order: Local Outlier Factor first, then Isolation Forest.
detector_results = [
    ('Local Outlier Factor', lof_anomaly_df),
    ('Isolation Forests', iso_anomaly_df),
]
for detector_name, anomaly_rows in detector_results:
    fig.add_trace(
        go.Scatter(
            x=anomaly_rows["timestamp"].to_list(),
            y=anomaly_rows["value"].to_list(),
            mode='markers',
            name=detector_name,
        )
    )

fig.update_xaxes(rangeslider_visible=True)
fig

Based only on the two visualizations, Isolation Forest appears to be the more robust model for capturing anomalies in this series. Isolation Forest also offers flexibility through the contamination parameter set when defining the model.