In [80]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px

import torch
import sklearn

import sys
sys.path.append('../')

%matplotlib notebook

In [81]:
obs_df = pd.read_csv("../data/synthetic_cloud_ops.csv")

obs_df[['service', 'region']] = obs_df['dimensions'].apply(lambda x: pd.Series([eval(x)['service'], eval(x)['region']]))

obs_df = obs_df.drop(columns="dimensions")

In [82]:
obs_df.head()

Unnamed: 0,timestamp,metric_name,value,service,region
0,2022-01-01 00:00:00,memory_usage,16.681384,webserver,us-west-1
1,2022-01-01 00:00:01,network_throughput,26.702124,worker,us-west-2
2,2022-01-01 00:00:02,error_rate,43.616466,worker,us-west-2
3,2022-01-01 00:00:03,network_throughput,16.961173,database,us-west-1
4,2022-01-01 00:00:04,cpu_usage,16.365177,webserver,us-west-1


# Outlier Detection


#### Heuristic Z-score Normalization

In [83]:
# Calculate the average of the target variable
avg = obs_df['value'].mean()

# Calculate the standard deviation of the target variable
std = obs_df['value'].std()

# Calculate the upper threshold for outliers
threshold = avg + 1.5 * std

# Create a new column that flags outliers as True or False
obs_df['is_outlier'] = obs_df['value'] > threshold

# Count the number of outliers found
num_outliers = obs_df['is_outlier'].sum()

# Print the number of outliers found
print("Number of outliers:", num_outliers)

Number of outliers: 8


In [84]:
px.scatter(obs_df, x='timestamp', y='value', color='is_outlier', title="Usage Values Over Time (Z-score Normalization)")

#### Using One-Class SVM

In [85]:
from sklearn.svm import OneClassSVM
import numpy as np

# Load your dataset into a numpy array, X
X = obs_df['value'].to_numpy().reshape(-1, 1)

# Create an instance of the OneClassSVM model
model = OneClassSVM(kernel='rbf', nu=0.1, gamma=2)

# Train the model on your dataset
model.fit(X)

# Predict the anomaly score for each sample in the dataset
obs_df['scores_svm'] = model.decision_function(X)

# Identify the outliers based on a threshold score
threshold = np.percentile(obs_df['scores_svm'],90)

obs_df['is_outlier_svm']= obs_df['scores_svm'] > threshold

# Print the number of outliers found


In [86]:
px.scatter(obs_df, x="timestamp", y="value", color ="is_outlier_svm", title="Usage Values Over Time (One-Class SVM)")

#### Using KMeans Clustering

In [103]:
from sklearn.cluster import KMeans

# Convert the float column to a numpy array
X = np.array(obs_df['value']).reshape(-1, 1)

# Define the number of clusters
n_clusters = 5

# Initialize the KMeans model with the specified number of clusters
model = KMeans(n_clusters=n_clusters)

# Fit the model to the data
model.fit(X)

# Get the cluster labels for each data point
labels = model.labels_

# Create a new column in the DataFrame for the cluster labels
obs_df['cluster'] = labels

# Get the average value for each cluster
cluster_means = obs_df.groupby('cluster')['value'].mean()

print(cluster_means)

obs_df['clusters'] = model.predict(obs_df['value'].to_numpy().reshape(-1,1))

obs_df['distances'] = obs_df['clusters'].apply(lambda x: cluster_means[x])

# Calculate the standard deviation of the distances
std_dev = obs_df['distances'].std()

# Define a threshold for anomalies as a multiple of the standard deviation
threshold = std_dev * 3

# Create a new column in the DataFrame to flag anomalies as True or False
obs_df['is_anomaly_kmeans'] = obs_df['distances'] > threshold

# Count the number of anomalies found
num_anomalies = obs_df['is_anomaly_kmeans'].sum()

# Print the number of anomalies found
print("Number of anomalies:", num_anomalies)

cluster
0    28.702747
1    46.993167
2    73.214668
3    17.120957
4    40.398250
Name: value, dtype: float64
Number of anomalies: 8


In [105]:
px.scatter(obs_df,x="timestamp",y="value",color="is_anomaly_kmeans", title="Anomaly Detection Using KMeans Clustering")