In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [None]:
# show files
# IPython shell escape: runs `ls -l` on everything matched by the glob ../input/*
!ls -l ../input/*

In [None]:
# select file
# name of the CSV to analyze; it is appended to the dataset directory when the file is read
my_file = 'g.csv'

In [None]:
# load / preview
# Time the CSV read with the monotonic high-resolution timer: time.time() follows
# the wall clock and can jump if the system clock is adjusted during the read.
t1 = time.perf_counter()
df = pd.read_csv('../input/benchmark-labeled-anomaly-detection-ts/' + my_file)
t2 = time.perf_counter()
print('Elapsed time [s]:', np.round(t2-t1,4))
# first rows as a quick check of the parsed columns
df.head()

In [None]:
# structure of data frame
# prints a summary of df: columns, non-null counts, dtypes and memory usage
df.info()

### Check frequency of anomalies

In [None]:
# frequencies of each value in the label column:
# raw counts first, then a blank line, then the shares of the total
print('Absolute Frequencies:', df['label'].value_counts(), '', sep='\n')
print('Relative Frequencies:', df['label'].value_counts(normalize=True), sep='\n')

<a id='1'></a>
# Smoothing

In [None]:
# add smoothing of values: centered rolling mean over win_size rows
# (min_periods=1 keeps the window usable at both ends of the series)
win_size = 15
df['smoothed'] = (
    df['value']
    .rolling(window=win_size, min_periods=1, center=True)
    .mean()
)
# add difference of actual value and smoothed value
df['diff_val_sm'] = df['value'] - df['smoothed']
# preview
df.head(10)

In [None]:
# extract labeled anomalies: the rows whose label equals 1
df_a = df.loc[df['label'].eq(1)]

In [None]:
# plot smoothing effect
# zoom window: positional rows 2000..2199, plus the labeled anomalies inside it
df_zoom = df.iloc[2000:2200]
df_zoom_a = df_zoom.loc[df_zoom['label'].eq(1)]

fig, ax = plt.subplots(figsize=(16,6))
# raw observations
ax.scatter(df_zoom['timestamp'], df_zoom['value'],
           s=10, alpha=1, label='original')
# smoothed counterpart of each observation
ax.scatter(df_zoom['timestamp'], df_zoom['smoothed'],
           c='green',
           s=10, alpha=1, label='smoothed')
# anomalies are drawn last so they sit on top of the other points
ax.scatter(df_zoom_a['timestamp'], df_zoom_a['value'],
           c='magenta',
           s=10, label='anomalies')
ax.legend()
ax.grid()
plt.show()

In [None]:
# plot differences from smoothed value
# same zoom window as before: positional rows 2000..2199 and their labeled anomalies
df_zoom = df.iloc[2000:2200]
df_zoom_a = df_zoom.loc[df_zoom['label'].eq(1)]

fig, ax = plt.subplots(figsize=(16,6))
# difference (value - smoothed) for every row in the window
ax.scatter(df_zoom['timestamp'], df_zoom['diff_val_sm'],
           s=10, alpha=1)
# anomalies overlaid in magenta
ax.scatter(df_zoom_a['timestamp'], df_zoom_a['diff_val_sm'],
           c='magenta',
           s=10)
ax.grid()
plt.show()

<a id='2'></a>
# Visualize full time series

In [None]:
# plot full time series + anomalies
fig, ax = plt.subplots(figsize=(16,6))
# all observations, strongly transparent so dense regions remain readable
ax.scatter(df['timestamp'], df['value'],
           s=10, alpha=0.1)
# labeled anomalies on top, fully opaque
ax.scatter(df_a['timestamp'], df_a['value'],
           c='magenta',
           s=10)
ax.grid()
plt.show()

In [None]:
# plot difference value vs smoothed value for full time series
fig, ax = plt.subplots(figsize=(16,6))
# difference (value - smoothed) for every row, strongly transparent
ax.scatter(df['timestamp'], df['diff_val_sm'],
           s=10, alpha=0.1)
# differences of the labeled anomalies on top, fully opaque
ax.scatter(df_a['timestamp'], df_a['diff_val_sm'],
           c='magenta',
           s=10)
ax.grid()
plt.show()

<a id='3'></a>
# Visualize time series piecewise

In [None]:
# split into parts
# number of rows shown per figure in the piecewise plots
chunk_size = 10000

# total number of rows in the series
n = df.shape[0]
# number of COMPLETE chunks; the plotting loops iterate one step beyond this to
# pick up a trailing partial chunk, so m is not the number of parts
m = int(np.floor(n/chunk_size))
# number of non-empty parts: complete chunks plus a partial last one, if any
n_parts = int(np.ceil(n/chunk_size))

# report the number of parts that actually contain rows (previously printed m,
# which left the partial last chunk uncounted)
print('Decomposing series in ' + str(n_parts) + ' parts')

In [None]:
# plot piecewise
# One figure per block of chunk_size consecutive rows. Iterating over the chunk
# start rows directly means a series whose length is an exact multiple of
# chunk_size does not produce an empty trailing figure.
for i, n1 in enumerate(range(0, n, chunk_size)):
    # exclusive end of the slice, clipped to the length of the series
    n_stop = min(n1 + chunk_size, n)
    # last row contained in this chunk (inclusive); this is what the title reports
    n2 = n_stop - 1
    # positional slicing is end-exclusive, so slice up to n_stop (not n2);
    # slicing to n2 would silently drop the last row of every chunk
    df_zoom = df.iloc[n1:n_stop]
    df_zoom_a = df_zoom[df_zoom.label==1]
    my_title = 'Original Time Series / Part ' + str(i+1) + ' - Row ' + str(n1) + ' to ' + str(n2)
    # plot
    plt.figure(figsize=(16,4))
    plt.scatter(df_zoom.index, df_zoom.value, alpha=0.2)
    plt.scatter(df_zoom_a.index, df_zoom_a.value, c='magenta')
    plt.title(my_title)
    plt.grid()
    plt.show()

In [None]:
# same for difference value vs smoothed value
# One figure per block of chunk_size consecutive rows. Iterating over the chunk
# start rows directly means a series whose length is an exact multiple of
# chunk_size does not produce an empty trailing figure.
for i, n1 in enumerate(range(0, n, chunk_size)):
    # exclusive end of the slice, clipped to the length of the series
    n_stop = min(n1 + chunk_size, n)
    # last row contained in this chunk (inclusive); this is what the title reports
    n2 = n_stop - 1
    # positional slicing is end-exclusive, so slice up to n_stop (not n2);
    # slicing to n2 would silently drop the last row of every chunk
    df_zoom = df.iloc[n1:n_stop]
    df_zoom_a = df_zoom[df_zoom.label==1]
    my_title = 'Time Series Diff vs Smoothed / Part ' + str(i+1) + ' - Row ' + str(n1) + ' to ' + str(n2)
    # plot
    plt.figure(figsize=(16,4))
    plt.scatter(df_zoom.index, df_zoom.diff_val_sm, alpha=0.2)
    plt.scatter(df_zoom_a.index, df_zoom_a.diff_val_sm, c='magenta')
    plt.title(my_title)
    plt.grid()
    plt.show()

## Work in Progress...