In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers

from datetime import datetime

import matplotlib.pyplot as plt
from matplotlib import dates as md
import plotly.graph_objs as go
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.set_config_file(offline=True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory and print them one per line, in os.walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the data
We will use the Numenta Anomaly Benchmark (NAB) dataset. It provides artificial timeseries data containing labeled anomalous periods of behavior. The data are ordered, timestamped, single-valued metrics.

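We will use the `art_daily_small_noise.csv` file for training and the `art_daily_jumpsup.csv` file for testing. The simplicity of this dataset allows us to demonstrate anomaly detection effectively.

**Note:** the code cells below do not read those two NAB files. They load `meta_open.csv` and `temp_open_utc_complete.csv` from `building-data-genome-project-v1` and `electricity_cleaned.csv` from `buildingdatagenomeproject2`. The variable names `df_small_noise` and `df_daily_jumpsup` are kept for the training set (rows up to December 2016) and the test set (December 2016 to December 2017), respectively.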

In [None]:
# Read the BDG1 metadata table, index it by `uid`, and parse the two
# day-first date columns that bound each series' valid data window.
df_meta_BDG1 = (
    pd.read_csv('/kaggle/input/building-data-genome-project-v1/meta_open.csv')
    .set_index('uid')
    .assign(
        dataend=lambda meta: pd.to_datetime(meta['dataend'], dayfirst = True),
        datastart=lambda meta: pd.to_datetime(meta['datastart'], dayfirst = True),
    )
)
df_meta_BDG1

In [None]:
# Load the BDG1 timeseries with the `timestamp` column parsed as the index,
# then strip the timezone information so the index is timezone-naive.
df_powerMeter_BDG1 = pd.read_csv(
    '/kaggle/input/building-data-genome-project-v1/temp_open_utc_complete.csv',
    index_col='timestamp',
    parse_dates=True,
)
naive_timestamps = df_powerMeter_BDG1.index.tz_localize(None)
df_powerMeter_BDG1.index = naive_timestamps
df_powerMeter_BDG1

In [None]:
# Trim each column to the [datastart, dataend] window recorded in the metadata
# row whose `uid` equals the column name, then drop the timestamps so that all
# series are aligned by position (row 0 = first reading inside each window).
aligned_columns = []
for col in df_powerMeter_BDG1.columns:
    datastart = df_meta_BDG1.loc[col, 'datastart']
    dataend = df_meta_BDG1.loc[col, 'dataend']
    aligned_columns.append(
        df_powerMeter_BDG1[col].loc[datastart:dataend].reset_index(drop=True)
    )

# Concatenate once instead of growing the frame inside the loop: repeated
# pd.concat re-copies every previously added column on each iteration.
# pd.concat raises on an empty list, so fall back to an empty frame.
df_powerMeter_BDG1_align = (
    pd.concat(aligned_columns, axis=1) if aligned_columns else pd.DataFrame()
)

df_powerMeter_BDG1_align

In [None]:
# Load the BDG2 electricity readings, convert the `timestamp` column to
# datetimes, and use it as the index.
df_powerMeter_BDG2 = (
    pd.read_csv('/kaggle/input/buildingdatagenomeproject2/electricity_cleaned.csv')
    .assign(timestamp=lambda raw: pd.to_datetime(raw['timestamp']))
    .set_index('timestamp')
)
df_powerMeter_BDG2

In [None]:
# Place the BDG2 rows up to December 2016 side by side with the
# position-aligned BDG1 series. Both operands are re-indexed positionally
# first; the BDG2 timestamps are then restored as the index of the result.
bdg2_through_2016 = df_powerMeter_BDG2.loc[:'2016-12']
df_power_meter = pd.concat(
    [
        bdg2_through_2016.reset_index(drop=True),
        df_powerMeter_BDG1_align.reset_index(drop=True),
    ],
    axis=1,
)
df_power_meter.index = bdg2_through_2016.index
df_power_meter

In [None]:
# Training set: every row of the combined frame up to December 2016.
df_small_noise = df_power_meter.loc[:'2016-12'].copy()
# DataFrame.fillna(method=...) is deprecated in recent pandas releases;
# .ffill() / .bfill() perform the same forward fill followed by a backward
# fill for any gaps left at the start of a column.
df_small_noise = df_small_noise.ffill().bfill()
# Interactive preview of the first five columns.
df_small_noise.iloc[:,:5].iplot()

In [None]:
# Test set: the BDG2 rows from December 2016 through December 2017.
df_daily_jumpsup = df_powerMeter_BDG2.loc['2016-12':'2017-12'].copy()
# DataFrame.fillna(method=...) is deprecated in recent pandas releases;
# .ffill() / .bfill() perform the same forward fill followed by a backward
# fill for any gaps left at the start of a column.
df_daily_jumpsup = df_daily_jumpsup.ffill().bfill()
# Interactive preview of the first five columns.
df_daily_jumpsup.iloc[:,:5].iplot()

# Prepare training data
Get the data values from the training timeseries and normalize each column with its own mean and standard deviation. Assuming one reading per hour over a 366-day year, each series has:

24 * 366 = **8784 timesteps per year**

In [None]:
def normalize(df):
    """Standardize each column of ``df`` to zero mean and unit standard deviation.

    The input is left untouched: a new, normalized frame is returned, so
    re-running the calling cell never normalizes already-normalized data
    and the raw values stay available under their original name.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to standardize column by column.

    Returns
    -------
    tuple
        ``(normalized, mean, std)`` where ``normalized`` is a new DataFrame
        and ``mean`` / ``std`` are the per-column statistics that were
        applied (``std`` is pandas' default sample standard deviation).

    Notes
    -----
    A column with zero standard deviation is divided by zero and therefore
    comes back as NaN; this is not special-cased here.
    """
    mean = df.mean()
    centered = df - mean
    # Computed on the centered data, in the same order as before.
    std = centered.std()
    return centered / std, mean, std

# Standardize the training frame. The per-column mean and std that come back
# are kept so the test data can later be scaled with the same statistics.
training_value, training_mean, training_std = normalize(df_small_noise)

# Number of rows (timesteps) in the training frame.
df_small_noise.shape[0]

In [None]:
# Drop every column that contains a NaN after normalization, then transpose so
# that each row is one series. The result is computed once and reused; the
# original expression evaluated dropna + transpose twice.
series_matrix = training_value.dropna(axis=1).T.values
# Append a trailing feature axis: (n_series, n_timesteps) -> (n_series, n_timesteps, 1).
x_train = np.reshape(series_matrix, series_matrix.shape + (1,))
x_train.shape

# Build a model
We will build a convolutional reconstruction autoencoder model. The model will take input of shape `(batch_size, sequence_length, num_features)` and return output of the same shape. In this case, `sequence_length` is `x_train.shape[1]` (the number of timesteps in each training series) and `num_features` is 1.

In [None]:
# Settings shared by every strided convolution / transposed convolution.
strided_conv = dict(kernel_size=24, padding="same", strides=3, activation="relu")

# Convolutional reconstruction autoencoder: two strided Conv1D layers shrink
# the time axis, two strided Conv1DTranspose layers expand it again, and a
# final single-filter Conv1DTranspose maps back to one feature per timestep.
model = keras.Sequential(
    [
        layers.Input(shape=x_train.shape[1:3]),
        layers.BatchNormalization(),
        # Encoder
        layers.Conv1D(filters=32, **strided_conv),
        layers.Dropout(rate=0.05),
        layers.Conv1D(filters=16, **strided_conv),
        # Decoder
        layers.Conv1DTranspose(filters=16, **strided_conv),
        layers.Dropout(rate=0.05),
        layers.Conv1DTranspose(filters=32, **strided_conv),
        # Output layer: no stride and no activation.
        layers.Conv1DTranspose(filters=1, kernel_size=24, padding="same"),
    ]
)
model.compile(loss="mse", optimizer=keras.optimizers.Adam(learning_rate=0.001))
model.summary()

# Train the model
Please note that we are using `x_train` as both the input and the target since this is a reconstruction model.

In [None]:
# Stop training once the validation loss has failed to improve for five
# consecutive epochs.
stop_when_stalled = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, mode="min"
)

# Reconstruction objective: the input doubles as the target. Half of the
# samples are held out for validation.
history = model.fit(
    x=x_train,
    y=x_train,
    batch_size=128,
    epochs=50,
    validation_split=0.5,
    callbacks=[stop_when_stalled],
)

Let's plot training and validation loss to see how the training went.

In [None]:
# Draw one curve per recorded metric, training loss first.
for metric, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[metric], label=curve_label)
plt.legend()

# Detecting anomalies
We will detect anomalies by determining how well our model can reconstruct the input data.
1. Find MAE loss on training samples.
2. Find max MAE loss value. This is the worst our model has performed trying to reconstruct a sample. We will make this the `threshold` for anomaly detection.
3. If the reconstruction loss for a sample is greater than this `threshold` value then we can infer that the model is seeing a pattern that it isn't familiar with. We will label this sample as an `anomaly`.

In [None]:
# Reconstruct the training set and compute, for each sample, the mean
# absolute error along the time axis.
x_train_pred = model.predict(x_train)
abs_error = np.abs(x_train_pred - x_train)
train_mae_loss = abs_error.mean(axis=1)

# Distribution of the per-sample reconstruction error.
plt.hist(train_mae_loss, bins=50)
plt.xlabel("Train MAE loss")
plt.ylabel("No of samples")
plt.show()

# The largest training error is used as the anomaly threshold.
threshold = train_mae_loss.max()
print("Reconstruction error threshold: ", threshold)

## Compare reconstruction
Just for fun, let's see how our model has reconstructed some of the training samples. The cell below plots every 100th training series (indices 0, 100, 200, …) together with its reconstruction.

In [None]:
# Check how well the training series are reconstructed by plotting every
# 100th sample against the model output. The upper bound is capped at the
# number of available samples so the loop cannot index past the end of
# x_train when it holds fewer than 1401 series.
for idx in range(0, min(1500, len(x_train)), 100):
    pd.concat(
        [
            pd.Series(x_train[idx].flatten()).rename('x_train'),
            pd.Series(x_train_pred[idx].flatten()).rename('x_train_pred'),
        ],
        axis=1,
    ).iplot()