In [1]:
import sys
import os
import time
import glob
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Record the interpreter and library versions so the outputs below can be
# tied to a specific environment.
print('python version', sys.version_info)
print('tf version', tf.__version__, 'keras version', keras.__version__)

python version sys.version_info(major=3, minor=8, micro=5, releaselevel='final', serial=0)
tf version 2.2.0 keras version 2.3.0-tf


# Data Source folders

In [3]:
# Root folder of the Yahoo labeled time-series anomaly dataset (relative path).
YAHOO_DS = "../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0"

# Benchmark sub-folder name -> glob pattern selecting the files inside it.
DIRS_FILE_EXTENSIONS = {
    'A1Benchmark': "*.csv",
    'A2Benchmark': "*.csv",
    'A3Benchmark': "*TS*.csv",
    'A4Benchmark': "*TS*.csv",
}

print("benchmark folders and file extensions")
for DIR, extension in DIRS_FILE_EXTENSIONS.items():
    # Joining with a trailing empty component yields "<root><sep><dir><sep>".
    Benchmark_dir = os.path.sep.join((YAHOO_DS, DIR, ""))
    print("{} .. file extensions {}".format(Benchmark_dir, extension))

benchmark folders and file extensions
../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/ .. file extensions *.csv
../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0/A2Benchmark/ .. file extensions *.csv
../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0/A3Benchmark/ .. file extensions *TS*.csv
../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/ .. file extensions *TS*.csv


### Functions

In [4]:
def read_data_with_labels(file, timeVariantColumns, labelColumnNum):
    """Load a CSV file and pair the selected series columns with the label column.

    Every column of the file is cast to float64, so the file must be fully numeric.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.
    timeVariantColumns : list of column names holding the time-series values.
    labelColumnNum : positional (0-based) index of the label column in the file.

    Returns
    -------
    (tsDataWithLabels, data) where ``tsDataWithLabels`` is the selected columns
    with the label appended as the last column, and ``data`` is the whole file
    as a float64 array.
    """
    frame = pd.read_csv(file)
    fullData = frame.values.astype('float64')
    seriesValues = frame[timeVariantColumns].values.astype('float64')
    # Keep the label as a column vector so it can be stacked next to the series.
    labelColumn = fullData[:, labelColumnNum].reshape((-1, 1))
    return np.hstack((seriesValues, labelColumn)), fullData

In [5]:
def scale(data):
    """Min-max scale every column of ``data`` into the range [0, 1].

    Returns the fitted ``MinMaxScaler`` together with the transformed data, in
    that order. The scaler is fitted on exactly the data passed in.
    """
    minMaxScaler = MinMaxScaler(feature_range=(0, 1))
    scaledData = minMaxScaler.fit_transform(data)
    return minMaxScaler, scaledData

In [6]:
def split_data_set(dataset, split=0.67):
    """Split ``dataset`` along its first axis without shuffling.

    The first ``int(len(dataset) * split)`` rows form the train part and the
    remaining rows form the test part. ``dataset`` must have at least two
    dimensions because it is indexed as ``dataset[rows, :]``.

    Returns ``(train, test)``.
    """
    cut = int(len(dataset) * split)
    return dataset[:cut, :], dataset[cut:, :]

In [7]:
def look_back_and_create_dataset(tsDataWithLabels, look_back = 1):
    """Build sliding-window samples for one-step-ahead prediction.

    ``tsDataWithLabels`` is a 2D array whose last column is the label. For each
    row index ``t`` from ``look_back`` to the end, the sample is the
    ``look_back`` rows before ``t`` (label column dropped) and the target is
    row ``t`` itself (label column kept as the last column).

    Returns
    -------
    X : array of shape (n_samples, look_back, n_features)
    Y : array of shape (n_samples, n_features + 1); its last column is the label.
    """
    targetRows = range(look_back, len(tsDataWithLabels))
    windows = [tsDataWithLabels[t - look_back:t, :-1] for t in targetRows]
    targets = [tsDataWithLabels[t] for t in targetRows]
    return np.array(windows), np.array(targets)

In [8]:
def get_train_validation(Xtrain, Ytrain, validation_ratio=0.1):
    """Carve a validation set off the front of the training data.

    The first ``int(len(Xtrain) * validation_ratio)`` samples become the
    validation set; everything after them stays in the training set.

    Returns ``(Xtrain, Ytrain, Xvalid, Yvalid)``.
    """
    n_valid = int(len(Xtrain) * validation_ratio)
    return Xtrain[n_valid:], Ytrain[n_valid:], Xtrain[:n_valid], Ytrain[:n_valid]

In [9]:
def get_deviations(model, X, Y):
    """Return the absolute error between ``Y`` and ``model.predict(X)``.

    Also prints the per-column minimum and maximum deviation. ``Y`` is
    subtracted element-wise from the prediction, so the two must have
    compatible shapes.
    """
    predictions = model.predict(X)
    deviations = np.absolute(Y - predictions)
    print("Deviation Min {}, Max {}".format(np.min(deviations, axis=0), np.max(deviations, axis=0)))
    return deviations

In [10]:
def get_records_above_deviation_pctile(model, X, Y, pctile=95):
    """Flag samples whose prediction error exceeds a percentile threshold.

    The absolute deviations of ``model.predict(X)`` from ``Y`` are computed,
    the ``pctile``-th percentile is taken per column, and every deviation
    strictly above that value is labelled 1 (others 0). The threshold and the
    resulting label counts are printed.

    Returns an integer array of 0/1 labels with the same shape as the deviations.
    """
    deviations = get_deviations(model, X, Y)
    threshold = np.percentile(deviations, q=pctile, axis=0)
    print("Deviation {}th pctile {}".format(pctile, threshold))
    labels = (deviations > threshold).astype('int')
    labelCounts = np.unique(labels, return_counts = True)
    print("Deviation > {}th pctile is_anomaly labels in data {}".format(pctile, labelCounts))
    return labels

In [11]:
def get_classification_metrics(actual, predicted):
    """Score predicted labels against the actual labels.

    Returns ``(confusion_matrix, precision, recall, f1)`` as computed by the
    corresponding scikit-learn metric functions.
    """
    matrix = confusion_matrix(actual, predicted)
    precision = precision_score(actual, predicted)
    recall = recall_score(actual, predicted)
    f1 = f1_score(actual, predicted)
    return matrix, precision, recall, f1

### Read data

In [12]:
# Names of the CSV columns that hold the time-series values fed to the model.
timeVariantColumns = ['value']
# Positional (0-based) index of the anomaly-label column in the CSV file.
labelColumnNum = 2
# Number of past time steps used as input for each one-step-ahead prediction.
look_back=24

In [13]:
# Single benchmark series analysed in this notebook.
# NOTE(review): this repeats the dataset root already stored in YAHOO_DS; keep the two in sync.
file_name = "../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0/A2Benchmark/synthetic_10.csv"

In [14]:
# tsDataWithLabels: selected value column(s) plus the label as last column.
# data: every column of the file, used later to recover the actual labels.
tsDataWithLabels, data = read_data_with_labels(file_name, timeVariantColumns, labelColumnNum)
print("Shapes: time variant data with labels {}, full data {}".format(tsDataWithLabels.shape, data.shape))

Shapes: time variant data with labels (1421, 2), full data (1421, 3)


### Scale data

In [15]:
# Min-max scale values and label column together into [0, 1].
# NOTE(review): the scaler is fitted on the whole series, i.e. before the
# train/test split below, so test-set statistics influence the scaling.
scaler, tsDataScaled = scale(tsDataWithLabels)
tsDataScaled.shape

(1421, 2)

### Get look back data in the 3D array shape

In [16]:
# Turn the scaled series into sliding windows: lookbackX holds the previous
# `look_back` steps (label dropped), lookbackY holds the target row with its label.
lookbackX, lookbackY = look_back_and_create_dataset(tsDataScaled, look_back=look_back)
print("Look back data shapes: lookbackX {} lookbackY {}".format(lookbackX.shape, lookbackY.shape))
# Uncomment to inspect the first 10 windows and their targets:
# print("Look back data ... first 10 \n")
# for i in range (10):
#     print(lookbackX[i], lookbackX[i].shape, lookbackY[i], lookbackY[i].shape)

Look back data shapes: lookbackX (1397, 24, 1) lookbackY (1397, 2)


### Split into train and test

In [17]:
# Chronological 80/20 split (no shuffling); the same ratio is applied to X and Y
# so the samples stay aligned.
Xtrain_full, Xtest = split_data_set(lookbackX, split=0.8)
Ytrain_full, Ytest = split_data_set(lookbackY[:, :-1], split=0.8) # drop the last (label) column: the model only predicts values

Xtrain_full.shape, Ytrain_full.shape, Xtest.shape, Ytest.shape

((1117, 24, 1), (1117, 1), (280, 24, 1), (280, 1))

### Train

Split full train set into train and validation sets

In [18]:
# The first 10% of the full training set is held out for validation;
# the remaining samples are used for fitting.
Xtrain, Ytrain, Xvalid, Yvalid = get_train_validation(Xtrain_full, Ytrain_full, validation_ratio=0.1)
Xtrain.shape, Ytrain.shape, Xvalid.shape, Yvalid.shape

((1006, 24, 1), (1006, 1), (111, 24, 1), (111, 1))

In [19]:
# Per-sample input shape for the network; the leading sample axis is omitted.
input_shape = (Xtrain.shape[1], Xtrain.shape[2]) # (n_steps, n_features)
input_shape

(24, 1)

In [21]:
# Note: when LSTM layers are stacked, every layer except the last returns the
# full sequence so that the next LSTM layer still receives sequence input.
def baseline_model(input_shape, learning_rate):
    """Create a model-building function bound to ``input_shape`` and ``learning_rate``.

    The returned ``build_model`` exposes ``n_hidden`` and ``n_units`` as keyword
    arguments so they can be supplied by a hyper-parameter search.

    Parameters
    ----------
    input_shape : tuple ``(n_steps, n_features)`` describing one input sample.
    learning_rate : learning rate handed to the Adam optimizer.

    Returns
    -------
    A callable ``build_model(input_shape, n_hidden, n_units, learning_rate)``
    that returns a compiled Keras ``Sequential`` model with ``n_hidden`` LSTM
    layers of ``n_units`` units followed by a single-unit ``Dense`` output,
    trained with mean squared error.
    """
    def build_model(input_shape=input_shape, n_hidden = 1, n_units = 50, learning_rate = learning_rate):
        model = keras.models.Sequential()
        model.add(keras.layers.InputLayer(input_shape=input_shape))
        for _ in range(n_hidden - 1):
            # Intermediate LSTM layers pass the whole sequence to the next layer.
            model.add(keras.layers.LSTM(n_units, return_sequences = True, activation = 'relu'))
        # The last LSTM layer returns only its final output.
        model.add(keras.layers.LSTM(n_units, activation = 'relu'))
        model.add(keras.layers.Dense(1))
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(loss="mse", optimizer=optimizer)
        return model
    return build_model

In [22]:
# Hyper-parameter values the randomized search samples from.
param_distribs = {
    "n_hidden": list(range(1, 3)),  # up to 2 hidden layers
    "n_units": list(range(5, 6)),   # 5 hidden layer units/neurons
}

# Randomized search: number of sampled parameter settings and CV folds.
n_iter = 1
cv = 5

# Training settings shared by every fit.
learning_rate = 0.001
batch_size = 32
epochs = 10
early_stop_patience = 5

verbosity = 1

# Deviations above this percentile are labelled as anomalies.
pctile = 99.5

In [23]:
# Seed used for every random number generator involved in training, so that a
# Restart & Run All reproduces the results as far as the backend allows.
SEED = 42

# Wrap the Keras model builder so scikit-learn can drive the hyper-parameter search.
regressor = keras.wrappers.scikit_learn.KerasRegressor(build_fn = baseline_model(input_shape=input_shape, 
                                                                                 learning_rate=learning_rate))

# Stop a fit when the monitored loss has not improved for `early_stop_patience`
# epochs and roll back to the best weights seen.
early_stopping_cb = keras.callbacks.EarlyStopping(patience=early_stop_patience, restore_best_weights = True)

keras.backend.clear_session()

# Seed after clearing the session so the seeds apply to the models built below.
np.random.seed(SEED)
tf.random.set_seed(SEED)

rnd_search_cv = RandomizedSearchCV(regressor, param_distribs, n_iter = n_iter, cv = cv, verbose = verbosity,
                                   random_state = SEED)

# Extra keyword arguments are forwarded to the Keras fit of every candidate.
rnd_search_cv.fit(Xtrain, Ytrain, epochs=epochs, batch_size=batch_size, validation_data=(Xvalid, Yvalid), 
          callbacks=[early_stopping_cb], verbose=verbosity)

model = rnd_search_cv.best_estimator_.model
# best_score_ is negated so that it is reported as a positive loss value.
print("Best parameters {} best score {}:".format(rnd_search_cv.best_params_, -rnd_search_cv.best_score_))

trainMSE = model.evaluate(Xtrain_full, Ytrain_full, verbose = verbosity)
print("Train Score: {0:.5f} MSE {1:.5f} RMSE".format(trainMSE, np.sqrt(trainMSE)))
testMSE = model.evaluate(Xtest, Ytest, verbose = verbosity)
print("Test Score: {0:.5f} MSE {1:.5f} RMSE".format(testMSE, np.sqrt(testMSE)))

# get deviations for whole dataset and id records with deviations > pctile threshold and assign an is_anomaly label
predictedLabels = get_records_above_deviation_pctile(model, lookbackX, lookbackY[:, :-1], pctile)

# actual is_anomaly labels in dataset; the first `look_back` rows have no
# prediction, so they are skipped to stay aligned with predictedLabels
actualLabels = (data[look_back:, labelColumnNum] != 0.0).astype('int')    
print("Actual is_anomaly labels in data", np.unique(actualLabels, return_counts = True))
conf_matrix, prec, recall, f1 = get_classification_metrics(actualLabels, predictedLabels)
print("Confusion matrix \n{0}\nprecision {1:.5f}, recall {2:.5f}, f1 {3:.5f}".format(conf_matrix, prec, recall, f1))


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Best parameters {'n_units': 5, 'n_hidden': 2} best score 0.0069167348323389884:
Train Score: 0.00210 MSE 0.04583 RMSE
Test Score: 0.00180 MSE 0.04239 RMSE
Deviation Min [9.76080236e-05], Max [0.25483835]
Deviation 99.5th pctile [0.13183259]
Deviation > 99.5th pctile is_anomaly labels in data (array([0, 1]), array([1390,    7]))
Actual is_anomaly labels in data (array([0, 1]), array([1393,    4]))
Confusion matrix 
[[1390    3]
 [   0    4]]
precision 0.57143, recall 1.00000, f1 0.72727
