In [1]:
import sys
import os
import time
import glob
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Print the interpreter version and the TensorFlow / bundled Keras versions in use.
print('python version', sys.version_info)
print('tf version', tf.__version__, 'keras version', keras.__version__)

python version sys.version_info(major=3, minor=8, micro=5, releaselevel='final', serial=0)
tf version 2.2.0 keras version 2.3.0-tf


In [3]:
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Data Source folders

In [4]:
# Root folder of the Yahoo labeled time-series anomalies dataset (relative path).
YAHOO_DS = "../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0"

# Benchmark sub-folder -> file-name pattern for that folder
# (the message below calls these "file extensions").
DIRS_FILE_EXTENSIONS = {
    'A1Benchmark': "*.csv",
    'A2Benchmark': "*.csv",
    'A3Benchmark': "*TS*.csv",
    'A4Benchmark': "*TS*.csv",
}

print("benchmark folders and file extensions")
for DIR, extension in DIRS_FILE_EXTENSIONS.items():
    # The empty last component makes join() append a trailing path separator.
    Benchmark_dir = os.path.join(YAHOO_DS, DIR, "")
    print("{} .. file extensions {}".format(Benchmark_dir, extension))

benchmark folders and file extensions
../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0/A1Benchmark/ .. file extensions *.csv
../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0/A2Benchmark/ .. file extensions *.csv
../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0/A3Benchmark/ .. file extensions *TS*.csv
../Stochastic-Methods/data/yahoo/dataset/ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/ .. file extensions *TS*.csv


In [5]:
# Load the NASA shuttle training file. It is read as space-separated, and
# because `names` is supplied the first line is treated as data, not a header.
colnames = ['time', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'class']
nasa = pd.read_csv(
    "../Stochastic-Methods/data/nasa/shuttle.trn/shuttle.trn",
    sep=" ",
    names=colnames,
)
# Integer copy of the 'time' column; compare row count with number of distinct values.
times = nasa['time'].to_numpy().astype(int)
(times.shape, np.unique(times).shape)

((43500,), (72,))

In [6]:
def read_data_with_labels(file, timeVariantColumns, labelColumnNum):
    """Read a CSV and pair its time-variant columns with the label column.

    Parameters
    ----------
    file : str, path object or file-like
        Passed straight to ``pandas.read_csv``.
    timeVariantColumns : str or list of str
        Name(s) of the column(s) holding the time-series values. A single
        column may be given as a plain string; it is treated as a one-element
        list so the result is always 2-D.
    labelColumnNum : int
        Positional index of the label column within the full CSV.

    Returns
    -------
    tsDataWithLabels : numpy.ndarray
        float64 array of the selected columns with the label appended as the
        last column.
    data : numpy.ndarray
        float64 array of every column in the CSV (all columns must therefore
        be convertible to float).
    """
    # A bare string would select a Series (1-D values) and make hstack fail
    # against the 2-D label column, so normalise it to a list first.
    if isinstance(timeVariantColumns, str):
        timeVariantColumns = [timeVariantColumns]

    df = pd.read_csv(file)
    data = df.values.astype('float64')
    tsData = df[timeVariantColumns].values.astype('float64')
    # Column vector so it can be stacked to the right of tsData.
    labels = data[:, labelColumnNum].reshape((-1, 1))
    tsDataWithLabels = np.hstack((tsData, labels))
    return tsDataWithLabels, data

# Get look back dataset 

In [45]:
# Input is expected to be a 2D array whose last column is the label.
# Returns look-back X and Y; the last column of the returned Y is the label.
# Only the one-step-ahead prediction setting is supported.

def look_back_and_create_dataset(tsDataWithLabels, look_back = 1):
    """Build sliding look-back windows and their one-step-ahead targets.

    Parameters
    ----------
    tsDataWithLabels : array-like, shape (n_rows, n_features + 1)
        Time-ordered rows; the last column is the label and is excluded from
        the look-back windows.
    look_back : int, default 1
        Number of consecutive rows in each window. Must be >= 1.

    Returns
    -------
    lookbackTsDataX : numpy.ndarray, shape (n_windows, look_back, n_features)
        Window ``k`` holds the feature columns of rows ``k .. k+look_back-1``.
    lookbackTsDataYAndLabel : numpy.ndarray, shape (n_windows, n_features + 1)
        Row ``k`` is the full input row ``k + look_back`` (features and label).

    ``n_windows`` is ``max(n_rows - look_back, 0)``; when the series is too
    short the returned arrays are empty but keep their 3-D / 2-D shapes.

    Raises
    ------
    ValueError
        If ``look_back`` is smaller than 1.
    """
    if look_back < 1:
        raise ValueError("look_back must be >= 1, got {}".format(look_back))

    series = np.asarray(tsDataWithLabels)
    features = series[:, :-1]  # every column except the trailing label
    n_windows = max(len(series) - look_back, 0)

    # Pre-allocate so the result keeps its full shape even with zero windows.
    lookbackTsDataX = np.empty(
        (n_windows, look_back, features.shape[1]), dtype=series.dtype
    )
    for start in range(n_windows):
        lookbackTsDataX[start] = features[start:start + look_back]

    # Target for window `start` is the row right after it; copy so the result
    # does not alias the caller's array.
    lookbackTsDataYAndLabel = series[look_back:].copy()
    return lookbackTsDataX, lookbackTsDataYAndLabel

## univariate

In [46]:
# Column holding the series values, and the positional index of the label column.
timeVariantColumns = ['value']
labelColumnNum = 2
# Build the path from the configured dataset root instead of repeating it.
file_name = os.path.join(YAHOO_DS, "A2Benchmark", "synthetic_10.csv")
tsDataWithLabels, data = read_data_with_labels(file_name, timeVariantColumns, labelColumnNum)
print("Shapes: time variant data with labels {}, full data {}".format(tsDataWithLabels.shape, data.shape))

Shapes: time variant data with labels (1421, 2), full data (1421, 3)


In [47]:
# look back and create reshaped dataset
lookbackX, lookbackY = look_back_and_create_dataset(tsDataWithLabels, look_back=5)
print("Look back data shapes: lookbackX {} lookbackY {}".format(lookbackX.shape, lookbackY.shape))
print("Look back data ... first 10 \n")
for i in range (10):
    print(lookbackX[i], lookbackX[i].shape, lookbackY[i], lookbackY[i].shape)

Look back data shapes: lookbackX (1416, 5, 1) lookbackY (1416, 2)
Look back data ... first 10 

[[127.69923816]
 [ 23.06961419]
 [246.1188566 ]
 [295.93781848]
 [264.6499858 ]] (5, 1) [122.23286746   0.        ] (2,)
[[ 23.06961419]
 [246.1188566 ]
 [295.93781848]
 [264.6499858 ]
 [122.23286746]] (5, 1) [238.54006612   0.        ] (2,)
[[246.1188566 ]
 [295.93781848]
 [264.6499858 ]
 [122.23286746]
 [238.54006612]] (5, 1) [356.59069416   0.        ] (2,)
[[295.93781848]
 [264.6499858 ]
 [122.23286746]
 [238.54006612]
 [356.59069416]] (5, 1) [439.19625187   0.        ] (2,)
[[264.6499858 ]
 [122.23286746]
 [238.54006612]
 [356.59069416]
 [439.19625187]] (5, 1) [735.37467526   0.        ] (2,)
[[122.23286746]
 [238.54006612]
 [356.59069416]
 [439.19625187]
 [735.37467526]] (5, 1) [621.57485665   0.        ] (2,)
[[238.54006612]
 [356.59069416]
 [439.19625187]
 [735.37467526]
 [621.57485665]] (5, 1) [514.83835704   0.        ] (2,)
[[356.59069416]
 [439.19625187]
 [735.37467526]
 [621.574

In [48]:
# Inputs are the look-back windows; targets are the next-step rows with the
# trailing label column removed.
X, Y = lookbackX, lookbackY[:, :-1]
(X.shape, Y.shape)

((1416, 5, 1), (1416, 1))

In [49]:
# Preview at most the first 10 (window, target) pairs; capped at len(X) so a
# short dataset does not raise IndexError.
for i in range(min(10, len(X))):
    print(X[i], X[i].shape, Y[i], Y[i].shape)

[[127.69923816]
 [ 23.06961419]
 [246.1188566 ]
 [295.93781848]
 [264.6499858 ]] (5, 1) [122.23286746] (1,)
[[ 23.06961419]
 [246.1188566 ]
 [295.93781848]
 [264.6499858 ]
 [122.23286746]] (5, 1) [238.54006612] (1,)
[[246.1188566 ]
 [295.93781848]
 [264.6499858 ]
 [122.23286746]
 [238.54006612]] (5, 1) [356.59069416] (1,)
[[295.93781848]
 [264.6499858 ]
 [122.23286746]
 [238.54006612]
 [356.59069416]] (5, 1) [439.19625187] (1,)
[[264.6499858 ]
 [122.23286746]
 [238.54006612]
 [356.59069416]
 [439.19625187]] (5, 1) [735.37467526] (1,)
[[122.23286746]
 [238.54006612]
 [356.59069416]
 [439.19625187]
 [735.37467526]] (5, 1) [621.57485665] (1,)
[[238.54006612]
 [356.59069416]
 [439.19625187]
 [735.37467526]
 [621.57485665]] (5, 1) [514.83835704] (1,)
[[356.59069416]
 [439.19625187]
 [735.37467526]
 [621.57485665]
 [514.83835704]] (5, 1) [529.76966501] (1,)
[[439.19625187]
 [735.37467526]
 [621.57485665]
 [514.83835704]
 [529.76966501]] (5, 1) [665.57120413] (1,)
[[735.37467526]
 [621.574856

## multivariate

In [50]:
# Two toy input series (10..150 and 15..155 in steps of 10) and their
# element-wise sum as the third series.
in_seq1 = np.arange(10, 151, 10)
in_seq2 = np.arange(15, 156, 10)
out_seq = in_seq1 + in_seq2
# Alternating 0/1 label, already shaped as a column vector.
labels = np.array([0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]).reshape(-1, 1)
# Turn each series into a [rows, 1] column ...
in_seq1 = in_seq1.reshape(-1, 1)
in_seq2 = in_seq2.reshape(-1, 1)
out_seq = out_seq.reshape(-1, 1)
# ... and place the columns side by side, label last.
mvdataWithLabels = np.hstack((in_seq1, in_seq2, out_seq, labels))

In [51]:
# Display the stacked array's shape followed by its contents.
(mvdataWithLabels.shape, mvdataWithLabels)

((15, 4),
 array([[ 10,  15,  25,   0],
        [ 20,  25,  45,   1],
        [ 30,  35,  65,   0],
        [ 40,  45,  85,   1],
        [ 50,  55, 105,   0],
        [ 60,  65, 125,   1],
        [ 70,  75, 145,   0],
        [ 80,  85, 165,   1],
        [ 90,  95, 185,   0],
        [100, 105, 205,   1],
        [110, 115, 225,   0],
        [120, 125, 245,   1],
        [130, 135, 265,   0],
        [140, 145, 285,   1],
        [150, 155, 305,   0]]))

In [55]:
# Build look-back windows over the multivariate toy data.
lookbackX, lookbackY = look_back_and_create_dataset(mvdataWithLabels, look_back=5)
# Report the shape of `lookbackY`, the name assigned on the line above.
print("Look back data shapes: lookbackX {} lookbackY {}".format(lookbackX.shape, lookbackY.shape))
print("Look back lookBackTsData ... \n")
for i in range(len(lookbackX)):
    print(lookbackX[i], lookbackX[i].shape, lookbackY[i], lookbackY[i].shape)

Look back data shapes: lookbackX (10, 5, 3) lookbackY (10, 4)
Look back lookBackTsData ... 

[[ 10  15  25]
 [ 20  25  45]
 [ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]] (5, 3) [ 60  65 125   1] (4,)
[[ 20  25  45]
 [ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]] (5, 3) [ 70  75 145   0] (4,)
[[ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]] (5, 3) [ 80  85 165   1] (4,)
[[ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]] (5, 3) [ 90  95 185   0] (4,)
[[ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]] (5, 3) [100 105 205   1] (4,)
[[ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]
 [100 105 205]] (5, 3) [110 115 225   0] (4,)
[[ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]
 [100 105 205]
 [110 115 225]] (5, 3) [120 125 245   1] (4,)
[[ 80  85 165]
 [ 90  95 185]
 [100 105 205]
 [110 115 225]
 [120 125 245]] (5, 3) [130 135 265   0] (4,)
[[ 90  95 185]
 [100 105 205]
 [110 115 225]
 [120 125 245]

In [56]:
X = lookbackX
# Targets come from `lookbackY` (the array produced together with lookbackX),
# with the trailing label column dropped.
Y = lookbackY[:, :-1] # exclude label
X.shape, Y.shape

((10, 5, 3), (10, 3))

In [57]:
# Print every look-back window next to its target row, with both shapes.
for i in range(X.shape[0]):
    print(X[i], X[i].shape, Y[i], Y[i].shape)

[[ 10  15  25]
 [ 20  25  45]
 [ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]] (5, 3) [ 60  65 125] (3,)
[[ 20  25  45]
 [ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]] (5, 3) [ 70  75 145] (3,)
[[ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]] (5, 3) [ 80  85 165] (3,)
[[ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]] (5, 3) [ 90  95 185] (3,)
[[ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]] (5, 3) [100 105 205] (3,)
[[ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]
 [100 105 205]] (5, 3) [110 115 225] (3,)
[[ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]
 [100 105 205]
 [110 115 225]] (5, 3) [120 125 245] (3,)
[[ 80  85 165]
 [ 90  95 185]
 [100 105 205]
 [110 115 225]
 [120 125 245]] (5, 3) [130 135 265] (3,)
[[ 90  95 185]
 [100 105 205]
 [110 115 225]
 [120 125 245]
 [130 135 265]] (5, 3) [140 145 285] (3,)
[[100 105 205]
 [110 115 225]
 [120 125 245]
 [130 135 265]
 [140 145 285]] (5, 3)

## NASA SMAP/MSL telemetry data

In [22]:
# l_s: number of time steps in each input window passed to shape_data.
# n_predictions: number of following time steps used as the target.
l_s, n_predictions = 250, 10

In [23]:
def shape_data(arr, l_s, n_predictions) :
    """Slice a multichannel series into overlapping input/target windows.

    Parameters
    ----------
    arr : array-like, shape (n_steps, n_features)
        Time-ordered rows; column 0 is the value used as the target.
    l_s : int
        Number of time steps in each input window.
    n_predictions : int
        Number of time steps following the input window used as the target.

    Returns
    -------
    data : numpy.ndarray, shape (n_windows, l_s + n_predictions, n_features)
        The full windows.
    X : numpy.ndarray, shape (n_windows, l_s, n_features)
        First ``l_s`` steps of every window.
    Y : numpy.ndarray, shape (n_windows, n_predictions)
        Column 0 of the last ``n_predictions`` steps of every window.

    ``n_windows`` is ``max(len(arr) - l_s - n_predictions, 0)``. Note that the
    window starting at ``len(arr) - l_s - n_predictions`` is not emitted; this
    count is kept as-is so existing output shapes do not change. When ``arr``
    is shorter than one window, correctly-shaped empty arrays are returned.
    """
    arr = np.asarray(arr)
    window_len = l_s + n_predictions
    n_windows = max(len(arr) - window_len, 0)

    # Pre-allocate so the result stays 3-D even when there are no windows.
    data = np.empty((n_windows, window_len) + arr.shape[1:], dtype=arr.dtype)
    for start in range(n_windows):
        data[start] = arr[start:start + window_len]

    # Split at l_s rather than with -n_predictions: a negative-zero slice
    # (n_predictions == 0) would give an empty X and a full-window Y.
    X = data[:, :l_s, :]
    Y = data[:, l_s:, 0]  # telemetry value is at position 0

    return data, X, Y

In [24]:
for channel in ['A-1']:
    # Load the train and test arrays for this channel (train first, then test).
    train, test = (
        np.load(os.path.join("data", split, "{}.npy".format(channel)))
        for split in ("train", "test")
    )
    print(train.shape, test.shape)

    # Window both splits with the shared l_s / n_predictions settings.
    (reshaped_train, X_train, Y_train), (reshaped_test, X_test, Y_test) = (
        shape_data(split_arr, l_s, n_predictions) for split_arr in (train, test)
    )
    print(*(
        part.shape
        for part in (reshaped_train, X_train, Y_train, reshaped_test, X_test, Y_test)
    ))

(2880, 25) (8640, 25)
(2620, 260, 25) (2620, 250, 25) (2620, 10) (8380, 260, 25) (8380, 250, 25) (8380, 10)


In [12]:
# Distinct values and their counts for each column of the channel's training array.
# NOTE(review): the previous name `A1_train` is not defined anywhere in this
# notebook; `train` (loaded above for channel 'A-1', shape (2880, 25)) matches
# the recorded output of 25 columns with 2880 rows each — confirm.
for i in range(train.shape[1]) :
    print(np.unique(train[:, i], return_counts=True))

(array([0.999]), array([2880]))
(array([0., 1.]), array([2776,  104]))
(array([0., 1.]), array([2853,   27]))
(array([0., 1.]), array([2798,   82]))
(array([0., 1.]), array([2870,   10]))
(array([0., 1.]), array([2388,  492]))
(array([0., 1.]), array([2479,  401]))
(array([0., 1.]), array([2876,    4]))
(array([0., 1.]), array([2879,    1]))
(array([0., 1.]), array([2878,    2]))
(array([0.]), array([2880]))
(array([0., 1.]), array([2876,    4]))
(array([0.]), array([2880]))
(array([0., 1.]), array([2877,    3]))
(array([0., 1.]), array([2877,    3]))
(array([0.]), array([2880]))
(array([0.]), array([2880]))
(array([0., 1.]), array([2798,   82]))
(array([0., 1.]), array([2798,   82]))
(array([0., 1.]), array([2839,   41]))
(array([0., 1.]), array([2875,    5]))
(array([0., 1.]), array([2769,  111]))
(array([0., 1.]), array([2772,  108]))
(array([0.]), array([2880]))
(array([0.]), array([2880]))
