In [1]:
import sys
import os
import time
import glob
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Record the interpreter and library versions next to the results below.
print('python version', sys.version_info)
print('tf version', tf.__version__, 'keras version', keras.__version__)

python version sys.version_info(major=3, minor=8, micro=5, releaselevel='final', serial=0)
tf version 2.2.0 keras version 2.3.0-tf


In [None]:
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Get look back dataset 

In [3]:
# input expected to be a 2D array with last column being label
# Returns the look-back X and Y arrays; the last column of the returned Y array is the label
# Only one step ahead prediction setting is expected.

def look_back_and_create_dataset(tsDataWithLabels, look_back = 1):
    """Build sliding-window samples for one-step-ahead prediction.

    Parameters
    ----------
    tsDataWithLabels : array-like, 2D
        Rows in time order. The last column is treated as the label and is
        left out of the X windows.
    look_back : int
        Number of preceding rows that make up each window. Must not be negative.

    Returns
    -------
    (X, Y) : tuple of numpy arrays
        X has shape (n_rows - look_back, look_back, n_columns - 1): for each
        row i >= look_back, the rows i-look_back .. i-1 without the last column.
        Y has shape (n_rows - look_back, n_columns): the complete row i,
        so its last column is still the label.
        When there are not enough rows for a single window, both arrays are
        empty but keep these dimensions, so slicing such as Y[:, :-1] still works.

    Raises
    ------
    ValueError
        If look_back is negative.
    """
    # Accept lists / nested sequences as well as arrays.
    tsDataWithLabels = np.asarray(tsDataWithLabels)
    if look_back < 0:
        raise ValueError("look_back must be non-negative, got {}".format(look_back))

    features = tsDataWithLabels[:, :-1]
    n_rows = len(tsDataWithLabels)

    if n_rows <= look_back:
        # No complete window fits: return correctly shaped empty arrays rather
        # than 1-D empties that would break 2-D/3-D indexing in the caller.
        emptyX = np.empty((0, look_back, features.shape[1]), dtype=tsDataWithLabels.dtype)
        emptyY = np.empty((0, tsDataWithLabels.shape[1]), dtype=tsDataWithLabels.dtype)
        return emptyX, emptyY

    lookbackTsDataX = np.stack([features[i - look_back:i] for i in range(look_back, n_rows)])
    # The target for window i is simply row i, so one slice replaces the per-row appends.
    lookbackTsDataYAndLabel = tsDataWithLabels[look_back:].copy()
    return lookbackTsDataX, lookbackTsDataYAndLabel

## multivariate

In [None]:
# Two toy input series and a target series that is their element-wise sum.
in_seq1 = np.arange(10, 151, 10)
in_seq2 = in_seq1 + 5
out_seq = in_seq1 + in_seq2
# Alternating 0/1 dummy labels, one per time step, already shaped as a column.
labels = (np.arange(in_seq1.size) % 2).reshape((-1, 1))
# Turn each series into an (n_rows, 1) column.
in_seq1, in_seq2, out_seq = (seq.reshape((-1, 1)) for seq in (in_seq1, in_seq2, out_seq))
# Put the columns side by side; the label goes last.
mvdataWithLabels = np.concatenate((in_seq1, in_seq2, out_seq, labels), axis=1)

In [None]:
# Display the shape of the stacked toy array together with its contents.
mvdataWithLabels.shape, mvdataWithLabels

In [None]:
# Window the toy data with 5 look-back steps and report the resulting array shapes.
lookbackX, lookbackY = look_back_and_create_dataset(mvdataWithLabels, look_back=5)
print("Look back data shapes: lookbackX {} lookbackY {}".format(lookbackX.shape, lookbackY.shape))
print("Look back lookBackTsData ... \n")
# Uncomment to print every window with its target row (label column included):
# for i in range (len(lookbackX)):
#     print(lookbackX[i], lookbackX[i].shape, lookbackY[i], lookbackY[i].shape)

In [None]:
# Inputs are the look-back windows; targets are the rows that follow them, minus the label column.
X = lookbackX
Y = lookbackY[:, :-1] # exclude label
X.shape, Y.shape

In [None]:
# Print each window beside its target, with shapes, as a visual sanity check.
for window, target in zip(X, Y):
    print(window, window.shape, target, target.shape)

## NASA Statlog Shuttle multivariate

In [7]:
import sys
import os
import time
import glob
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def current_time_millis():
    """Return the current time.time() reading as a whole number of milliseconds."""
    return int(round(time.time() * 1000))

def label_outliers(nasa_df_row):
    """Return the anomaly flag for one row: 0 if its 'class' value is 1, else 1."""
    return 0 if nasa_df_row['class'] == 1 else 1
    
def cleanup(train_path="../Stochastic-Methods/data/nasa/shuttle.trn/shuttle.trn",
            test_path="../Stochastic-Methods/data/nasa/shuttle.tst"):
    """Load the shuttle train/test files and return one labelled, time-sorted frame.

    Steps:
      1. read both space-separated files with the column names
         time, a1..a8, class;
      2. concatenate them;
      3. drop every row whose class is 4;
      4. add an `is_anomaly` column: 0 where class == 1, 1 otherwise;
      5. sort the rows by `time`.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the two data files. The defaults are the paths the
        notebook has always used, so calling `cleanup()` is unchanged.

    Returns
    -------
    pandas.DataFrame
        The ten original columns plus `is_anomaly`, ordered by `time`.
        The index is not reset, so index values from the two files can repeat.
    """
    colnames = ['time', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'class']
    train_df = pd.read_csv(train_path, names=colnames, sep=" ")
    test_df = pd.read_csv(test_path, names=colnames, sep=" ")

    # merge train and test
    merged_df = pd.concat([train_df, test_df])

    # drop class = 4
    minus4_df = merged_df.loc[merged_df['class'] != 4]

    # mark class 1 as inlier (0) and everything else as outlier (1);
    # a vectorised comparison replaces a Python-level call per row
    is_anomaly_values = (minus4_df['class'] != 1).astype('int64').values
    labelled_df = minus4_df.assign(is_anomaly=is_anomaly_values)

    print("Unique classes after labelling outliers {}".format(np.unique(labelled_df['class'].values, return_counts=True)))
    print("Unique outliers after labelling outliers {}".format(np.unique(labelled_df['is_anomaly'].values, return_counts=True)))

    # sort by time
    sorted_df = labelled_df.sort_values('time')

    return sorted_df

def read_data_with_labels(df, timeVariantColumns, labelColumnNum):
    """Extract model-ready float arrays from an already loaded DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose values can all be cast to float64.
    timeVariantColumns : list of column names
        Columns to keep as time-series features.
    labelColumnNum : int
        Positional index, within the full frame, of the label column.

    Returns
    -------
    (tsDataWithLabels, data)
        tsDataWithLabels: the selected feature columns followed by the label
        as the last column. data: every column of `df` as float64.
    """
    full_matrix = df.values.astype('float64')
    feature_matrix = df[timeVariantColumns].values.astype('float64')
    # Pull the label out of the full matrix by position and keep it as a column vector.
    label_column = full_matrix[:, labelColumnNum].reshape((-1, 1))
    return np.concatenate((feature_matrix, label_column), axis=1), full_matrix

def scale(data):
    """Min-max scale `data` into the (0, 1) feature range.

    Returns a tuple of the fitted MinMaxScaler and the transformed data;
    the scaler is fitted on the very data it then transforms.
    """
    min_max = MinMaxScaler(feature_range=(0, 1))
    scaled = min_max.fit(data).transform(data)
    return min_max, scaled

"""
# input expected to be a 2D array with last column being label
# Returns the look-back X and Y arrays; the last column of the returned Y array is the label
# Only one step ahead prediction setting is expected.
"""

def look_back_and_create_dataset(tsDataWithLabels, look_back = 1):
    """Build sliding-window samples for one-step-ahead prediction.

    Parameters
    ----------
    tsDataWithLabels : array-like, 2D
        Rows in time order. The last column is treated as the label and is
        left out of the X windows.
    look_back : int
        Number of preceding rows that make up each window. Must not be negative.

    Returns
    -------
    (X, Y) : tuple of numpy arrays
        X has shape (n_rows - look_back, look_back, n_columns - 1): for each
        row i >= look_back, the rows i-look_back .. i-1 without the last column.
        Y has shape (n_rows - look_back, n_columns): the complete row i,
        so its last column is still the label.
        When there are not enough rows for a single window, both arrays are
        empty but keep these dimensions, so slicing such as Y[:, :-1] still works.

    Raises
    ------
    ValueError
        If look_back is negative.
    """
    # Accept lists / nested sequences as well as arrays.
    tsDataWithLabels = np.asarray(tsDataWithLabels)
    if look_back < 0:
        raise ValueError("look_back must be non-negative, got {}".format(look_back))

    features = tsDataWithLabels[:, :-1]
    n_rows = len(tsDataWithLabels)

    if n_rows <= look_back:
        # No complete window fits: return correctly shaped empty arrays rather
        # than 1-D empties that would break 2-D/3-D indexing in the caller.
        emptyX = np.empty((0, look_back, features.shape[1]), dtype=tsDataWithLabels.dtype)
        emptyY = np.empty((0, tsDataWithLabels.shape[1]), dtype=tsDataWithLabels.dtype)
        return emptyX, emptyY

    lookbackTsDataX = np.stack([features[i - look_back:i] for i in range(look_back, n_rows)])
    # The target for window i is simply row i, so one slice replaces the per-row appends.
    lookbackTsDataYAndLabel = tsDataWithLabels[look_back:].copy()
    return lookbackTsDataX, lookbackTsDataYAndLabel

def split_data_set(dataset, split=0.67):
    """Cut `dataset` along its first axis into a leading and a trailing part.

    The first int(len(dataset) * split) rows form the training part and the
    remaining rows form the test part; order is preserved (no shuffling).
    `dataset` must support two-axis indexing.
    """
    cut = int(len(dataset) * split)
    leading = dataset[:cut, :]
    trailing = dataset[cut:, :]
    return leading, trailing

def get_train_validation(Xtrain, Ytrain, validation_ratio=0.1):
    """Carve a validation set off the front of the training data.

    The first int(len(Xtrain) * validation_ratio) samples of X and Y become
    the validation set; everything after them stays as training data.

    Returns (Xtrain, Ytrain, Xvalid, Yvalid) in that order.
    """
    n_valid = int(len(Xtrain) * validation_ratio)
    x_fit, y_fit = Xtrain[n_valid:], Ytrain[n_valid:]
    x_val, y_val = Xtrain[:n_valid], Ytrain[:n_valid]
    return x_fit, y_fit, x_val, y_val

# Note here the slight change in how we stack the hidden LSTM layers - special for the last LSTM layer.
def baseline_model(input_shape, learning_rate):
    """Return a model-building function with `input_shape` and `learning_rate` bound as defaults.

    The returned `build_model` can be called with only the hyper-parameters
    being searched (`n_hidden`, `n_units`); the shape and learning rate given
    here are used unless explicitly overridden.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample. Element [1] is used as the number of
        output units of the final Dense layer.
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    """
    def build_model(input_shape=input_shape, n_hidden = 1, n_units = 50, learning_rate = learning_rate):
        """Build and compile a stack of `n_hidden` LSTM layers followed by a Dense output layer."""
        model = keras.models.Sequential()
        model.add(keras.layers.InputLayer(input_shape=input_shape))
        for _ in range(n_hidden - 1):
            # every LSTM layer except the last returns the full sequence so
            # the next LSTM layer receives one input per time step
            model.add(keras.layers.LSTM(n_units, return_sequences = True, activation = 'relu'))
        # the last LSTM layer emits only its final output, which feeds the Dense layer
        model.add(keras.layers.LSTM(n_units, activation = 'relu'))
        model.add(keras.layers.Dense(input_shape[1]))
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(loss="mse", optimizer=optimizer)
        return model
    return build_model

    

############## main #########################

# ---- configuration: everything a reader might want to tune ----
seed = 42
split = 0.8
look_back = 24
learning_rate = 0.001
n_iter = 1
cv=3
batch_size=32
early_stop_patience=3
epochs=10
verbosity=2
min_delta=0.0003

# Seed NumPy and TensorFlow so weight initialisation and the search sampling repeat across runs.
np.random.seed(seed)
tf.random.set_seed(seed)

param_distribs = {
    "n_hidden": np.arange(1, 2).tolist(), # upto 1 hidden layers
    "n_units": np.arange(5,6).tolist() # 5 hidden layer units/neurons
}


# ---- load and label the data ----
sorted_df = cleanup()

timeVariantColumns = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8']
# Position of the label column, looked up by name rather than hard-coded
# so it stays correct if the frame layout changes.
labelColumnNum = list(sorted_df.columns).index('is_anomaly')

# read data
tsDataWithLabels, data = read_data_with_labels(sorted_df, timeVariantColumns, labelColumnNum)
print("Shapes: time variant data array with labels {}, full data {}".format(tsDataWithLabels.shape, data.shape))
print("Unique outliers in full data array {}".format(np.unique(data[:, -1], return_counts=True)))
print("Unique outliers in time variant data array with labels {}".format(
    np.unique(tsDataWithLabels[:, -1], return_counts=True)))

# scale data
# NOTE(review): the scaler is fitted on the whole array, i.e. on rows that later
# end up in the test split and on the label column too. Fitting on the training
# rows only would avoid leaking test statistics - left unchanged here.
scaler, tsDataScaled = scale(tsDataWithLabels)

# Get look back data in the 3D array shape (n_samples, n_lookback_steps, n_features)
lookbackX, lookbackY = look_back_and_create_dataset(tsDataScaled, look_back=look_back)
print("Look back data shapes: lookbackX {} lookbackY {}".format(lookbackX.shape, lookbackY.shape))

# split into train/test, using the configured ratio for both X and Y so they stay aligned
Xtrain_full, Xtest = split_data_set(lookbackX, split=split)
Ytrain_full, Ytest = split_data_set(lookbackY[:, :-1], split=split)   # exclude labels

print("Shapes: Xtrain_full {}, Ytrain_full {}, Xtest {}, Ytest {}".format(Xtrain_full.shape, Ytrain_full.shape,
                                                                          Xtest.shape, Ytest.shape))

# split further full train set into train and validation set
Xtrain, Ytrain, Xvalid, Yvalid = get_train_validation(Xtrain_full, Ytrain_full, validation_ratio=0.1)

print("Shapes: Xtrain {}, Ytrain {}, Xvalid {}, Yvalid {}".format(Xtrain.shape, Ytrain.shape,
                                                                  Xvalid.shape, Yvalid.shape))


# ---- hyper-parameter search ----
input_shape = (Xtrain.shape[1], Xtrain.shape[2])
regressor = keras.wrappers.scikit_learn.KerasRegressor(build_fn = baseline_model(input_shape=input_shape,
                                                                         learning_rate=learning_rate))

early_stopping_cb = keras.callbacks.EarlyStopping(patience=early_stop_patience, monitor='val_loss',
                                                  min_delta=min_delta, restore_best_weights = True)

rnd_search_cv = RandomizedSearchCV(regressor, param_distribs, n_iter = n_iter, cv = cv, verbose = verbosity,
                                   random_state = seed)

start_millis = current_time_millis()
rnd_search_cv.fit(Xtrain, Ytrain, epochs=epochs, batch_size=batch_size, validation_data=(Xvalid, Yvalid),
                  callbacks=[early_stopping_cb], verbose=verbosity)
end_millis = current_time_millis()
print("Hyper-parameter search took {:.1f} s".format((end_millis - start_millis) / 1000.0))

# ---- evaluation ----
model = rnd_search_cv.best_estimator_.model
# best_score_ is negated before printing so the reported figure is a positive loss
print("Best parameters {} best score {}:".format(rnd_search_cv.best_params_, -rnd_search_cv.best_score_))

# NOTE(review): the "Train Score" is computed on Xtrain_full, which also contains the validation rows.
trainMSE = model.evaluate(Xtrain_full, Ytrain_full, verbose = verbosity)
print("Train Score: {0:.5f} MSE {1:.5f} RMSE".format(trainMSE, np.sqrt(trainMSE)))
testMSE = model.evaluate(Xtest, Ytest, verbose = verbosity)
print("Test Score: {0:.5f} MSE {1:.5f} RMSE".format(testMSE, np.sqrt(testMSE)))
        
        


Unique classes after labelling outliers (array([1, 2, 3, 5, 6, 7]), array([45586,    50,   171,  3267,    10,    13]))
Unique outliers after labelling outliers (array([0, 1]), array([45586,  3511]))
Shapes: time variant data array with labels (49097, 9), full data (49097, 11)
Unique outliers in full data array (array([0., 1.]), array([45586,  3511]))
Unique outliers in time variant data array with labels (array([0., 1.]), array([45586,  3511]))
Look back data shapes: lookbackX (49073, 24, 8) lookbackY (49073, 9)
Shapes: Xtrain_full (39258, 24, 8), Ytrain_full (39258, 8), Xtest (9815, 24, 8), Ytest (9815, 8)
Shapes: Xtrain (35333, 24, 8), Ytrain (35333, 8), Xvalid (3925, 24, 8), Yvalid (3925, 8)
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] n_units=5, n_hidden=1 ...........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Epoch 1/10
737/737 - 27s - loss: 0.0970 - val_loss: 0.0123
Epoch 2/10
737/737 - 28s - loss: 0.0021 - val_loss: 0.0024
Epoch 3/10
737/737 - 27s - loss: 7.4686e-04 - val_loss: 0.0022
Epoch 4/10
737/737 - 27s - loss: 7.4565e-04 - val_loss: 0.0022
Epoch 5/10
737/737 - 27s - loss: 7.4571e-04 - val_loss: 0.0022
[CV] ............................ n_units=5, n_hidden=1, total= 2.4min
[CV] n_units=5, n_hidden=1 ...........................................
Epoch 1/10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


737/737 - 26s - loss: 0.0237 - val_loss: 0.0020
Epoch 2/10
737/737 - 26s - loss: 0.0014 - val_loss: 0.0019
Epoch 3/10
737/737 - 26s - loss: 0.0013 - val_loss: 0.0017
Epoch 4/10
737/737 - 26s - loss: 0.0013 - val_loss: 0.0017
[CV] ............................ n_units=5, n_hidden=1, total= 1.8min
[CV] n_units=5, n_hidden=1 ...........................................
Epoch 1/10
737/737 - 27s - loss: 0.0494 - val_loss: 0.0018
Epoch 2/10
737/737 - 27s - loss: 0.0012 - val_loss: 0.0017
Epoch 3/10
737/737 - 28s - loss: 0.0012 - val_loss: 0.0017
Epoch 4/10
737/737 - 27s - loss: 0.0012 - val_loss: 0.0016
[CV] ............................ n_units=5, n_hidden=1, total= 1.9min
Epoch 1/10


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  6.1min finished


1105/1105 - 40s - loss: 0.0788 - val_loss: 0.0033
Epoch 2/10
1105/1105 - 40s - loss: 0.0013 - val_loss: 0.0019
Epoch 3/10
1105/1105 - 40s - loss: 0.0012 - val_loss: 0.0019
Epoch 4/10
1105/1105 - 40s - loss: 0.0012 - val_loss: 0.0019
Epoch 5/10
1105/1105 - 40s - loss: 0.0012 - val_loss: 0.0019
Best parameters {'n_units': 5, 'n_hidden': 1} best score 0.0014627903001382947:
1227/1227 - 11s - loss: 0.0012
Train Score: 0.00124 MSE 0.03522 RMSE
307/307 - 3s - loss: 0.0063
Test Score: 0.00625 MSE 0.07907 RMSE
