In [1]:
import sys
import os
import time
import glob
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Record the interpreter and library versions this notebook was executed with.
print('python version', sys.version_info)
print('tf version', tf.__version__, 'keras version', keras.__version__)

python version sys.version_info(major=3, minor=8, micro=5, releaselevel='final', serial=0)
tf version 2.2.0 keras version 2.3.0-tf


In [3]:
%matplotlib inline
# Default font sizes for axis labels and tick labels in every figure below.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Get look back dataset 

In [4]:
# input expected to be a 2D array with last column being label
# Returns looked back X and Y; last column in look back Y data returned is label
# Only one step ahead prediction setting is expected.

def look_back_and_create_dataset(tsDataWithLabels, look_back = 1):
    """Turn a labelled 2D series into one-step-ahead look-back samples.

    Args:
        tsDataWithLabels: 2D array; the last column is treated as the label
            and is excluded from the input windows.
        look_back: number of consecutive rows in each input window.

    Returns:
        (X, Y) as numpy arrays. X[k] holds the `look_back` rows preceding
        target row k with the last column removed; Y[k] is the complete
        target row, last (label) column included.
    """
    target_rows = range(look_back, len(tsDataWithLabels))
    # Window for a target row = the look_back rows just before it, label dropped.
    windows = [tsDataWithLabels[row - look_back:row, :-1] for row in target_rows]
    targets = [tsDataWithLabels[row] for row in target_rows]
    return np.array(windows), np.array(targets)

## multivariate

In [5]:
# Two toy input series (10..150 and 15..155, step 10) stored as column vectors.
in_seq1 = np.arange(10, 151, 10).reshape(-1, 1)
in_seq2 = np.arange(15, 156, 10).reshape(-1, 1)
# Third feature is the element-wise sum of the two inputs.
out_seq = in_seq1 + in_seq2
# Alternating 0/1 labels starting with 0, one per time step.
labels = (np.arange(len(in_seq1)) % 2).reshape(-1, 1)
# Columns: in_seq1, in_seq2, out_seq, label.
mvdataWithLabels = np.concatenate([in_seq1, in_seq2, out_seq, labels], axis=1)

In [6]:
# Display the stacked array's shape together with its contents.
mvdataWithLabels.shape, mvdataWithLabels

((15, 4),
 array([[ 10,  15,  25,   0],
        [ 20,  25,  45,   1],
        [ 30,  35,  65,   0],
        [ 40,  45,  85,   1],
        [ 50,  55, 105,   0],
        [ 60,  65, 125,   1],
        [ 70,  75, 145,   0],
        [ 80,  85, 165,   1],
        [ 90,  95, 185,   0],
        [100, 105, 205,   1],
        [110, 115, 225,   0],
        [120, 125, 245,   1],
        [130, 135, 265,   0],
        [140, 145, 285,   1],
        [150, 155, 305,   0]]))

In [7]:
# Build 5-step look-back windows over the toy series; lookbackY keeps the label column.
lookbackX, lookbackY = look_back_and_create_dataset(mvdataWithLabels, look_back=5)
print("Look back data shapes: lookbackX {} lookbackY {}".format(lookbackX.shape, lookbackY.shape))
print("Look back lookBackTsData ... \n")
# for i in range (len(lookbackX)):
#     print(lookbackX[i], lookbackX[i].shape, lookbackY[i], lookbackY[i].shape)

Look back data shapes: lookbackX (10, 5, 3) lookbackY (10, 4)
Look back lookBackTsData ... 



In [8]:
# Inputs are the look-back windows; targets are the next-step rows without the label.
X = lookbackX
Y = lookbackY[:, :-1] # exclude label
X.shape, Y.shape

((10, 5, 3), (10, 3))

In [9]:
# Print every (window, target) pair with its shape to eyeball the alignment.
for i in range(len(X)):
    print(X[i], X[i].shape, Y[i], Y[i].shape)

[[ 10  15  25]
 [ 20  25  45]
 [ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]] (5, 3) [ 60  65 125] (3,)
[[ 20  25  45]
 [ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]] (5, 3) [ 70  75 145] (3,)
[[ 30  35  65]
 [ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]] (5, 3) [ 80  85 165] (3,)
[[ 40  45  85]
 [ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]] (5, 3) [ 90  95 185] (3,)
[[ 50  55 105]
 [ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]] (5, 3) [100 105 205] (3,)
[[ 60  65 125]
 [ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]
 [100 105 205]] (5, 3) [110 115 225] (3,)
[[ 70  75 145]
 [ 80  85 165]
 [ 90  95 185]
 [100 105 205]
 [110 115 225]] (5, 3) [120 125 245] (3,)
[[ 80  85 165]
 [ 90  95 185]
 [100 105 205]
 [110 115 225]
 [120 125 245]] (5, 3) [130 135 265] (3,)
[[ 90  95 185]
 [100 105 205]
 [110 115 225]
 [120 125 245]
 [130 135 265]] (5, 3) [140 145 285] (3,)
[[100 105 205]
 [110 115 225]
 [120 125 245]
 [130 135 265]
 [140 145 285]] (5, 3)

## NASA Statlog Shuttle multivariate

In [11]:
%%writefile Shuttle-LSTM.py

import sys
import os
import time
import glob
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def label_outliers(nasa_df_row):
    """Map one row to an anomaly flag: 0 when its 'class' is 1, otherwise 1.

    Args:
        nasa_df_row: any mapping-like row that can be indexed with 'class'.

    Returns:
        int: 0 for class 1 (inlier), 1 for every other class (outlier).
    """
    is_inlier = nasa_df_row['class'] == 1
    return 0 if is_inlier else 1
    
def cleanup(train_path="../Stochastic-Methods/data/nasa/shuttle.trn/shuttle.trn",
            test_path="../Stochastic-Methods/data/nasa/shuttle.tst"):
    """Load the shuttle train/test files and return one labelled, time-sorted frame.

    Steps: read both space-separated files, concatenate them, drop rows whose
    'class' is 4, add an 'is_anomaly' column (0 for class 1, 1 for any other
    class), print the class and anomaly counts, and sort by 'time'.

    Args:
        train_path: path of the training file; defaults to the original location.
        test_path: path of the test file; defaults to the original location.

    Returns:
        pandas.DataFrame with columns time, a1..a8, class, is_anomaly, sorted
        by 'time'. The index is the one produced by the concatenation (it is
        not reset).
    """
    colnames =['time', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'class']
    train_df = pd.read_csv(train_path, names=colnames, sep=" ")
    test_df = pd.read_csv(test_path, names=colnames, sep=" ")

    # merge train and test
    merged_df = pd.concat([train_df, test_df])

    # drop class = 4
    minus4_df = merged_df.loc[merged_df['class'] != 4]

    # mark class 1 as inlier (0) and rest as outlier (1).
    # Vectorised on the raw values: avoids a Python call per row and, unlike
    # DataFrame.apply(axis=1), still yields a 1-D column when the frame is empty.
    is_anomaly = (minus4_df['class'].values != 1).astype('int64')
    labelled_df = minus4_df.assign(is_anomaly=is_anomaly)

    print("Unique classes after labelling outliers {}".format(np.unique(labelled_df['class'].values, return_counts=True)))
    print("Unique outliers after labelling outliers {}".format(np.unique(labelled_df['is_anomaly'].values, return_counts=True)))

    # sort by time
    sorted_df = labelled_df.sort_values('time')

    return sorted_df

def read_data_with_labels(df, timeVariantColumns, labelColumnNum):
    """Extract the feature columns plus the label column as float64 arrays.

    Args:
        df: DataFrame whose values can all be cast to float64.
        timeVariantColumns: names of the feature columns to keep.
        labelColumnNum: positional index of the label column within `df`.

    Returns:
        (tsDataWithLabels, data): the selected features with the label
        appended as the last column, and the full frame as a float64 array.
    """
    full_array = df.values.astype('float64')
    feature_values = df[timeVariantColumns].values.astype('float64')
    # column_stack promotes the 1-D label vector to a single trailing column.
    features_and_label = np.column_stack((feature_values, full_array[:, labelColumnNum]))
    return features_and_label, full_array

def scale(data):
    """Min-max scale every column of `data` into the range [0, 1].

    Returns:
        (scaler, scaled): the fitted MinMaxScaler and the transformed data.
    """
    scaler = MinMaxScaler(feature_range=(0,1))
    # fit_transform fits on `data` and transforms that same data in one call.
    scaled = scaler.fit_transform(data)
    return scaler, scaled

"""
# input expected to be a 2D array with last column being label
# Returns looked back X and Y; last column in look back Y data returned is label
# Only one step ahead prediction setting is expected.
"""

def look_back_and_create_dataset(tsDataWithLabels, look_back = 1):
    """Turn a labelled 2D series into one-step-ahead look-back samples.

    Args:
        tsDataWithLabels: 2D array; the last column is treated as the label
            and is excluded from the input windows.
        look_back: number of consecutive rows in each input window.

    Returns:
        (X, Y) as numpy arrays. X[k] holds the `look_back` rows preceding
        target row k with the last column removed; Y[k] is the complete
        target row, last (label) column included.
    """
    target_rows = range(look_back, len(tsDataWithLabels))
    # Window for a target row = the look_back rows just before it, label dropped.
    windows = [tsDataWithLabels[row - look_back:row, :-1] for row in target_rows]
    targets = [tsDataWithLabels[row] for row in target_rows]
    return np.array(windows), np.array(targets)

def split_data_set(dataset, split=0.67):
    """Split `dataset` along its first axis into leading and trailing parts.

    Args:
        dataset: array with at least two dimensions.
        split: fraction of rows assigned to the first (train) part; the row
            count is truncated to an integer.

    Returns:
        (train, test): the first int(len(dataset) * split) rows, and the rest.
    """
    cut = int(len(dataset) * split)
    leading = dataset[:cut, :]
    trailing = dataset[cut:, :]
    return leading, trailing

def get_train_validation(Xtrain, Ytrain, validation_ratio=0.1):
    """Carve a validation set off the front of the training data.

    The first int(len(Xtrain) * validation_ratio) samples become the
    validation set; the remaining samples stay as training data. The same
    cut point (computed from Xtrain) is applied to Ytrain.

    Returns:
        (Xtrain, Ytrain, Xvalid, Yvalid)
    """
    n_valid = int(len(Xtrain) * validation_ratio)
    head = slice(None, n_valid)
    tail = slice(n_valid, None)
    return Xtrain[tail], Ytrain[tail], Xtrain[head], Ytrain[head]

# Note here the slight change in how we stack the hidden LSTM layers - special for the last LSTM layer.
def baseline_model(input_shape, learning_rate):
    """Return a builder function for a stacked-LSTM regression model.

    Args:
        input_shape: default input shape handed to the InputLayer.
        learning_rate: default learning rate for the Adam optimizer.

    Returns:
        build_model(input_shape, n_hidden, n_units, learning_rate): a callable
        whose defaults are bound to the arguments above. It builds and compiles
        (loss "mse", Adam) a Sequential model made of `n_hidden - 1` LSTM layers
        that return sequences, one final LSTM layer, and a Dense(1) output.
    """
    def build_model(input_shape=input_shape, n_hidden = 1, n_units = 50, learning_rate = learning_rate):
        model = keras.models.Sequential()
        model.add(keras.layers.InputLayer(input_shape=input_shape))
        for _ in range(n_hidden - 1):
            # return sequence = true for all layers except last layer, so the
            # next LSTM layer receives a full sequence
            model.add(keras.layers.LSTM(n_units, return_sequences = True, activation = 'relu'))
        model.add(keras.layers.LSTM(n_units, activation = 'relu'))
        model.add(keras.layers.Dense(1))
        # `lr` is a deprecated alias that newer Keras releases reject;
        # `learning_rate` is the supported keyword.
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(loss="mse", optimizer=optimizer)
        return model
    return build_model

    

############## main #########################

# Load, label and time-sort the shuttle data (reads the two data files from disk).
sorted_df = cleanup()

# Feature columns that go into the look-back windows; 'time' and 'class' are left out.
timeVariantColumns = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8']
# Positional index of the label column in sorted_df: cleanup() reads 10 named
# columns and appends 'is_anomaly', which therefore sits at index 10.
# NOTE(review): this breaks silently if cleanup()'s column layout changes.
labelColumnNum = 10

# read data
tsDataWithLabels, data = read_data_with_labels(sorted_df, timeVariantColumns, labelColumnNum)
print("Shapes: time variant data array with labels {}, full data {}".format(tsDataWithLabels.shape, data.shape))
print("Unique outliers in full data array {}".format(np.unique(data[:, -1], return_counts=True)))
print("Unique outliers in time variant data array with labels {}".format(np.unique(tsDataWithLabels[:, -1], 
                                                                                   return_counts=True)))

# print(tsDataWithLabels)

# scale data
# The label column is passed through the scaler together with the features.
# NOTE(review): the scaler is fit on the whole series here, before any
# train/test split visible in this cell -- confirm that is intended.
scaler, tsDataScaled = scale(tsDataWithLabels)

look_back=24
# Get look back data in the 3D array shape (n_samples, n_lookback_steps, n_features)
lookbackX, lookbackY = look_back_and_create_dataset(tsDataScaled, look_back=look_back)
print("Look back data shapes: lookbackX {} lookbackY {}".format(lookbackX.shape, lookbackY.shape))




Overwriting Shuttle-LSTM.py


In [14]:
# %load Shuttle-LSTM.py

import sys
import os
import time
import glob
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def label_outliers(nasa_df_row):
    """Map one row to an anomaly flag: 0 when its 'class' is 1, otherwise 1.

    Args:
        nasa_df_row: any mapping-like row that can be indexed with 'class'.

    Returns:
        int: 0 for class 1 (inlier), 1 for every other class (outlier).
    """
    is_inlier = nasa_df_row['class'] == 1
    return 0 if is_inlier else 1
    
def cleanup(train_path="../Stochastic-Methods/data/nasa/shuttle.trn/shuttle.trn",
            test_path="../Stochastic-Methods/data/nasa/shuttle.tst"):
    """Load the shuttle train/test files and return one labelled, time-sorted frame.

    Steps: read both space-separated files, concatenate them, drop rows whose
    'class' is 4, add an 'is_anomaly' column (0 for class 1, 1 for any other
    class), print the class and anomaly counts, and sort by 'time'.

    Args:
        train_path: path of the training file; defaults to the original location.
        test_path: path of the test file; defaults to the original location.

    Returns:
        pandas.DataFrame with columns time, a1..a8, class, is_anomaly, sorted
        by 'time'. The index is the one produced by the concatenation (it is
        not reset).
    """
    colnames =['time', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8', 'class']
    train_df = pd.read_csv(train_path, names=colnames, sep=" ")
    test_df = pd.read_csv(test_path, names=colnames, sep=" ")

    # merge train and test
    merged_df = pd.concat([train_df, test_df])

    # drop class = 4
    minus4_df = merged_df.loc[merged_df['class'] != 4]

    # mark class 1 as inlier (0) and rest as outlier (1).
    # Vectorised on the raw values: avoids a Python call per row and, unlike
    # DataFrame.apply(axis=1), still yields a 1-D column when the frame is empty.
    is_anomaly = (minus4_df['class'].values != 1).astype('int64')
    labelled_df = minus4_df.assign(is_anomaly=is_anomaly)

    print("Unique classes after labelling outliers {}".format(np.unique(labelled_df['class'].values, return_counts=True)))
    print("Unique outliers after labelling outliers {}".format(np.unique(labelled_df['is_anomaly'].values, return_counts=True)))

    # sort by time
    sorted_df = labelled_df.sort_values('time')

    return sorted_df

def read_data_with_labels(df, timeVariantColumns, labelColumnNum):
    """Extract the feature columns plus the label column as float64 arrays.

    Args:
        df: DataFrame whose values can all be cast to float64.
        timeVariantColumns: names of the feature columns to keep.
        labelColumnNum: positional index of the label column within `df`.

    Returns:
        (tsDataWithLabels, data): the selected features with the label
        appended as the last column, and the full frame as a float64 array.
    """
    full_array = df.values.astype('float64')
    feature_values = df[timeVariantColumns].values.astype('float64')
    # column_stack promotes the 1-D label vector to a single trailing column.
    features_and_label = np.column_stack((feature_values, full_array[:, labelColumnNum]))
    return features_and_label, full_array

def scale(data):
    """Min-max scale every column of `data` into the range [0, 1].

    Returns:
        (scaler, scaled): the fitted MinMaxScaler and the transformed data.
    """
    scaler = MinMaxScaler(feature_range=(0,1))
    # fit_transform fits on `data` and transforms that same data in one call.
    scaled = scaler.fit_transform(data)
    return scaler, scaled

"""
# input expected to be a 2D array with last column being label
# Returns looked back X and Y; last column in look back Y data returned is label
# Only one step ahead prediction setting is expected.
"""

def look_back_and_create_dataset(tsDataWithLabels, look_back = 1):
    """Turn a labelled 2D series into one-step-ahead look-back samples.

    Args:
        tsDataWithLabels: 2D array; the last column is treated as the label
            and is excluded from the input windows.
        look_back: number of consecutive rows in each input window.

    Returns:
        (X, Y) as numpy arrays. X[k] holds the `look_back` rows preceding
        target row k with the last column removed; Y[k] is the complete
        target row, last (label) column included.
    """
    target_rows = range(look_back, len(tsDataWithLabels))
    # Window for a target row = the look_back rows just before it, label dropped.
    windows = [tsDataWithLabels[row - look_back:row, :-1] for row in target_rows]
    targets = [tsDataWithLabels[row] for row in target_rows]
    return np.array(windows), np.array(targets)

def split_data_set(dataset, split=0.67):
    """Split `dataset` along its first axis into leading and trailing parts.

    Args:
        dataset: array with at least two dimensions.
        split: fraction of rows assigned to the first (train) part; the row
            count is truncated to an integer.

    Returns:
        (train, test): the first int(len(dataset) * split) rows, and the rest.
    """
    cut = int(len(dataset) * split)
    leading = dataset[:cut, :]
    trailing = dataset[cut:, :]
    return leading, trailing

def get_train_validation(Xtrain, Ytrain, validation_ratio=0.1):
    """Carve a validation set off the front of the training data.

    The first int(len(Xtrain) * validation_ratio) samples become the
    validation set; the remaining samples stay as training data. The same
    cut point (computed from Xtrain) is applied to Ytrain.

    Returns:
        (Xtrain, Ytrain, Xvalid, Yvalid)
    """
    n_valid = int(len(Xtrain) * validation_ratio)
    head = slice(None, n_valid)
    tail = slice(n_valid, None)
    return Xtrain[tail], Ytrain[tail], Xtrain[head], Ytrain[head]

# Note here the slight change in how we stack the hidden LSTM layers - special for the last LSTM layer.
def baseline_model(input_shape, learning_rate):
    """Return a builder function for a stacked-LSTM regression model.

    Args:
        input_shape: default input shape handed to the InputLayer.
        learning_rate: default learning rate for the Adam optimizer.

    Returns:
        build_model(input_shape, n_hidden, n_units, learning_rate): a callable
        whose defaults are bound to the arguments above. It builds and compiles
        (loss "mse", Adam) a Sequential model made of `n_hidden - 1` LSTM layers
        that return sequences, one final LSTM layer, and a Dense(1) output.
    """
    def build_model(input_shape=input_shape, n_hidden = 1, n_units = 50, learning_rate = learning_rate):
        model = keras.models.Sequential()
        model.add(keras.layers.InputLayer(input_shape=input_shape))
        for _ in range(n_hidden - 1):
            # return sequence = true for all layers except last layer, so the
            # next LSTM layer receives a full sequence
            model.add(keras.layers.LSTM(n_units, return_sequences = True, activation = 'relu'))
        model.add(keras.layers.LSTM(n_units, activation = 'relu'))
        model.add(keras.layers.Dense(1))
        # `lr` is a deprecated alias that newer Keras releases reject;
        # `learning_rate` is the supported keyword.
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(loss="mse", optimizer=optimizer)
        return model
    return build_model

    

############## main #########################

# Load, label and time-sort the shuttle data (reads the two data files from disk).
sorted_df = cleanup()

# Feature columns that go into the look-back windows; 'time' and 'class' are left out.
timeVariantColumns = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'a8']
# Positional index of the label column in sorted_df: cleanup() reads 10 named
# columns and appends 'is_anomaly', which therefore sits at index 10.
# NOTE(review): this breaks silently if cleanup()'s column layout changes.
labelColumnNum = 10

# read data
tsDataWithLabels, data = read_data_with_labels(sorted_df, timeVariantColumns, labelColumnNum)
print("Shapes: time variant data array with labels {}, full data {}".format(tsDataWithLabels.shape, data.shape))
print("Unique outliers in full data array {}".format(np.unique(data[:, -1], return_counts=True)))
print("Unique outliers in time variant data array with labels {}".format(np.unique(tsDataWithLabels[:, -1], 
                                                                                   return_counts=True)))

# print(tsDataWithLabels)

# scale data
# The label column is passed through the scaler together with the features.
# NOTE(review): the scaler is fit on the whole series here, before any
# train/test split visible in this cell -- confirm that is intended.
scaler, tsDataScaled = scale(tsDataWithLabels)

look_back=24
# Get look back data in the 3D array shape (n_samples, n_lookback_steps, n_features)
lookbackX, lookbackY = look_back_and_create_dataset(tsDataScaled, look_back=look_back)
print("Look back data shapes: lookbackX {} lookbackY {}".format(lookbackX.shape, lookbackY.shape))




Unique classes after labelling outliers (array([1, 2, 3, 5, 6, 7]), array([45586,    50,   171,  3267,    10,    13]))
Unique outliers after labelling outliers (array([0, 1]), array([45586,  3511]))
Shapes: time variant data array with labels (49097, 9), full data (49097, 11)
Unique outliers in full data array (array([0., 1.]), array([45586,  3511]))
Unique outliers in time variant data array with labels (array([0., 1.]), array([45586,  3511]))
Look back data shapes: lookbackX (49073, 24, 8) lookbackY (49073, 9)
