In [None]:
# Query the NVIDIA driver tool to see which GPU (if any) is attached to this session.
!nvidia-smi

In [None]:
!pip install tensorflow-gpu

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, Dropout, TimeDistributed
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

import os
# Print the full path of every file available under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)


In [None]:
# Load spx.csv; the 'date' column is parsed as datetimes and becomes the index.
df = pd.read_csv("/kaggle/input/sp500-daily-19862018/spx.csv", parse_dates=['date'], index_col='date')

In [None]:
df.plot(figsize=(14,8))
plt.show()

In [None]:
df.info()

In [None]:
df.describe()

### Data Preparation

In [None]:
### Using 95% as training data

# We'll look back 30 days of historical data to learn past trend.
TIMESTEPS = 30

# Sort chronologically BEFORE splitting: with shuffle=False the split is purely
# positional, so sorting each half afterwards could not repair a split made on
# out-of-order rows. sort_index() returns a new frame, so `df` is left untouched.
# shuffle=False retains the time series: the first 95% of rows form the training
# set and the remaining 5% form the test set.
train_data, test_data = train_test_split(df.sort_index(), train_size=0.95, shuffle=False)
train_data.shape, test_data.shape

In [None]:
train_data

In [None]:
train_data.info()

In [None]:
test_data.info()

In [None]:
def getScaledData(method='standard', train_df=None, test_df=None, feature_col='feature'):
    """Scale one column of the train/test frames with a scaler fitted on the training data.

    The scaler is fitted on ``train_df[feature_col]`` only and then applied to
    both frames, so the test data never influences the scaling parameters.

    Parameters
    ----------
    method : str
        ``'standard'`` selects ``StandardScaler``; any other value selects
        ``MinMaxScaler``.
    train_df, test_df : pandas.DataFrame
        Frames containing ``feature_col``. They are not modified; the scaled
        column is added to copies.
    feature_col : str
        Name of the column to scale.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled, scaler)`` where the two frames are copies
        of the inputs with an extra ``'scaled_' + feature_col`` column and
        ``scaler`` is the fitted scaler instance.
    """
    scaler = StandardScaler() if method == 'standard' else MinMaxScaler()
    scaler = scaler.fit(train_df[[feature_col]])

    scaled_col = 'scaled_' + feature_col
    # Work on copies so the caller's frames (often slices of a larger frame)
    # are not mutated and no SettingWithCopy issues arise.
    train_scaled = train_df.copy()
    test_scaled = test_df.copy()
    train_scaled[scaled_col] = scaler.transform(train_df[[feature_col]])
    test_scaled[scaled_col] = scaler.transform(test_df[[feature_col]])
    return train_scaled, test_scaled, scaler
    
def createDataset(df, lookback=30, feature_col=None):
    """Build sliding-window samples from one column of a DataFrame.

    For every row position ``i >= lookback`` one sample is produced: the input
    is the ``lookback`` values immediately preceding row ``i`` and the target
    is the value at row ``i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature_col``; rows are used in their existing order.
    lookback : int
        Number of past rows in each input window (must be >= 0).
    feature_col : str
        Name of the column to window.

    Returns
    -------
    data_x : numpy.ndarray
        Shape ``(n_samples, lookback, 1)``.
    data_y : numpy.ndarray
        Shape ``(n_samples,)``.
        ``n_samples = max(len(df) - lookback, 0)``; when the frame is too short
        the arrays are empty but keep the shapes above.

    Raises
    ------
    ValueError
        If ``lookback`` is negative.
    """
    if lookback < 0:
        raise ValueError("lookback must be non-negative")

    # Pull the column out once instead of slicing the DataFrame on every step.
    values = np.asarray(df[feature_col].values)
    n_samples = max(len(values) - lookback, 0)

    # Row k of window_idx holds the positions k, k+1, ..., k+lookback-1.
    window_idx = np.arange(n_samples)[:, None] + np.arange(lookback)[None, :]
    data_x = values[window_idx][:, :, np.newaxis]
    data_y = values[lookback:lookback + n_samples].copy()
    return data_x, data_y

We fit a single scaler on the training data only and apply that same scaler to the training (& validation) set and to the test set, so that no information from the test period leaks into the scaling.
We are assuming that the data used in training is normal, with no anomalies, and hence we fit the scaler on the training dataset and extract a subset of it as the validation set. Since the validation data is also assumed to be normal, it is used for validation during the training process.

In [None]:
train_df, test_df, scaler = getScaledData('standard', train_data, test_data, 'close')
train_df.shape, test_df.shape

In [None]:
train_df['scaled_close'].plot(figsize=(14,8))
plt.show()

In [None]:
train_x, train_y = createDataset(train_df, TIMESTEPS, 'scaled_close')
test_x, test_y = createDataset(test_df, TIMESTEPS, 'scaled_close')

In [None]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

The LSTM autoencoder will get train_x as input and will return an output with the same shape that will be compared with this input.

### Model configuration & training

Each LSTM cell has an internal state called the cell state and an output called the hidden state.
We set `return_sequences` to true to return the hidden state for each timestep. This is needed when
stacking multiple LSTM layers, where each LSTM layer receives a 3-dimensional input sequence, or when returning a sequence of outputs.
We'll use a `TimeDistributed` layer to apply the dense layer at every timestep so that the model returns an output sequence.

We use RepeatVector to repeat the vector output returned by the last layer of the encoder LSTM. This vector is repeated TIMESTEPS times, since the first layer in the decoder (decoder_lstm) requires a 3-D input sequence built from the compressed representation.


In [None]:

# Seed TensorFlow so weight initialisation and dropout masks are repeatable
# across "Restart & Run All" (GPU kernels may still add small nondeterminism).
SEED = 42
tf.random.set_seed(SEED)

LSTM_units = 64
n_timesteps, n_features = train_x.shape[1], train_x.shape[2]

model = keras.Sequential([
    # Encoder: return_sequences=False keeps only the final hidden state, i.e.
    # one LSTM_units-sized vector per input window.
    LSTM(LSTM_units, input_shape=(n_timesteps, n_features), return_sequences=False, name='encoder_lstm'),
    Dropout(0.2, name='encoder_dropout'),
    # Repeat the encoded vector once per timestep so the decoder LSTM gets a 3-D input.
    RepeatVector(n_timesteps, name='decoder_repeater'),
    # Decoder: return_sequences=True emits a hidden state for every timestep.
    LSTM(LSTM_units, return_sequences=True, name='decoder_lstm'),
    Dropout(rate=0.2, name='decoder_dropout'),
    # Project every timestep's hidden state back to the input feature dimension.
    TimeDistributed(Dense(n_features, name='decoder_dense_output')),
])

# Mean absolute error as the training loss.
model.compile(loss='mae', optimizer='adam')

In [None]:
model.summary()

In [None]:
# Autoencoder training: the input windows are also the targets (train_x, train_x).
# shuffle=False keeps the windows in their original order, and validation_split=0.1
# reserves 10% of them for validation. %time reports how long the fit takes.
%time history = model.fit(train_x, train_x, epochs=10, batch_size=32, validation_split=0.1, shuffle=False)

In [None]:
# Training vs. validation loss per epoch; the model is compiled with loss='mae'.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='training_loss')
ax.plot(history.history['val_loss'], label='validation_loss')
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
ax.set(title='Autoencoder reconstruction loss', xlabel='Epoch', ylabel='MAE loss')
ax.legend()
plt.show()

### Finding Anomalies

Plotting the distribution of the reconstruction error on the training set in order to choose a threshold; any input window whose reconstruction error exceeds this threshold will be labelled as an anomaly.

In [None]:
reconstructed = model.predict(train_x)
reconstructed.shape, train_x.shape

In [None]:
# Reconstruction error - MAE for each sample

mae_loss = np.mean(np.abs(reconstructed - train_x), axis=1)
mae_loss.shape

In [None]:
# Distribution of the per-window training reconstruction error (column 0 of mae_loss).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(mae_loss[:, 0], kde=True) once the minimum seaborn version is confirmed.
sns.distplot(mae_loss[:,0])
plt.show()

Setting a threshold to label anomalies

In [None]:
THRESHOLD = 0.65

In [None]:
test_reconstruction = model.predict(test_x)
test_reconstruction.shape

In [None]:
# MAE for reconstruction on test data
test_mae_loss = np.mean(np.abs(test_x - test_reconstruction), axis=1)
test_mae_loss.shape

In [None]:
test_df.info()

### Observing the anomalies

In [None]:
# The first TIMESTEPS rows of test_df have no reconstruction error of their own
# (they only serve as history for later rows), so results start after them.
# The slice keeps the original date index.
anomaly_results_df = test_df.iloc[TIMESTEPS:][['close', 'scaled_close']].copy()

# Attach the reconstruction error and flag rows whose error exceeds the threshold.
anomaly_results_df['deviation'] = test_mae_loss
anomaly_results_df['threshold'] = THRESHOLD
anomaly_results_df['anomaly'] = (anomaly_results_df['deviation'] > THRESHOLD).astype('int64')

# Keep only the flagged rows for plotting and inspection.
anomalies = anomaly_results_df.loc[anomaly_results_df['anomaly'].eq(1)]
anomalies.shape

In [None]:
anomaly_results_df['anomaly'].plot(kind='hist')
plt.show()

In [None]:
anomaly_results_df[['deviation', 'threshold']].plot(figsize=(14, 6))
plt.show()

In [None]:
# Closing price over the test period with the flagged anomalies overlaid in red.
ax = anomaly_results_df[['close']].plot(figsize=(14, 6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# ax=ax draws the markers on the price axes explicitly.
sns.scatterplot(x=anomalies.index, y=anomalies['close'], label='anomaly', color='red', ax=ax)
plt.show()

#### References

* https://machinelearningmastery.com/return-sequences-and-return-states-for-lstms-in-keras/
* https://www.curiousily.com/posts/anomaly-detection-in-time-series-with-lstms-using-keras-in-python/
* https://towardsdatascience.com/step-by-step-understanding-lstm-autoencoder-layers-ffab055b6352