# Predicting anomalies on NASA data using an LSTM pipeline

This demo notebook shows how to build a pipeline that combines an LSTM regressor with
dynamic error thresholding primitives to detect anomalies in the NASA data.

In [1]:
import pandas as pd
from mlblocks import add_primitives_path, MLPipeline

from orion.data import load_nasa_signal

# Signal to analyse and the value forwarded to the loader as `test_size`.
signal_name = 'P-3'
test_fraction = 0.3

train, test = load_nasa_signal(signal_name, test_size=test_fraction)

In [2]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,timestamp,value
0,1222819200,-0.225807
1,1222840800,-0.152074
2,1222862400,-0.152074
3,1222884000,-0.161291
4,1222905600,-0.152074


In [3]:
# Preview the first five rows of the test split.
test.head(n=5)

Unnamed: 0,timestamp,value
11890,1479643200,-0.152074
11891,1479664800,-0.161291
11892,1479686400,-0.152074
11893,1479708000,-0.161291
11894,1479729600,-0.161291


In [4]:
# Pipeline configuration
#
# Every primitive name is written once and reused below, so the per-step
# settings always refer to exactly the names listed in `primitives`.

SEGMENTS_AVERAGE = 'mlprimitives.candidates.timeseries.time_segments_average'
WINDOW_SEQUENCES = 'mlprimitives.candidates.timeseries.rolling_window_sequences'
LSTM_REGRESSOR = 'keras.Sequential.LSTMTimeSeriesRegressor'
FORECAST_ERRORS = 'mlprimitives.candidates.timeseries_errors.get_forecast_errors'
EXTRACT_ANOMALIES = 'mlprimitives.candidates.timeseries_errors.extract_anomalies'

# Steps of the pipeline, in the order they are listed to MLPipeline.
primitives = [
    SEGMENTS_AVERAGE,
    WINDOW_SEQUENCES,
    LSTM_REGRESSOR,
    FORECAST_ERRORS,
    EXTRACT_ANOMALIES,
]

# The settings below are keyed by '<primitive name>#1'.
# NOTE(review): the '#1' suffix presumably selects the first occurrence of the
# primitive in the pipeline -- confirm against the MLBlocks documentation.

SIX_HOURS = 6 * 60 * 60    # 21600

# Column names shared by both time series preprocessing steps.
series_columns = {
    'value_column': 'value',
    'time_column': 'timestamp',
}

# Each step receives its own copy of the shared column settings.
init_params = {
    SEGMENTS_AVERAGE + '#1': dict(series_columns, interval=SIX_HOURS),
    WINDOW_SEQUENCES + '#1': dict(series_columns),
    EXTRACT_ANOMALIES + '#1': {'batch_size': 10},
}

# Argument name -> name of the pipeline variable it is read from.
input_names = {
    LSTM_REGRESSOR + '#1': {'y': 'y_true'},
    FORECAST_ERRORS + '#1': {'y_hat': 'y'},    # output from LSTM
    EXTRACT_ANOMALIES + '#1': {
        'smoothed_errors': 'moving_avg',    # output from get_forecast_errors
    },
}

# The windowing step's 'y' output is renamed to 'y_true', which is the name the
# regressor and (by default) the later steps are wired to above.
output_names = {
    WINDOW_SEQUENCES + '#1': {'y': 'y_true'},
}

In [5]:
# Assemble the pipeline from the step list and its per-step settings.
# Arguments are passed positionally, in this order: primitive names,
# init params, input name mapping, output name mapping.

pipeline = MLPipeline(
    primitives,
    init_params,
    input_names,
    output_names
)

Using TensorFlow backend.


In [6]:
# Fit and predict

# Fit the pipeline on the training split; the recorded output of this cell
# shows 10 training epochs being logged.
pipeline.fit(train)
# Run the fitted pipeline on the test split. The result is consumed later as
# two parallel sequences: (start, end) index pairs and one score per pair.
anomalies = pipeline.predict(test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# Convert the detected anomalies into a table of timestamps.
#
# `anomalies` unpacks into two parallel sequences: (start, end) index pairs
# and the score of each pair. Every index is looked up positionally in the
# test split to recover the timestamp found at that row.
#
# NOTE(review): this assumes the indices are row positions in `test`. The
# pipeline runs time_segments_average and rolling_window_sequences before
# scoring, so confirm the indices are not shifted relative to the raw rows.

# Select the column once and index it directly. Indexing a whole row first
# (`test.iloc[i]['timestamp']`) builds a mixed-dtype row for every lookup and
# pushes the integer timestamp through a float conversion.
timestamps = test['timestamp']

anomalies_data = [
    {
        'start': timestamps.iloc[start_idx],
        'end': timestamps.iloc[end_idx],
        'severity': score
    }
    for (start_idx, end_idx), score in zip(*anomalies)
]

adf = pd.DataFrame(anomalies_data, columns=['start', 'end', 'severity'])

# Keep the explicit integer cast so the columns have an integer dtype even
# when no anomalies were found and the frame is empty.
adf['start'] = adf['start'].astype(int)
adf['end'] = adf['end'].astype(int)

In [12]:
# Display the anomalies table: one row per detected interval, with the start
# and end timestamps looked up in test['timestamp'] and the severity score.
adf

Unnamed: 0,start,end,severity
0,1502020800,1502560800,0.1826
1,1515132000,1515650400,0.193583
2,1518480000,1518955200,0.184643
3,1521806400,1522648800,2.145279
4,1523016000,1523966400,2.779098
5,1524031200,1524333600,2.7097
6,1530273600,1530727200,2.678006
7,1530878400,1531353600,3.106141
8,1532390400,1532779200,3.295882
9,1536904800,1537358400,3.388191
