In [None]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
import seaborn as sns
sns.set(color_codes=True)
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

from numpy.random import seed
import tensorflow
from keras.layers import Input, Dropout
from keras.layers.core import Dense 
from keras.models import Model, Sequential, load_model
from keras import regularizers
from keras.models import model_from_json

In [None]:
import os

# Walk the dataset input folder and echo the full path of every file below it,
# so the available files are visible before one of them is loaded.
input_root = '../input/nasa-bearing-dataset-aggregated-sets-no-1-2-3'
for folder, _subdirs, names in os.walk(input_root):
    found = [os.path.join(folder, name) for name in names]
    if found:
        print("\n".join(found))

# Dataset preprocessing

In [None]:
# Load the merged Bearing Test 1 CSV; the first column of the file becomes the row index
csv_path = "../input/nasa-bearing-dataset-aggregated-sets-no-1-2-3/merged_dataset_BearingTest_1.csv"
dataset = pd.read_csv(csv_path, index_col=0)
dataset.head()

# Normalize data

In [None]:
from sklearn import preprocessing

# Decide on what normalizer function to use
## https://www.geeksforgeeks.org/standardscaler-minmaxscaler-and-robustscaler-techniques-ml
# MinMaxScaler with its default arguments rescales each column independently onto [0, 1]
# (column minimum -> 0, column maximum -> 1); this also holds when the raw values are negative.
scaler = preprocessing.MinMaxScaler()
# Alternative: StandardScaler centres each column at mean 0 and scales it to unit variance.
#scaler = preprocessing.StandardScaler()

# If you needed to operate in the whole dataset, you could apply normalization to the full time series
#X_all = scaler.fit_transform(dataset)
#X_all = pd.DataFrame(dataset)
#X_all.columns = dataset.columns

# NOTE(review): the scaler is fitted on the complete series, i.e. before the train/test
# split performed further down, so the min/max used here also reflect the later part of
# the data that ends up in the test set.
# Scale every column and wrap the result back into a DataFrame that keeps the
# original column names and index; `describe()` is shown to check the resulting ranges.
dataset_scaled = pd.DataFrame(scaler.fit_transform(dataset), 
                              columns=dataset.columns, 
                              index=dataset.index)
dataset_scaled.describe()

## Split into training and test datasets
- We want the training set to contain only "normal" data
- The remaining points go into the test set, which will contain both "normal" and anomalous data

In [None]:
# Report the size of the scaled frame and show its index (the labels later used for the split)
scaled_shape = dataset_scaled.shape
print("dataset_scaled shape is", scaled_shape, "\n\n", dataset_scaled.index)

We will split into training and test sets:
 
 - The **training set** corresponds to the first part of the time series (approximately 25%), where the bearing status is healthy
     - It will train the **Autoencoder model**
     - So the training step will provide the **baseline** that we will use to flag anomalies later
     
 - The **test set** covers the remaining 75% of the series (right part)
     - We will apply to it the threshold value derived from the autoencoder model (baseline)
     - Then we will flag as anomalous every point whose score is above the threshold

In [None]:
# Baseline : analysis = 1 : 3, i.e. the first ~25% of the rows versus the remaining ~75%
n_rows = dataset_scaled.shape[0]
row_slice = round(0.25 * n_rows)

# Index labels at the cut position and at the row right after it
index_slice, index_slice_ = (dataset_scaled.index[row_slice],
                             dataset_scaled.index[row_slice + 1])

print("dataset_scaled shape is", dataset_scaled.shape, "and will be slice at timestamp", index_slice)
print("Analysis set will start at timestamp", index_slice_)

In [None]:
# Label-based slices on the index: the training part runs up to and including
# `index_slice`, the test part starts at the following label `index_slice_`.
X_train = dataset_scaled.loc[:index_slice]
X_test = dataset_scaled.loc[index_slice_:]

# The rows are deliberately kept in time order. The previous `X_train.sample(frac=1)`
# returned a shuffled copy that was never assigned, so it never had any effect, and the
# time-series plots and the merged score plot further down rely on chronological order.
# Keras `fit` already shuffles the training batches each epoch by default.

# The two parts must form an exact partition of the scaled dataset.
assert X_train.shape[0] + X_test.shape[0] == dataset_scaled.shape[0], \
    "train/test split does not cover every row exactly once"

print("Train dataset has lenght", X_train.shape[0], "while test dataset is", X_test.shape[0],
      "\nIn TOTAL there are", X_train.shape[0]+X_test.shape[0],"rows")

In [None]:
x_ticks_span = 50

# Same figure recipe for both partitions: every column as a line, y-axis fixed to the
# scaled range 0-1, one x tick every `x_ticks_span` rows, legend in the upper-left corner.
panels = (
    (X_train, (6, 6), 'Left time series with "normal" data (normalized signals)'),
    (X_test, (18, 6), 'Right time series with "normal" & "anomalous" data (normalized signals)'),
)
for frame, size, heading in panels:
    frame.plot(figsize=size, title=heading)
    tick_positions = np.arange(0, frame.shape[0], x_ticks_span)
    plt.xticks(tick_positions, fontsize=10, rotation=30)
    plt.ylim(0, 1)
    plt.legend(loc="upper left")
    plt.show()

## All components (PCA) just for visualization purposes
Training of the model will use the 4 bearings data, without PCA dimensional reduction.

In fact, the Autoencoder model will have a central (hidden) layer with two nodes (which play the role of the 2 PCA components). Neural networks have the advantage of being able to deal with both linear and non-linear models.

In [None]:
from sklearn.decomposition import PCA

# Keep every component PCA can produce for this data instead of a hard-coded 8, so the
# cell also works when the dataset has a different number of signal columns.
n_components = min(X_train.shape)
pca = PCA(n_components)

# Project the training rows and label the components PC1, PC2, ...
pc_names = [f'PC{i}' for i in range(1, n_components + 1)]
x_pca = pd.DataFrame(pca.fit_transform(X_train), columns=pc_names)

# Scatter of the first two components (visualisation only; the model is trained on X_train itself)
plt.scatter(x_pca['PC1'], x_pca['PC2'])
plt.title('Training dataset projected onto 2 Principal Components')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

### Explained variance of each PCA component

In [None]:
# Print NumPy arrays with 3 decimals and without scientific notation from here on
np.set_printoptions(suppress=True, precision=3)

# Share of the total variance captured by each fitted principal component
variance_share = pca.explained_variance_ratio_
print(variance_share)

# Build autoencoder model

We don't need to apply dimensional reduction, it's done by the Autoencoder model (central layer of two nodes in the Neural Network are the equivalent to the 2 Principal Components)

In [None]:
# Fix the NumPy and TensorFlow random seeds before any layer is created
seed(10)
tensorflow.random.set_seed(10)
# Activation used by all three hidden layers
act_func = 'elu'

# Empty sequential container; layers are appended one by one below
# (layer widths: 2 -> 1 -> 2 -> number of input columns).
# NOTE(review): the narrowest layer here has 1 unit, whereas the surrounding markdown
# describes a central layer of two nodes - confirm which one is intended.
model=Sequential()
# First hidden layer (2 units), connected to input vector X; `input_shape` is the number
# of columns of X_train. The L2 coefficient is 0.0, so this regularizer adds no penalty.
model.add(Dense(2,activation=act_func,
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(0.0),
                input_shape=(X_train.shape[1],)
               )
         )

# Bottleneck: a single unit
model.add(Dense(1,activation=act_func,
                kernel_initializer='glorot_uniform'))

# Decoder hidden layer, mirroring the first hidden layer (2 units)
model.add(Dense(2,activation=act_func,
                kernel_initializer='glorot_uniform'))

# Output layer: one unit per input column, no activation argument given
model.add(Dense(X_train.shape[1],
                kernel_initializer='glorot_uniform'))

# Train on mean squared reconstruction error with the Adam optimizer
model.compile(loss='mse',optimizer='adam')

In [None]:
# Training hyper-parameters: 30 epochs, mini-batches of 10 samples
NUM_EPOCHS=30
BATCH_SIZE=10

In [None]:
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG

# Build the layer graph (top-to-bottom, with shapes and layer names) and display it as SVG
layer_graph = model_to_dot(model, show_shapes=True, show_layer_names=True, rankdir='TB', dpi=70)
SVG(layer_graph.create(prog='dot', format='svg'))

# Fitting the model
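To monitor how well the model generalizes during training, 5% of the training data is held out and used for validation after each epoch (validation_split = 0.05). The quantity tracked is the MSE loss rather than a classification accuracy.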

In [None]:
# Autoencoder training: the training matrix is both the input and the target to reconstruct
train_matrix = np.array(X_train)
fit_options = dict(batch_size=BATCH_SIZE,
                   epochs=NUM_EPOCHS,
                   validation_split=0.05,
                   verbose=1)
history = model.fit(train_matrix, train_matrix, **fit_options)

## Evaluate the model: validation vs. training loss

In [None]:
# One curve per recorded metric: training loss in blue, validation loss in red
curves = (
    ('loss', 'b', 'Training loss'),
    ('val_loss', 'r', 'Validation loss'),
)
for metric, line_style, caption in curves:
    plt.plot(history.history[metric], line_style, label=caption)

plt.xlabel('Epochs')
plt.ylabel('Loss, [mse]')
plt.legend(loc='upper right')
# Optional zoom on small losses:
#plt.ylim([0,.1])
plt.show()

# Distribution of loss function in the training set
By plotting the distribution of the calculated loss in the training set, one can use this to identify a suitable threshold value for identifying an anomaly.

In doing this, one can make sure that this threshold is set above the “noise level”, and that any flagged anomalies should be statistically significant above the noise background.

In [None]:
# Reconstruct the training rows and keep the original column names and index
reconstructed = model.predict(np.array(X_train))
X_pred = pd.DataFrame(reconstructed, columns=X_train.columns, index=X_train.index)

# Per-row score: mean absolute difference between reconstruction and input across columns
scored = (X_pred - X_train).abs().mean(axis=1).to_frame('Loss_mae')
scored.head()

In [None]:
# Histogram (40 bins) with a KDE overlay of the training reconstruction error,
# zoomed to scores between 0 and 0.1
plt.figure()
hist_options = dict(bins=40, kde=True, color='blue')
sns.distplot(scored['Loss_mae'], **hist_options);
plt.xlim([0.0, .1])
plt.ylim([0, 10])

From the above loss distribution, let us try a threshold of 0.06 for flagging an anomaly.

In [None]:
# Anomaly cutoff on the per-row reconstruction error (`Loss_mae`): rows scoring above it are flagged
threshold = 0.06

 We can then calculate the loss in the test set, to check when the output crosses the anomaly threshold.

In [None]:
# Reconstruct the test rows, keeping their column names and index
test_matrix = np.array(X_test)
X_pred = pd.DataFrame(model.predict(test_matrix), columns=X_test.columns, index=X_test.index)

# Per-row mean absolute reconstruction error, the constant cutoff, and the resulting flag
scored = (X_pred - X_test).abs().mean(axis=1).to_frame('Loss_mae')
scored['Threshold'] = threshold
scored['Anomaly'] = scored['Loss_mae'].gt(scored['Threshold'])
scored.tail()

We then calculate the same metrics also for the training set, and merge all data in a single dataframe:

In [None]:
# Reconstruct the training rows, keeping their column names and index
X_pred_train = pd.DataFrame(model.predict(np.array(X_train)),
                            columns=X_train.columns,
                            index=X_train.index)

# Same three columns as for the test set: per-row mean absolute reconstruction error,
# the constant cutoff, and whether the error exceeds it
scored_train = pd.DataFrame(index=X_train.index)
scored_train['Loss_mae'] = np.mean(np.abs(X_pred_train-X_train), axis = 1)
scored_train['Threshold'] = threshold
scored_train['Anomaly'] = scored_train['Loss_mae'] > scored_train['Threshold']

# Merge train scores followed by test scores. Only the test rows are taken from `scored`,
# so re-running this cell does not prepend the training rows a second time (the plain
# `pd.concat([scored_train, scored])` grew `scored` on every re-run). If `scored` does not
# hold the test scores, the label lookup raises instead of building a wrong frame.
# Assumes the index labels are unique.
scored = pd.concat([scored_train, scored.loc[X_test.index]])

An outlier is a point that is distant from others, so the **score** value can be understood *as a distance*. Let's add a column in the training set to flag the anomalies.

# Results from Autoencoder model
Having calculated the loss distribution and the anomaly threshold, we can visualize the model output in the time leading up to the bearing failure:

In [None]:
# Score and threshold over the whole series on a logarithmic y-axis (blue / red)
plot_options = dict(logy=True, figsize=(18, 6), ylim=[1e-3, 1e2], color=['blue', 'red'])
scored.plot(**plot_options)

# One x tick every 50 rows
tick_positions = np.arange(0, scored.shape[0], 50)
plt.xticks(tick_positions, fontsize=10, rotation=30)
# Optional grid:
#plt.gca().grid(True)
plt.ylabel('Reconstruction error')
plt.show()

# Conclusion
- The Autoencoder model performs as well as the PCA + Mahalanobis distance model.
The dataset is not complex enough for a neural network to yield an improved result.