# Start here

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show what is available under the ../input directory.
print(os.listdir("../input"))

In [None]:
# Show the files of the hpcc20steps dataset that the cells below read from.
print(os.listdir("../input/hpcc20steps/"))

In [None]:
# Pin TensorFlow to 1.14.0 for the rest of the notebook.
# NOTE(review): presumably needed by the Keras + SHAP explainers used below — confirm.
!pip install tensorflow==1.14.0

# Data

In [None]:
# Read the variable-name table (the CSV has no header row) and keep its
# second column as the array of feature names.
variables_name = pd.read_csv(
    "../input/hpcc20steps/variables_name.csv",
    header=None,
)
features = variables_name.values[:, 1]

In [None]:
# Display the raw variable-name table.
variables_name

In [None]:
# Display the feature names taken from column 1 of the table.
features

In [None]:
import json


def load_json_array(path):
    """Read the JSON document at ``path`` and return it as a NumPy array.

    Parameters
    ----------
    path : str
        Location of the JSON file to read.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the decoded JSON content.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding="utf-8") as json_file:
        return np.array(json.load(json_file))


# Train / test splits of the HPCC data set.
X_train = load_json_array("../input/hpcc20steps/X_train_HPCC_1_20.json")
y_train = load_json_array("../input/hpcc20steps/y_train_HPCC_1_20.json")
X_test = load_json_array("../input/hpcc20steps/X_test_HPCC_1_20.json")
y_test = load_json_array("../input/hpcc20steps/y_test_HPCC_1_20.json")

In [None]:
# Shapes of the training inputs and targets.
X_train.shape, y_train.shape

In [None]:
# Shapes of the test inputs and targets.
X_test.shape, y_test.shape

# Model

In [None]:
from keras import regularizers
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import Dropout
from keras.layers import Flatten
from keras.optimizers import Adam


def createModel(l1Nodes, l2Nodes, d1Nodes, d2Nodes, inputShape, learning_rate=0.005):
    """Build and compile the stacked-LSTM regression network.

    Architecture: LSTM -> LSTM (both return full sequences) -> Flatten ->
    Dense -> Dense -> Dense(1). No Dense layer sets an activation, so each
    uses the Keras default.

    Parameters
    ----------
    l1Nodes, l2Nodes : int
        Number of units in the first and second LSTM layer.
    d1Nodes, d2Nodes : int
        Number of units in the two hidden Dense layers.
    inputShape : tuple
        Shape of one input sample, forwarded to the first LSTM layer as
        ``input_shape``.
    learning_rate : float, optional
        Learning rate handed to the Adam optimizer. Defaults to 0.005, the
        value that used to be hard-coded.

    Returns
    -------
    model
        The ``Sequential`` model, compiled with Adam and MSE loss.
    """
    model = Sequential([
        # Recurrent part; sequences are kept so that Flatten sees every step.
        LSTM(l1Nodes, input_shape=inputShape, return_sequences=True),
        LSTM(l2Nodes, return_sequences=True),
        Flatten(),
        # Fully connected part.
        Dense(d1Nodes),
        Dense(d2Nodes),
        # Single-unit output layer.
        Dense(1),
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

In [None]:
# Build the network for the (time steps, features) shape of one training
# sample, then train it on the training split.
model = createModel(
    l1Nodes=8,
    l2Nodes=8,
    d1Nodes=8,
    d2Nodes=4,
    inputShape=(X_train.shape[1], X_train.shape[2]),
)
model.fit(X_train, y_train, batch_size=8, epochs=30)

In [None]:
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Mean squared error of the model on the training split.
y_pred_train = model.predict(X_train)
mse(y_train, y_pred_train)

In [None]:
# Mean squared error of the model on the test split.
y_pred = model.predict(X_test)
mse(y_test, y_pred)

In [None]:
# Print the layer-by-layer summary of the trained model.
model.summary()

In [None]:
# Save the entire model to an HDF5 file.
# The '.h5' extension indicates that the model should be saved to HDF5.
model.save('HPCCv1_model.h5') 

# SHAP

In [None]:
import shap

In [None]:
# Record the TensorFlow version in use.
import tensorflow as tf
tf.__version__

## DeepSHAP

In [None]:
# Use the training data as the background set for DeepExplainer
# (fewer instances can be used).
explainer = shap.DeepExplainer(model, X_train)
# Explain the test instances (fewer instances can be used).
# Explaining each prediction requires 2 * background dataset size runs.
shap_values = explainer.shap_values(X_test)
# Initialise the JS visualization code used by the force plots.
shap.initjs()

In [None]:
# Base value(s) of the explainer; later cells use element [0].
explainer.expected_value

In [None]:
# Number of entries in shap_values; later cells use element [0].
len(shap_values)

In [None]:
# Shape of the explained inputs, for comparison with the SHAP array below.
X_test.shape

In [None]:
# Shape of the SHAP array for all explained test instances.
shap_values[0].shape

In [None]:
# Shape of the SHAP values of the first test instance.
shap_values[0][0].shape

In [None]:
# Disabled earlier force-plot attempt, kept for reference:
# shap.force_plot(explainer.expected_value[0], shap_values[0][0][0,:], features)
# Print the feature names and how many there are.
print(features)
print(len(features))

In [None]:
# i selects a test instance, j selects a time step within it.
i=0
j=0

In [None]:
# SHAP values of test instance i at time step j.
shap_values[0][i][j]

In [None]:
# Shape of the input of test instance i at time step j.
X_test[i][j].shape


In [None]:
# Force plot for one time step j of one test instance i.
i = 0
j = 0
# reshape(1, -1) builds the one-row frame without hard-coding the number of
# features; the column count is checked against `features` by pandas.
x_test_df = pd.DataFrame(data=X_test[i][j].reshape(1, -1), columns=features)
shap.force_plot(explainer.expected_value[0], shap_values[0][i][j], x_test_df)

## Check sum of shap values vs prediction

In [None]:
# Record the SHAP version in use.
shap.__version__

In [None]:
# Shape of the SHAP values of test instance i.
shap_values[0][i].shape

In [None]:
# Shape of the SHAP values of the first test instance.
shap_values[0][0].shape

In [None]:
# Additivity check for test instance i: the sum of all its SHAP values plus
# the explainer's expected value is compared with the model prediction.
i = 11
# Slice (not index) so the model receives a batch of one sample.
pred_i = model.predict(X_test[i:i+1])
sum_shap_i = shap_values[0][i].sum() + explainer.expected_value[0]

pred_i, sum_shap_i

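The two values match: the SHAP values of the instance plus the expected value add up to the model's prediction, so the explanation looks consistent.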

In [None]:
# Force plot of the SHAP values for ONE observation i only.
# Limitation: this plot cannot take many observations into account at the
# same time. The figure below explains a single observation of 20 time
# steps, each time step having 10 features.
shap.initjs()
i = 0

x_test_df = pd.DataFrame(X_test[i], columns=features)
shap.force_plot(explainer.expected_value[0], shap_values[0][i], x_test_df)

### AVERAGE shap for ALL Obs

In [None]:
################# Plot AVERAGE shap values for ALL observations  #####################
## Consider ABSOLUTE of SHAP values ##
# Named *_abs_* so the signed average computed in the next cell does not
# rebind the same name with a different meaning.
shap_average_abs_value = np.abs(shap_values[0]).mean(axis=0)

# Per-position mean of the test inputs, shown next to the averaged SHAP values.
x_average_value = pd.DataFrame(data=X_test.mean(axis=0), columns=features)
# The base value is 0 here rather than the explainer's expected value.
shap.force_plot(0, shap_average_abs_value, x_average_value)

In [None]:
################# Plot AVERAGE shap values for ALL observations  #####################
## Signed average: positive and negative contributions are kept apart from
## each other only by their sign, so they can cancel out.
x_average_value = pd.DataFrame(X_test.mean(axis=0), columns=features)
shap_average_value = np.mean(shap_values[0], axis=0)

shap.force_plot(explainer.expected_value[0], shap_average_value, x_average_value)

In [None]:
# Collapse the leading axes so that every row holds one feature vector.
# The feature count comes from the data instead of a hard-coded 10.
shap_values_2D = shap_values[0].reshape(-1, X_test.shape[-1])
X_test_2D = X_test.reshape(-1, X_test.shape[-1])


shap_values_2D.shape, X_test_2D.shape

In [None]:
# Wrap the flattened test inputs in a DataFrame with named feature columns.
x_test_2d = pd.DataFrame(data=X_test_2D, columns = features)

In [None]:
# Pairwise correlation between the feature columns of the flattened test set.
x_test_2d.corr()

In [None]:
# SHAP summary plot over the flattened test set.
shap.summary_plot(shap_values_2D, x_test_2d)

In [None]:
# Same data as a bar-type summary plot.
shap.summary_plot(shap_values_2D, x_test_2d, plot_type="bar")

In [None]:
# Number of rows in the flattened test set.
len_test_set = X_test_2D.shape[0]
len_test_set

In [None]:
## SHAP for each time step
# Taken from the data (samples, time steps, features) instead of hard-coded.
NUM_STEPS = X_test.shape[1]
NUM_FEATURES = X_test.shape[2]


for step in range(NUM_STEPS):
    # The flattened arrays keep the time steps of one sample on consecutive
    # rows, so every NUM_STEPS-th row starting at `step` belongs to this time
    # step. A strided slice selects the same rows as filtering on
    # `row % NUM_STEPS == step`, without building an index list per iteration.
    shap_values_2D_step = shap_values_2D[step::NUM_STEPS]
    x_test_2d_step = x_test_2d.iloc[step::NUM_STEPS]
    print("_______ time step {} ___________".format(step))
    shap.summary_plot(shap_values_2D_step, x_test_2d_step, plot_type="bar")
    shap.summary_plot(shap_values_2D_step, x_test_2d_step)
    print("\n")

## Outliers vs Non-Outliers

In [None]:
# Load the outlier and non-outlier subsets of the training data.
# The order of the paths must match the unpacking at the end of the cell.
subset_paths = (
    "../input/hpcc20steps/X_train_outlier.json",
    "../input/hpcc20steps/y_train_outlier.json",
    "../input/hpcc20steps/X_train_not_outlier.json",
    "../input/hpcc20steps/y_train_not_outlier.json",
)

subset_arrays = []
for subset_path in subset_paths:
    with open(subset_path) as of:
        subset_arrays.append(np.array(json.load(of)))

(X_train_outlier, y_train_outlier,
 X_train_not_outlier, y_train_not_outlier) = subset_arrays

In [None]:
## OUTLIERS
# Stored under its own name: reusing `shap_values` would overwrite the
# test-set SHAP values that earlier cells read, so re-running those cells
# would silently use the wrong data.
shap_values_outlier = explainer.shap_values(X_train_outlier)
i = 0
x_outlier_df = pd.DataFrame(data=X_train_outlier[i], columns=features)
shap.force_plot(explainer.expected_value[0], shap_values_outlier[0][i], x_outlier_df)

In [None]:
## NON-OUTLIERS
# Stored under its own name: reusing `shap_values` would overwrite the
# test-set SHAP values that earlier cells read, so re-running those cells
# would silently use the wrong data.
shap_values_not_outlier = explainer.shap_values(X_train_not_outlier)
i = 0
x_not_outlier_df = pd.DataFrame(data=X_train_not_outlier[i], columns=features)
shap.force_plot(explainer.expected_value[0], shap_values_not_outlier[0][i], x_not_outlier_df)

In [None]:
# Target of the first non-outlier training instance (the one plotted above).
y_train_not_outlier[0]

In [None]:
# Target of the first outlier training instance (the one plotted above).
y_train_outlier[0]

## GradientExplainer

In [None]:
# Use the training data as the background set for GradientExplainer
# (fewer instances can be used).
explainer_2 = shap.GradientExplainer(model, X_train)
# Explain the test instances (fewer instances can be used).
# NOTE(review): the run-cost remark copied from the DeepExplainer cell
# ("2 * background dataset size runs") was removed here — confirm the cost for GradientExplainer.
shap_values_2 = explainer_2.shap_values(X_test)
# Initialise the JS visualization code used by the force plots.
shap.initjs()

In [None]:
################# Plot AVERAGE shap values for ALL observations  #####################
## Mean of the ABSOLUTE GradientExplainer SHAP values over the test set ##
x_average_value = pd.DataFrame(X_test.mean(axis=0), columns=features)
shap_average_abs_value_2 = np.mean(np.abs(shap_values_2[0]), axis=0)

shap.force_plot(0, shap_average_abs_value_2, x_average_value)

# Importance for each training instance with SHAP GradientExplainer 

In [None]:
################# Plot AVERAGE shap values for ALL observations  #####################
## Mean of the ABSOLUTE SHAP values over the training set ##
# NOTE(review): the section heading mentions GradientExplainer, but this cell
# uses `explainer` (the DeepExplainer), not `explainer_2` — confirm which one
# is intended.
shap.initjs()
shap_values_train = explainer.shap_values(X_train)

x_average_value_train = pd.DataFrame(X_train.mean(axis=0), columns=features)
shap_average_abs_value_train = np.mean(np.abs(shap_values_train[0]), axis=0)

shap.force_plot(0, shap_average_abs_value_train, x_average_value_train)

In [None]:
# Collapse the leading axes so that every row holds one feature vector.
# The feature count comes from the data instead of a hard-coded 10.
shap_values_train_2D = shap_values_train[0].reshape(-1, X_train.shape[-1])
X_train_2D = X_train.reshape(-1, X_train.shape[-1])


# Summary plot over the flattened training set, labelled with the feature names.
shap.summary_plot(shap_values_train_2D, X_train_2D, features)

**Some Comments**:
- *CPU1 Temp* is the most important feature
- *Fan3 Speed* and *Fan2 Speed* seem to have a positive correlation with the output. Conversely, *Fan4* and *Fan1* have negative relationships with the output.
- *CPU Load* doesn't have a clear linear relationship with the output (red dots at the both sides)

In [None]:
# COLOR: https://seaborn.pydata.org/tutorial/color_palettes.html
import seaborn as sns
import matplotlib.pyplot as plt

# One heatmap per feature: rows are training instances, columns are time
# steps, colour is the SHAP value of that feature at that position.
for i, feature in enumerate(features):
    print(feature)

    # Selecting one feature already yields a 2-D (instances, time steps)
    # array, so no reshape with a hard-coded number of steps is needed.
    feature_shap = shap_values_train[0][:, :, i]
    print(feature_shap.shape)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(feature_shap, cmap="coolwarm", ax=ax)
    ax.set(title=str(feature), xlabel="time step", ylabel="training instance")
    # plt.show() does not take the Axes; passing it positionally is rejected
    # by recent Matplotlib versions.
    plt.show()
    # Release the figure so that one figure per feature does not pile up.
    plt.close(fig)
    print("-----------")

**Some Comments**

- *CPU1 Temp*: Light colors at the early time steps. The colors become stronger from the 10th to the 18th step => these steps play an important role in the prediction.
(Recall that the output is the sum of the 20*10 importance scores.)
- *CPU2 Temp*: The 9th and 10th time steps have light colors; some steps near the end have darker colors.
- *Inlet Temp* & *Power consumption*: Early time steps have the most impact on the prediction.
- *CPU Load*: Almost entirely blue => it has a negative impact on the prediction.
- *Memory Usage*: The opposite of *CPU Load*.
- *Fan1* & *Fan 3*: almost blue; while *Fan2 & 4* are in orange