# SHAP Interpretation

In [None]:
import shap
import pandas as pd
import keras
from sklearn.preprocessing import *
import numpy as np
# Scaler instances, created in pairs (one plain name, one with a "2" suffix).
# In the cells visible here only the StandardScaler pair is used: Stdsc for
# the inputs and Stdsc2 for the output column.
Minmaxsc, Minmaxsc2 = MinMaxScaler(feature_range=(0, 1)), MinMaxScaler(feature_range=(0, 1))
Stdsc, Stdsc2 = StandardScaler(), StandardScaler()
MAsc, MAsc2 = MaxAbsScaler(), MaxAbsScaler()
Rsc, Rsc2 = RobustScaler(), RobustScaler()

In [None]:
# Read the pre-processed table. Column 0 is not used here, column 1 is taken
# as the output and every column from positional index 2 onward as an input.
database = pd.read_csv('processed_database.csv')
data_input_full = database.iloc[:, 2:]
data_output_full = database.iloc[:, 1]

# Standardise inputs and output with separate StandardScaler instances; the
# output is reshaped to a single column because the scaler expects 2-D data.
data_input_full_ANN = Stdsc.fit_transform(data_input_full)
data_output_full_ANN = Stdsc2.fit_transform(
    np.array(data_output_full).reshape(-1, 1)
)

In [None]:
# Put the scaled input matrix back into a DataFrame so that it carries the
# original column names.
data_input_full_ANN_for_shap = pd.DataFrame(
    data_input_full_ANN,
    columns=data_input_full.columns,
)

In [None]:
# Restore the trained network from its HDF5 file.
model = keras.models.load_model("Acid_HER.h5")

In [None]:
from keras.models import load_model
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.inspection import plot_partial_dependence
from sklearn.utils.validation import check_is_fitted

# Adapter that lets scikit-learn inspection tools drive a trained Keras model.
# Mixins are listed to the left of BaseEstimator, as scikit-learn requires, so
# that the regressor mixin takes precedence in the method resolution order.
class KerasPDPWrapper(RegressorMixin, BaseEstimator):
    """Present an already-trained model through the scikit-learn regressor API.

    Parameters
    ----------
    model : object
        A trained model exposing ``predict(X)``. It is stored as-is and never
        re-trained by this wrapper.
    """

    def __init__(self, model):
        self.model = model
        # A trailing-underscore attribute is what check_is_fitted looks for,
        # so its presence marks the wrapper as fitted from construction.
        self.fitted_ = True

    def __sklearn_is_fitted__(self):
        """Report the wrapper as fitted; the wrapped model is pre-trained."""
        return True

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the wrapped model is already trained."""
        return self

    def predict(self, X, y=None):
        """Return ``self.model.predict(X)`` unchanged; ``y`` is ignored."""
        return self.model.predict(X)

# Load the saved network from disk and hand it to the scikit-learn adapter.
PDP_model = load_model("Acid_HER.h5")
PDP_wrapped_model = KerasPDPWrapper(model=PDP_model)


In [None]:
# Build the explainer with the scaled input matrix as its masker, then explain
# every row of the column-labelled copy of that same matrix.
explainer = shap.Explainer(model, masker=data_input_full_ANN)
shap_values = explainer(data_input_full_ANN_for_shap)

In [None]:
# Drop the first 64 feature columns (keep positional index 64 onward).
sliced_values = shap_values.values[:, 64:]

# Rebuild an Explanation that holds only the retained columns, slicing the
# values, the underlying data and the feature names consistently.
sliced_shap_values = shap.Explanation(
    values=sliced_values,
    data=shap_values.data[:, 64:],
    feature_names=shap_values.feature_names[64:],
)

# Split into two cohorts, take the mean absolute SHAP value per feature within
# each cohort, and draw the result as a bar chart.
cohorts = sliced_shap_values.cohorts(2).abs.mean(0)
shap.plots.bar(cohorts)

In [None]:
# Summary plot of the columns from index 64 onward, limited to 13 features
# on a 6x6 figure.
shap.summary_plot(
    shap_values[:, 64:],
    feature_names=list(data_input_full.columns)[64:],
    max_display=13,
    alpha=0.5,
    plot_size=[6, 6],
)

In [None]:
# Same summary plot with the display limit raised to 100 features.
shap.summary_plot(
    shap_values[:, 64:],
    feature_names=list(data_input_full.columns)[64:],
    max_display=100,
    alpha=0.5,
)

In [None]:
# Heatmap of the SHAP values for the columns from index 64 onward, showing at
# most 14 features. (A summary_plot with max_display=15 was used here before.)
shap.plots.heatmap(
    shap_values[:, 64:],
    max_display=14,
)

In [None]:
# SHAP dependence plot for 'Judge_Nanoparticles', coloured by
# 'Judge_heterostructures'. The unscaled frame is passed only so the axes show
# the original feature values.
shap.dependence_plot(
    'Judge_Nanoparticles',
    shap_values.values,
    data_input_full,
    interaction_index='Judge_heterostructures',
    alpha=0.5,
)

# Two-way partial dependence of the same feature pair. The model is evaluated
# on the grid built from this frame, so it must be the standardized inputs
# (the same representation used for the SHAP values), not the raw table.
# NOTE(review): plot_partial_dependence is deprecated since scikit-learn 1.0
# and removed in 1.2; PartialDependenceDisplay.from_estimator replaces it.
plot_partial_dependence(
    PDP_wrapped_model,
    data_input_full_ANN_for_shap,
    [('Judge_Nanoparticles', 'Judge_heterostructures')],
    grid_resolution=50,
)

In [None]:
# SHAP dependence plot for 'Judge_3D', coloured by 'Judge_alloy'. The unscaled
# frame is passed only so the axes show the original feature values.
shap.dependence_plot(
    'Judge_3D',
    shap_values.values,
    data_input_full,
    interaction_index='Judge_alloy',
    alpha=0.5,
)

# Two-way partial dependence of the same feature pair. The model is evaluated
# on the grid built from this frame, so it must be the standardized inputs
# (the same representation used for the SHAP values), not the raw table.
# NOTE(review): plot_partial_dependence is deprecated since scikit-learn 1.0
# and removed in 1.2; PartialDependenceDisplay.from_estimator replaces it.
plot_partial_dependence(
    PDP_wrapped_model,
    data_input_full_ANN_for_shap,
    [('Judge_3D', 'Judge_alloy')],
    grid_resolution=50,
)

In [None]:
# Clustered bar plot: compute a hierarchical clustering from the scaled inputs
# and scaled output, then pass it to the bar plot of the 15 top features.
clustering = shap.utils.hclust(data_input_full_ANN, data_output_full_ANN)
shap.plots.bar(
    shap_values,
    max_display=15,
    clustering=clustering,
    clustering_cutoff=0.5,
)

In [None]:
from sklearn.inspection import partial_dependence
def compute_pdp_importances(model, X):
    """Score each feature by the range of its averaged partial dependence.

    For every column of ``X`` the one-way partial dependence of ``model`` is
    computed on a 50-point grid; the importance is the maximum minus the
    minimum of the averaged predictions over that grid.

    Parameters
    ----------
    model : estimator
        Fitted estimator accepted by ``sklearn.inspection.partial_dependence``.
    X : array-like with a ``shape`` attribute
        Data the partial dependence is computed on; one score is produced per
        column (``X.shape[1]``).

    Returns
    -------
    numpy.ndarray
        One importance value per feature, in column order.
    """
    pdp_importances = []
    for feature_idx in range(X.shape[1]):
        pdp_result = partial_dependence(
            model, X, [feature_idx], grid_resolution=50
        )
        # Older scikit-learn returns an (averaged_predictions, values) tuple;
        # newer releases return a dict-like Bunch keyed by "average".
        # Unpacking a Bunch would yield its key names, not the arrays.
        if isinstance(pdp_result, tuple):
            averaged = pdp_result[0]
        else:
            averaged = pdp_result["average"]
        pdp_importances.append(np.max(averaged) - np.min(averaged))
    return np.array(pdp_importances)

In [None]:
import matplotlib.pyplot as plt
# Partial-dependence importance of every (scaled) input column.
pdp_importances = compute_pdp_importances(
    PDP_wrapped_model,
    data_input_full_ANN_for_shap,
)

# Pair each score with its column name, drop the first 64 columns (keep
# positional index 64 onward) and order the rest by ascending importance.
importances_df = (
    pd.DataFrame({
        'feature': data_input_full_ANN_for_shap.columns,
        'importance': pdp_importances
    })
    .iloc[64:]
    .sort_values(by='importance')
)

# Horizontal bar chart with one bar per remaining feature, labelled by name.
plt.figure(figsize=(9, 18))
plt.barh(range(len(importances_df)), importances_df['importance'])
plt.yticks(range(len(importances_df)), importances_df['feature'])
plt.xlabel('Partial Dependence Importance')
plt.show()