Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Saliency Map for GAT #435

Merged
merged 19 commits into from Aug 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Empty file added __init__.py
Empty file.
1,101 changes: 1,101 additions & 0 deletions demos/interpretability/gat/node-link-importance-demo-gat.ipynb

Large diffs are not rendered by default.

40 changes: 35 additions & 5 deletions stellargraph/layer/graph_attention.py
Expand Up @@ -98,6 +98,7 @@ def __init__(
kernel_constraint=None,
bias_constraint=None,
attn_kernel_constraint=None,
saliency_map_support=False,
**kwargs
):

Expand Down Expand Up @@ -130,6 +131,7 @@ def __init__(
self.bias_constraint = constraints.get(bias_constraint)
self.attn_kernel_constraint = constraints.get(attn_kernel_constraint)

self.saliency_map_support = saliency_map_support
# Populated by build()
self.kernels = [] # Layer kernels for attention heads
self.biases = [] # Layer biases for attention heads
Expand Down Expand Up @@ -210,6 +212,17 @@ def build(self, input_shapes):
feat_shape = input_shapes[0]
input_dim = feat_shape[-1]

# Variables to support integrated gradients
self.delta = self.add_weight(
name="ig_delta", shape=(), trainable=False, initializer=initializers.ones()
)
self.non_exist_edge = self.add_weight(
name="ig_non_exist_edge",
shape=(),
trainable=False,
initializer=initializers.zeros(),
)

# Initialize weights for each attention head
for head in range(self.attn_heads):
# Layer kernel
Expand Down Expand Up @@ -279,6 +292,7 @@ def call(self, inputs):
X = inputs[0] # Node features (1 x N x F)
out_indices = inputs[1] # output indices (1 x K)
A = inputs[2] # Adjacency matrix (N x N)
N = K.int_shape(A)[-1]

batch_dim, n_nodes, _ = K.int_shape(X)
if batch_dim != 1:
Expand Down Expand Up @@ -322,11 +336,25 @@ def call(self, inputs):
# YT: this only works for 'binary' A, not for 'weighted' A!
# YT: if A does not have self-loops, the node itself will be masked, so A should have self-loops
# YT: this is ensured by setting the diagonal elements of A tensor to 1 above
mask = -10e9 * (1.0 - A)
dense += mask
if not self.saliency_map_support:
mask = -10e9 * (1.0 - A)
self.A = A
dense += mask
dense = K.softmax(dense) # (N x N), Eq. 3 of the paper

# Apply softmax to get attention coefficients
dense = K.softmax(dense, axis=1) # (N x N), Eq. 3 of the paper
else:
# dense = dense - tf.reduce_max(dense)
# GAT with support for saliency calculations
W = (self.delta * A) * K.exp(
dense - K.max(dense, axis=1, keepdims=True)
) * (1 - self.non_exist_edge) + self.non_exist_edge * (
A
+ self.delta * (K.ones(shape=[N, N], dtype="float") - A)
+ K.eye(N)
) * K.exp(
dense - K.max(dense, axis=1, keepdims=True)
)
dense = W / K.sum(W, axis=1, keepdims=True)

# Apply dropout to features and attention coefficients
dropout_feat = Dropout(self.in_dropout_rate)(features) # (N x F')
Expand Down Expand Up @@ -600,12 +628,13 @@ def __init__(
attn_dropout=0.0,
normalize=None,
generator=None,
saliency_map_support=False,
):
self.bias = bias
self.in_dropout = in_dropout
self.attn_dropout = attn_dropout
self.generator = generator

self.saliency_map_support = saliency_map_support
# Check layer_sizes (must be list of int):
# check type:
if not isinstance(layer_sizes, list):
Expand Down Expand Up @@ -760,6 +789,7 @@ def __init__(
activation=self.activations[ii],
use_bias=self.bias,
final_layer=ii == (n_layers - 1),
saliency_map_support=self.saliency_map_support,
)
)

Expand Down
2 changes: 2 additions & 0 deletions stellargraph/utils/saliency_maps/__init__.py
Expand Up @@ -16,3 +16,5 @@

from .integrated_gradients import IntegratedGradients
from .saliency import GradientSaliency
from .integrated_gradients_gat import IntegratedGradientsGAT
from .saliency_gat import GradientSaliencyGAT
136 changes: 136 additions & 0 deletions stellargraph/utils/saliency_maps/integrated_gradients_gat.py
@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
#
# Copyright 2018 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The vanilla gradients may not work well for the graph setting. The main reason is that when you compute the vanilla gradients,
you only get the direction of change at the current state of the graph (i.e., the adjacency matrix and feature matrix). However,
even though the feature values and entries in the adjacency matrix are not continuous values, the model (e.g., GCN or GAT) learns
a continuous function which may not be linear when a feature or edge value changes discretely. Take ReLU(x) as an example: when x
changes from 0 to 1, the output of the function changes from 0 to 1 as well. However, when you compute the gradient of the function
at x = 0, you get grad(ReLU(x = 0)) = 0, which is obviously not what we want.

Integrated gradients approximates Shapley values by integrating partial gradients w.r.t. input features from a reference input to the
actual input. Therefore, it can solve the problem described above and give much better accuracy. It was initially proposed in the paper
"Axiomatic Attribution for Deep Networks" published in ICML'17.
"""

import numpy as np
from .saliency_gat import GradientSaliencyGAT
import scipy.sparse as sp
import keras.backend as K


class IntegratedGradientsGAT(GradientSaliencyGAT):
    """
    A SaliencyMask class that implements the integrated gradients method.

    The raw gradients are obtained from the parent class (``get_node_masks`` and ``get_link_masks``);
    this class accumulates them over ``steps`` interpolation points and scales the average gradient
    by the difference between the actual input and the baseline.
    """

    def __init__(self, model, generator, node_list):
        """
        Args:
            model: Passed unchanged to the parent class constructor.
            generator: Passed unchanged to the parent class constructor.
            node_list (list): Node IDs. The position of a node ID in this list is used as the node
                index handed to the parent class gradient functions.
        """
        self.node_list = node_list
        super().__init__(model, generator)

    def get_integrated_node_masks(
        self,
        node_id,
        class_of_interest,
        X_baseline=None,
        steps=20,
        non_exist_feature=False,
    ):
        """
        Compute the integrated gradients which measure the importance of each feature to the prediction
        score of 'class_of_interest' for node 'node_id'.

        Args:
            node_id: The node ID; it must be present in the ``node_list`` given at construction time.
            class_of_interest (int): The class of interest for which the saliency maps are computed.
            X_baseline: The reference X to start the integration from. If None, an all-zero matrix of the
                same shape as the feature matrix is used (or, when ``non_exist_feature`` is True, the
                feature matrix itself is the baseline and an all-ones matrix is the target).
                When X_baseline is given, ``non_exist_feature`` is ignored.
            steps (int): The number of values we need to interpolate. Generally steps = 20 should give
                good enough results.
            non_exist_feature (bool): Setting it to True allows to compute the importance of features
                that are 0.

        Returns:
            Numpy array: Integrated gradients for the node features, with the leading axis of size 1
            removed.

        Raises:
            ValueError: If ``node_id`` is not in ``node_list``.
        """
        node_idx = self.node_list.index(node_id)

        X_val = self.X
        if X_baseline is None:
            if not non_exist_feature:
                X_baseline = np.zeros(X_val.shape)
            else:
                # Integrate from the actual features towards all-ones, so that only
                # the features that are currently 0 receive a non-zero difference.
                X_baseline = X_val
                X_val = np.ones_like(X_val)
        X_diff = X_val - X_baseline
        total_gradients = np.zeros(X_val.shape)

        for alpha in np.linspace(1.0 / steps, 1, steps):
            X_step = X_baseline + alpha * X_diff
            total_gradients += super().get_node_masks(
                node_idx, class_of_interest, X_val=X_step
            )
        # Integrated gradients is (input - baseline) times the *average* gradient along
        # the path, so the accumulated sum has to be divided by the number of steps
        # (get_link_importance applies the same normalisation).
        return np.squeeze(total_gradients * X_diff / steps, 0)

    def get_link_importance(
        self, node_id, class_of_interest, steps=20, non_exist_edge=False
    ):
        """
        Compute the integrated gradients which measure the importance of each edge to the prediction
        score of 'class_of_interest' for node 'node_id'.

        Args:
            node_id: The node ID; it must be present in the ``node_list`` given at construction time.
            class_of_interest (int): The class of interest for which the saliency maps are computed.
            steps (int): The number of values we need to interpolate. Generally steps = 20 should give
                good enough results.
            non_exist_edge (bool): Setting to True allows the function to get the importance for
                non-existent edges. This is useful when we want to understand which added edges could
                change the current predictions, but the results for existing edges are not reliable.
                Similarly, setting to False (A_baseline = all-zero matrix) can only accurately measure
                the importance of existing edges.

        Returns:
            Numpy array: Integrated gradients for the links, shaped like A_val with the leading axis of
            size 1 removed.

        Raises:
            ValueError: If ``node_id`` is not in ``node_list``.
        """
        node_idx = self.node_list.index(node_id)

        A_val = self.A
        total_gradients = np.zeros(A_val.shape)
        if not non_exist_edge:
            A_diff = A_val
        else:
            # Complement of A without self-loops. The identity must be sized by the
            # number of nodes (last axis): the leading axis is the size-1 axis that
            # is squeezed away below, so np.eye(A_val.shape[0]) would degenerate to
            # a scalar 1 and produce -A instead of the complement.
            A_diff = np.ones_like(A_val) - np.eye(A_val.shape[-1]) - A_val

        # The CSR structure (indices / indptr) is needed to scatter the gradient values
        # back into a dense matrix; LIL matrices do not expose these attributes.
        # The conversion does not depend on alpha, so it is done once.
        # NOTE(review): assumes the parent returns one gradient value per stored entry of
        # A in CSR order when the model uses a sparse adjacency input -- confirm.
        A_sparse = sp.csr_matrix(A_val) if self.is_sparse else None

        for alpha in np.linspace(1.0 / steps, 1.0, steps):
            tmp = super().get_link_masks(
                alpha, node_idx, class_of_interest, int(non_exist_edge)
            )
            if self.is_sparse:
                tmp = sp.csr_matrix(
                    (tmp, A_sparse.indices, A_sparse.indptr), shape=A_sparse.shape
                ).toarray()
            total_gradients += tmp
        return np.squeeze(np.multiply(total_gradients, A_diff) / steps, 0)

    def get_node_importance(self, node_id, class_of_interest, steps=20):
        """
        The importance of a node is defined as the sum of all the feature importances of that node.

        Args:
            Refer to the parameters in get_integrated_node_masks.

        Returns:
            Numpy array: The integrated feature gradients summed over axis 1, i.e. one importance
            score per row of the array returned by get_integrated_node_masks.
        """
        gradients = self.get_integrated_node_masks(
            node_id, class_of_interest, steps=steps
        )
        return np.sum(gradients, axis=1)