In [1]:
%matplotlib inline  

import warnings
warnings.simplefilter('ignore')

import os
import json
import copy
import time
import numpy as np
import pandas as pd
import keras.models
import keras.backend as K
from keras.models import Model

Using TensorFlow backend.


In [2]:
import sys; sys.path.append("../")

from config import load_config
from utils import (load_data, preprocess, keras_gcn,
                  occlude_and_predict)
from plot_utils import (draw_chem_activations, plot_image_grid, 
                        create_figs, create_im_arrs)

from methods import (CAM, GradCAM, GradCAMAvg, Gradient, EB, cEB)

In [3]:
# Process-wide switches, set in one call:
#  - an empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA;
#  - HDF5_USE_FILE_LOCKING=FALSE turns off HDF5 file locking.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': "",
    'HDF5_USE_FILE_LOCKING': 'FALSE',
})

In [4]:
# Choose the dataset to analyse: leave exactly one assignment uncommented.
# The name selects the config and the saved-model / result file names
# (always through dataset.lower()).
#dataset = "BBBP"
#dataset = "BACE"
dataset = "TOX21"

In [4]:
# Relative locations used by this notebook.
# NOTE(review): data_dir is assigned here but the data-loading cell reads
# config['data_dir'] instead — confirm whether this variable is still needed.
data_dir = "../data/"
results_dir = "../results"
# Folder that receives the aggregated occlusion results CSV.
out_dir = os.path.join(results_dir, "occlusions")

# Data

In [6]:
# Load the dataset described by the config and run the project preprocessing.
config = load_config(dataset)
data_fp = os.path.join(config['data_dir'], config['data_fn'])
raw_data = load_data(data_fp)
smiles = raw_data["smiles"]
data = preprocess(raw_data)

# Display name for the positive class; TOX21 gets the "-NR-ER" suffix.
dataset_external = dataset + "-NR-ER" if dataset == "TOX21" else dataset

# Class index -> human-readable class name.
label_to_class_name = {
    0: "Not {}".format(dataset_external),
    1: "{}".format(dataset_external),
}

 58%|█████▊    | 3565/6193 [00:03<00:02, 985.89it/s] parse(), type: AssertionError, ()
100%|██████████| 6193/6193 [00:06<00:00, 925.11it/s]


In [7]:
# NumPy array built from the preprocessed one-hot labels; the index-selection
# cell below uses its length to enumerate all examples.
labels = np.array(data["labels_one_hot"])

# Model

In [8]:
# Build the GCN from the config, then restore the weights saved for this dataset.
model = keras_gcn(config)

model_fn = "gcn_{}.h5".format(dataset.lower())
model_fp = os.path.join(config["saved_models_dir"], model_fn)
model.load_weights(model_fp)

# Width of the one-hot label matrix, i.e. the number of classes.
num_classes = data['labels_one_hot'].shape[1]

# Select Occlusion Data

In [9]:
# Option A (disabled): class-balanced subset, k examples per class (2k total).
# k = 100
# pos_inds = [i for i,x in enumerate(labels) if np.argmax(x) == 1 ]
# neg_inds = [i for i,x in enumerate(labels) if np.argmax(x) == 0 ]
# occlude_data_inds = np.array(pos_inds[:k] + neg_inds[:k])
# occlude_data_inds = np.concatenate([np.random.choice(pos_inds, k), np.random.choice(neg_inds, k)], axis=0)

# Option B (active): every example in the dataset.
occlude_data_inds = np.arange(len(labels))

# Option C (disabled): k examples drawn at random.
#k = 5
#occlude_data_inds = np.random.choice(np.arange(0, len(labels)), k)

In [10]:
# SMILES strings of the selected examples, in the same order as occlude_data_inds.
occlude_smiles = [smiles[idx] for idx in occlude_data_inds]

In [11]:
# Number of selected examples.
# NOTE(review): not read by any other cell visible in this notebook.
num_to_explain = len(occlude_data_inds)

In [12]:
# Restrict every entry of `data` to the selected indices.
# ndarrays are fancy-indexed, lists are picked element by element;
# any other container type is rejected.
occlude_data = {}
for key, values in data.items():
    if isinstance(values, np.ndarray):
        occlude_data[key] = values[occlude_data_inds]
    elif isinstance(values, list):
        occlude_data[key] = [values[idx] for idx in occlude_data_inds]
    else:
        raise Exception("Data Type Not Supported")

In [13]:
# Instantiate one explainer per attribution method, all wrapping the same model.
cam = CAM(model)
gcam = GradCAM(model)
gcam_avg = GradCAMAvg(model)
grad = Gradient(model)
eb = EB(model)
ceb = cEB(model)


# Methods that are actually evaluated, paired positionally with their display names.
# NOTE(review): `cam` is created above but is not in this list; the GradCAM
# instance is reported under the label "CAM-GradCAM" — confirm this is intended.
methods = [grad, eb, ceb, gcam, gcam_avg]
method_names = ["Gradient", "EB", "cEB", "CAM-GradCAM", "GradCAM-avg"]

In [14]:
# Number of selected examples; the explain/occlude loop iterates range(N).
N = len(occlude_data['norm_adjs'])

# Explain / Occlude / Predict

In [15]:
# Thresholds passed one at a time to occlude_and_predict together with a
# normalized mask; one occluded prediction is recorded per threshold.
# NOTE(review): presumably nodes whose mask weight exceeds the threshold are
# occluded — confirm against utils.occlude_and_predict.
occlude_thresholds = [0, 0.01, 0.05, 0.1]
# Alternative (disabled): ten evenly spaced thresholds in [0, 1).
#occlude_thresholds = np.linspace(0,1, num=10, endpoint=False)

In [16]:
# Explain / occlude / predict.
#
# For every selected example and every attribution method:
#   1. compute the per-class masks with method.getMasks,
#   2. scale them by their joint maximum,
#   3. call occlude_and_predict once per threshold for each class mask,
#   4. store one record per (method, class).
#
# `results` ends up as a list with one entry per example; each entry is a list
# of record dicts with the keys 'weights', 'smile', 'method', 'class',
# 'pred_y', 'pred_y_occ' and 'gt_y'.
results = []

for i in range(N):
    # Prepend a batch axis of size 1 to the per-example arrays.
    A_arr = occlude_data['norm_adjs'][i][np.newaxis, :, :]
    X_arr = occlude_data['node_features'][i][np.newaxis, :, :]
    Y_arr = occlude_data['labels_one_hot'][i]
    smile = occlude_smiles[i]

    # Prediction on the unmodified input, and the ground-truth class index.
    prob = model.predict_on_batch(x=[A_arr, X_arr])
    y_hat = prob.argmax()
    y = Y_arr.argmax()

    results_ = []
    for name, method in zip(method_names, methods):
        mask = np.array(method.getMasks([A_arr, X_arr]))

        # Normalize by the largest value over both class masks.
        # A peak of exactly 0 is skipped: dividing by it would turn the masks
        # into NaN/inf. Out-of-place division also keeps integer-valued masks
        # from raising on in-place true division.
        peak = mask.max()
        if peak != 0:
            mask = mask / peak
        masks_c0, masks_c1 = mask  # exactly two class masks are expected

        for cls, cls_mask in ((0, masks_c0), (1, masks_c1)):
            # One occluded prediction per threshold, in occlude_thresholds order.
            pred_y_occ = [occlude_and_predict(X_arr, A_arr, cls_mask, thresh, model)
                          for thresh in occlude_thresholds]
            results_.append({'weights': cls_mask,
                             'smile': smile,
                             'method': name,
                             'class': cls,
                             'pred_y': y_hat,
                             'pred_y_occ': pred_y_occ,
                             'gt_y': y
                             })
    results.append(results_)

### Occlusion Eval

Accuracy vs. occluded accuracy for each method

In [17]:
#Collect evaluation results

In [18]:
# Flatten the nested results into one record per
# (example, method, class, threshold). "accuracy" scores the unoccluded
# prediction and "accuracy_occluded" the prediction made at that threshold,
# both as 0/1 against the ground truth.
eval_results = [
    {"method": entry["method"],
     "class": entry["class"],
     "threshold": threshold,
     "accuracy": int(entry["gt_y"] == entry["pred_y"]),
     "accuracy_occluded": int(entry["gt_y"] == occluded_pred)}
    for per_example in results
    for entry in per_example
    for threshold, occluded_pred in zip(occlude_thresholds, entry["pred_y_occ"])
]

In [19]:
# One row per evaluation record, with the columns method, class, threshold,
# accuracy and accuracy_occluded.
df = pd.DataFrame(eval_results)

In [20]:
# Mean accuracy per (method, class, threshold). Fidelity is defined as the
# unoccluded accuracy minus the occluded accuracy.
grps = (
    df.groupby(["method", "class", "threshold"])
      .mean()
      .reset_index()
      .assign(fidelity=lambda tbl: tbl["accuracy"] - tbl["accuracy_occluded"])
)

In [21]:
# Save the aggregated fidelity table as "<dataset>_cls_occlusion_results.csv"
# inside out_dir.
out_fn = "{0}_cls_occlusion_results.csv".format(dataset.lower())
out_fp = os.path.join(out_dir, out_fn)

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (e.g. on a fresh checkout).
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

grps.to_csv(out_fp, index=False)

### Inspect results

In [1]:
import os; import pandas as pd


In [2]:
# Same relative locations as in the first part of the notebook, redefined
# because the cell numbering restarts at In [1] just above.
# NOTE(review): data_dir is not read by any cell visible below.
data_dir = "../data/"
results_dir = "../results"
# Folder the occlusion results CSV is read back from.
out_dir = os.path.join(results_dir, "occlusions")

In [9]:
# Pick the dataset whose saved occlusion results should be inspected
# (leave exactly one assignment uncommented).
#dataset = "BBBP"
#dataset = "BACE"
dataset = "TOX21"
# Same file-name pattern the save cell writes: "<dataset>_cls_occlusion_results.csv".
in_fn = "{0}_cls_occlusion_results.csv".format(dataset.lower())
in_fp = os.path.join(out_dir, in_fn)
df = pd.read_csv(in_fp)

In [10]:
# Occlusion threshold at which fidelity is reported; must equal one of the
# values in the CSV's "threshold" column because rows are selected with ==.
thres = 0.01  # final

In [11]:
# Per-method fidelity at the chosen threshold: keep the rows for `thres`,
# average over the remaining columns within each method, round to 2 decimals.
dff = (
    df.loc[df["threshold"] == thres]
      .groupby("method")
      .mean()
      .reset_index()
)
dff["fidelity"] = dff["fidelity"].round(2)

print(dff[["method", "fidelity"]])

        method  fidelity
0  CAM-GradCAM      0.11
1           EB      0.19
2  GradCAM-avg      0.17
3     Gradient      0.53
4          cEB      0.12
