In [1]:
import os
import pickle
import time
from operator import itemgetter

import numpy as np
from shap import KernelExplainer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [2]:
import util

## Load Dataset

In [3]:
# Only two newsgroups are used. The same `remove` option (headers, footers,
# quotes) is passed for both the train and the test subset.
categories = ['alt.atheism', 'soc.religion.christian']
fetch_options = dict(
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)

In [4]:
# Turn the raw posts into TF-IDF vectors: the vectorizer is fitted on the
# training posts only and then reused, unchanged, to transform the test posts.
train_texts = newsgroups_train.data
test_texts = newsgroups_test.data

vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(train_texts)
vectors_test = vectorizer.transform(test_texts)

## Black Box Model

In [5]:
# Load the pickled model that the rest of the notebook treats as a black box
# (only its `predict_proba` is used).
# NOTE(review): unpickling executes code stored in the file -- only load model
# files that come from a trusted source.
# NOTE(review): presumably the model was trained on features produced by an
# identically fitted TfidfVectorizer -- confirm against the training notebook.
filename = './models/newsgroup_model.sav'
with open(filename, 'rb') as model_file:
    # The context manager closes the handle even if unpickling fails.
    model = pickle.load(model_file)

## Experiment

In [6]:
def mask(sample, predict, explanation, target_class=None):
    """Zero out features in attribution order until the prediction changes.

    Features are visited from the highest to the lowest value in
    ``explanation`` (ties in ascending index order).  Features that are
    already (close to) zero in the sample are skipped and do not count as an
    action.  After every masked feature the model is re-queried; the search
    stops as soon as the predicted class is no longer the target class.

    Parameters
    ----------
    sample : object exposing ``.todense()``
        The instance to perturb.  Row 0 of its dense form is the feature
        vector that gets masked.
    predict : callable
        Maps a sample to class scores; ``np.argmax`` of its output is taken
        as the predicted class.  It is called once with ``sample`` and then
        with the dense masked array.
    explanation : array-like
        One attribution value per feature (e.g. shape ``(1, n_features)``).
        It is read only; the caller's array is never modified.
    target_class : int, optional
        Class the prediction has to move away from.  When omitted, the
        notebook-level global ``target`` is used, as in the original version.

    Returns
    -------
    (n_actions, masked) : tuple
        ``n_actions`` is the number of features that were zeroed before the
        prediction left the target class, or ``-1`` if it never did.  It is
        ``0`` when the initial prediction is already a different class.
        ``masked`` is the dense, perturbed copy of the sample.
    """
    if target_class is None:
        # Backward-compatible fallback to the notebook-level global.
        target_class = target

    masked = np.array(sample.todense())
    # Work on a flat float copy so the caller's attributions stay untouched.
    scores = np.asarray(explanation, dtype=float).ravel()

    predicted = np.argmax(predict(sample))
    n_actions = 0

    if predicted == target_class:
        # A single stable sort yields the same visiting order as repeatedly
        # taking argmax and overwriting the winner with a sentinel, without
        # the quadratic rescans and without a magic sentinel value that a
        # real attribution could collide with.
        for idx in np.argsort(-scores, kind='stable'):
            if np.isclose(masked[0][idx], 0.0):
                # Feature is absent from this sample; masking it is a no-op.
                continue
            masked[0][idx] = 0.0
            n_actions += 1
            predicted = np.argmax(predict(masked))
            if predicted != target_class:
                break

    if predicted == target_class:
        # Every candidate feature was tried and the prediction did not flip.
        return -1, masked
    return n_actions, masked

In [7]:
def evaluate(data):
    """Explain one sample with SHAP and score the explanation by masking.

    Relies on names defined elsewhere in the notebook: ``explainer``,
    ``model``, ``target``, ``end``, the ``mask`` function and the ``util``
    module.  All of them must exist before this function is called.

    Parameters
    ----------
    data :
        The sample to explain.  It is passed unchanged to
        ``explainer.shap_values``, ``mask`` and ``util.change_in_log_odds``.

    Returns
    -------
    (n, c) : tuple
        ``n`` is the action count returned by ``mask`` (``-1`` when the
        prediction could not be flipped) and ``c`` is the value returned by
        ``util.change_in_log_odds`` for the original versus the masked sample.
        Both values are also printed.
    """
    # NOTE(review): index 1 is hard-coded. It coincides with the global `end`
    # in this notebook, whereas mask() removes the highest-valued features
    # until the prediction leaves `target`. Confirm that the class-1
    # attributions are the intended ranking. This also assumes shap_values
    # returns a per-class sequence -- TODO confirm for the installed shap
    # version.
    shap_values = explainer.shap_values(data)[1]

    # number of actions
    n, masked = mask(data, model.predict_proba, shap_values)

    # change in log odds
    c = util.change_in_log_odds(model.predict_proba, data, masked, target, end)

    print("minus n_actions")
    print(n)
    print("log-odds")
    print(c)
    return n, c

In [8]:
# Class the masking tries to move away from, and the class used as the other
# side of the log-odds comparison.
target = 0
end = 1
# The first 100 training vectors serve as KernelExplainer background data.
explainer = KernelExplainer(model.predict_proba, vectors_train[:100])

ns = []
cs = []

# Per-sample results are written here; create the folder up front so a long
# explanation run cannot fail at the very first save.
results_dir = './results/newsgroup/shap'
os.makedirs(results_dir, exist_ok=True)

# NOTE(review): the original loop incremented `i` before testing `i >= 500`,
# so only test rows 0..498 were ever evaluated. That range is preserved here so
# the selected samples do not change -- confirm whether row 499 was meant to be
# included. The bound is also capped by the number of available test rows.
n_candidates = min(499, vectors_test.shape[0])

j = 0
for i in range(n_candidates):
    data = vectors_test[i]
    y = np.argmax(model.predict_proba(data))
    t = newsgroups_test.target[i]
    # Keep only samples whose true label and predicted label are both `target`.
    if t != target or y != target:
        continue
    print("Sample: " + str(j))
    if j == 41:
        # NOTE(review): sample 41 is skipped on purpose by the original code;
        # the reason is not recorded in this notebook -- TODO document it.
        j += 1
        continue
    n, c = evaluate(data)
    # `j` is an int and must be converted before building the file name.
    with open(results_dir + '/shap_' + str(j) + '.pkl', 'wb') as f:
        pickle.dump(n, f)
    with open(results_dir + '/shap_change_' + str(j) + '.pkl', 'wb') as f:
        pickle.dump(c, f)
    ns.append(n)
    cs.append(c)
    j += 1

    # Stop once 50 sample slots (including the skipped one) have been used.
    if j >= 50:
        break

Sample: 0


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




KeyboardInterrupt: 

In [None]:
# Persist the two result lists built by the experiment loop: the action counts
# (`ns`) and the log-odds changes (`cs`).
for out_path, results in (
    ('./results/newsgroup/shap.pkl', ns),
    ('./results/newsgroup/shap_change.pkl', cs),
):
    with open(out_path, 'wb') as f:
        pickle.dump(results, f)