In [1]:
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pickle
import time
from operator import itemgetter

In [2]:
import util

## Load Dataset

In [3]:
categories = ['alt.atheism', 'soc.religion.christian']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)

In [4]:
# converting text to vectors
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

## Black Box Model

In [5]:
filename = './models/newsgroup_model.sav'
model = pickle.load(open(filename, 'rb'))

## Experiment

In [6]:
def mask_shortest_path_target(sample, predict, explanation):
    masked = sample
    arg = np.argmax(predict([sample]))
    n_actions = 0        
    while arg == target:
        if len(explanation) <= 0:
            break
        maxword = max(explanation, key=itemgetter(1))[0]
        amax = explanation.index(max(explanation, key=itemgetter(1)))
        del explanation[amax]
        masked = masked.replace(maxword, "")
        arg = np.argmax(predict([masked]))
        n_actions += 1
    #print(predict([sample]))
    #print(predict([masked]))
    if arg == target:
        return -1, masked
    else:
        return n_actions, masked

In [7]:
def evaluate(data):
    start_t = time.time()
    explanation = explainer.explain_instance(data, model_pipe.predict_proba, num_features=15698)
    #print(explanation.as_list())
    
    end_t = time.time()
    t = end_t - start_t
    
    # number of actions
    n, masked = mask_shortest_path_target(data, model_pipe.predict_proba, explanation.as_list())
            
    # change in log odds
    c = util.change_in_log_odds(model_pipe.predict_proba, data, masked, target, end, lime=True)
    
    print("minus n_actions")
    print(n)
    print("log-odds")
    print(c)
    return n, c

In [8]:
class_names = ['atheism', 'christian']
model_pipe = make_pipeline(vectorizer, model)
explainer = LimeTextExplainer(class_names=class_names)

target = 1
end = 0
n_samples = 10000
sample_size = (1, 15698)
kernel_size = (1, 1)
    
ns = []
cs = []

i = 0
j = 0
while True:
    data = newsgroups_test.data[i]
    y = np.argmax(model_pipe.predict_proba([data]))
    i += 1
    if y != target:
        continue
    print("Sample: " + str(j))
    if j == 41:
        j += 1
        continue
    n, c = evaluate(data)
    ns.append(n)
    cs.append(c)
    j += 1
    
    if j >= 50:
        break

Sample: 0
minus n_actions
1
log-odds
2.3930324467616977
Sample: 1
minus n_actions
4
log-odds
1.8640554424456464
Sample: 2
minus n_actions
26
log-odds
5.597502608350386
Sample: 3
minus n_actions
7
log-odds
3.972692554095742
Sample: 4
minus n_actions
22
log-odds
11.336215381908605
Sample: 5
minus n_actions
21
log-odds
9.800354597839732
Sample: 6
minus n_actions
24
log-odds
8.731800942938325
Sample: 7
minus n_actions
1
log-odds
3.0864631867018533
Sample: 8
minus n_actions
36
log-odds
12.84940151759644
Sample: 9
minus n_actions
9
log-odds
6.1433073360145
Sample: 10
minus n_actions
66
log-odds
10.768016509549849
Sample: 11
minus n_actions
1
log-odds
1.2838480285003337
Sample: 12
minus n_actions
18
log-odds
7.937853057971191
Sample: 13
minus n_actions
1
log-odds
2.3162067686179775
Sample: 14
minus n_actions
5
log-odds
5.616981077198603
Sample: 15
minus n_actions
-1
log-odds
7.518565489636984
Sample: 16
minus n_actions
-1
log-odds
14.88316512169644
Sample: 17
minus n_actions
8
log-odds
8.1760

In [10]:
with open('./results/newsgroup/lime.pkl', 'wb') as f:
    pickle.dump(ns, f)
with open('./results/newsgroup/lime_change.pkl', 'wb') as f:
    pickle.dump(cs, f)