In [1]:
from lime import lime_text
from lime.lime_text import LimeTextExplainer

from sklearn.pipeline import make_pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pickle
import time
from operator import itemgetter

In [2]:
import util

## Load Dataset

In [3]:
categories = ['alt.atheism', 'soc.religion.christian']
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)

In [4]:
# converting text to vectors
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

## Black Box Model

In [5]:
filename = './models/newsgroup_model.sav'
model = pickle.load(open(filename, 'rb'))

## Experiment

In [6]:
def mask(sample, predict, explanation):
    masked = " " + sample + " "
    arg = np.argmax(predict([sample]))
    n_actions = 0        
    while arg == target:
        if len(explanation) <= 0:
            break
        maxword = max(explanation, key=itemgetter(1))[0]
        amax = explanation.index(max(explanation, key=itemgetter(1)))
        del explanation[amax]
        masked = masked.replace(maxword, " ")
        arg = np.argmax(predict([masked]))
        n_actions += 1
    if arg == target:
        return -1, masked
    else:
        return n_actions, masked

In [7]:
def evaluate(data):
    start_t = time.time()
    explanation = explainer.explain_instance(data, model_pipe.predict_proba, num_features=15698)
    #print(explanation.as_list())
    
    end_t = time.time()
    t = end_t - start_t
    
    # number of actions
    n, masked = mask(data, model_pipe.predict_proba, explanation.as_list())
            
    # change in log odds
    c = util.change_in_log_odds(model_pipe.predict_proba, data, masked, target, end, lime=True)
    
    print("minus n_actions")
    print(n)
    print("log-odds")
    print(c)
    return n, c

In [8]:
class_names = ['atheism', 'christian']
model_pipe = make_pipeline(vectorizer, model)
explainer = LimeTextExplainer(class_names=class_names)

target = 0
end = 1
n_samples = 10000
sample_size = (1, 15698)
kernel_size = (1, 1)
    
ns = []
cs = []

i = 0
j = 0
while True:
    data = newsgroups_test.data[i]
    t = newsgroups_test.target[i]
    y = model_pipe.predict([data]) 
    i += 1
    if i >= 500:
        break
    if t != target or y != target:
        continue
    print("Sample: " + str(j))
    if j == 41:
        j += 1
        continue
    n, c = evaluate(data)
    ns.append(n)
    cs.append(c)
    j += 1
    
    if j >= 50:
        break

Sample: 0
minus n_actions
50
log-odds
24.5778544760645
Sample: 1
minus n_actions
-1
log-odds
1.8592749331443121
Sample: 2
minus n_actions
8
log-odds
1.1779641136593961
Sample: 3
minus n_actions
45
log-odds
2.3668459938267166
Sample: 4
minus n_actions
48
log-odds
7.361077302579538
Sample: 5
minus n_actions
21
log-odds
4.2981156562909195
Sample: 6
minus n_actions
54
log-odds
20.514242496287075
Sample: 7
minus n_actions
89
log-odds
4.483933818792996
Sample: 8
minus n_actions
75
log-odds
2.7154911848735326
Sample: 9
minus n_actions
82
log-odds
13.676409659429787
Sample: 10
minus n_actions
42
log-odds
6.218179815792096
Sample: 11
minus n_actions
51
log-odds
2.3642103920055972
Sample: 12
minus n_actions
30
log-odds
4.061137510863703
Sample: 13
minus n_actions
69
log-odds
7.551178581744226
Sample: 14
minus n_actions
43
log-odds
13.438305122682811
Sample: 15
minus n_actions
88
log-odds
12.342575919495813
Sample: 16
minus n_actions
74
log-odds
3.5668419363916612
Sample: 17
minus n_actions
51
lo

In [9]:
with open('./results/newsgroup/lime.pkl', 'wb') as f:
    pickle.dump(ns, f)
with open('./results/newsgroup/lime_change.pkl', 'wb') as f:
    pickle.dump(cs, f)