# Text Explainer in pure Numpy and Scikit-learn

## Steps:

* Sample data and generate synthetic neighborhood
    * For text, this involves removing words from the document
    * Use BoW to create a binary vector
* Get model predictions
    * This involves returning binary data back to data domain
    * Split data into tokens defined by CountVectorizer analyzer
    * Index based on which non-zero elements of binary vector (for each synthetic example)
    * Concatenate to form "raw text" (default: whitespace)
* Solve
* Explain (i.e. get the most important features)

## Notes on how LIME does it

* `lime.lime_text.__data_labels_distances`
* Sample N random integers, each integer representing how many words to remove

## Load 20newsgroups and train a model

In [47]:
# Some auxiliary imports for the tutorial
import sys
import random
import numpy as np
from pprint import pprint
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.linear_model import Ridge

from lime.lime_text import LimeTextExplainer

# Set seed for reproducibility
np.random.seed(123456)

In [48]:
# Train on a subset of categories

categories = [
    'rec.sport.baseball',
    'soc.religion.christian',
    'sci.med'
]

raw_train = datasets.fetch_20newsgroups(subset='train', categories=categories)
print(list(raw_train.keys()))
print(raw_train.target_names)
print(raw_train.target[:10])
raw_test = datasets.fetch_20newsgroups(subset='test', categories=categories)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(raw_train.data)
y_train = raw_train.target

print(X_train.shape)

X_test = vectorizer.transform(raw_test.data)
y_test = raw_test.target

print(X_test.shape)

clf = MultinomialNB(alpha=0.1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

['data', 'filenames', 'target_names', 'target', 'DESCR']
['rec.sport.baseball', 'sci.med', 'soc.religion.christian']
[1 0 2 2 0 2 0 0 0 1]
(1790, 29009)
(1191, 29009)


0.9689336691855583

In [57]:
X_train

<1790x29009 sparse matrix of type '<class 'numpy.float64'>'
	with 291648 stored elements in Compressed Sparse Row format>

In [3]:
raw_test.data[0]

"From: luriem@alleg.edu(Michael Lurie) The Liberalizer\nSubject: Re: RE:Re:ALL-TIME BEST PLAYERS\nOrganization: Allegheny College\nLines: 20\n\nIn article <1993Apr21.120525.1@tesla.njit.edu> drm6640@tesla.njit.edu  \nwrites:\n> Overall (career)\n> 1.\tDon Mattingly\n> 2.\tDon Mattingly\n> 3.\tDon Mattingly\n> 4.\tDon Mattingly\n> 5.\tDon Mattingly\n> 6.\tDon Mattingly\n> 7.\tDon Mattingly\n> 8.\tDon Mattingly\n> 9.\tDon Mattingly\n> 10.\tDon Mattingly\n> 11.\tDon Mattingly\n> ..\n\n\nWanna go to a game sometime?\nJesus christ boy, have you not heard of the real all-time best....STEVE  \nBALBONI...Now that's Yankee pride.\n"

In [4]:
def predict_fn(instance):
    vec = vectorizer.transform(instance)
    return clf.predict_proba(vec)


lime_text_explainer = LimeTextExplainer(
    class_names=categories
)

exp = lime_text_explainer.explain_instance(
    text_instance=raw_test.data[0],
    classifier_fn=predict_fn,
    labels=(0, 1, 2)
)

In [5]:
list(sorted(exp.as_list(0), key=lambda x: x[1], reverse=True))

[('Mattingly', 0.1630204569197587),
 ('Yankee', 0.047128435532711455),
 ('Lurie', 0.04592710278967293),
 ('PLAYERS', 0.045541508852427214),
 ('Allegheny', 0.04400147104174964),
 ('luriem', 0.04385267215867704),
 ('Liberalizer', 0.04244576588487228),
 ('Don', -0.030393475108189762),
 ('tesla', -0.04552783302602691),
 ('njit', -0.05400846560032084)]

In [6]:
count_vectorizer = CountVectorizer()
count_vectorizer.fit(raw_test.data[:1])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [7]:
vec = count_vectorizer.transform(raw_test.data[:1]).toarray()
vec / vec

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1.]])

In [8]:
count_vectorizer.inverse_transform(vec)

[array(['10', '11', '120525', '1993apr21', '20', 'all', 'alleg',
        'allegheny', 'article', 'balboni', 'best', 'boy', 'career',
        'christ', 'college', 'don', 'drm6640', 'edu', 'from', 'game', 'go',
        'have', 'heard', 'in', 'jesus', 'liberalizer', 'lines', 'lurie',
        'luriem', 'mattingly', 'michael', 'njit', 'not', 'now', 'of',
        'organization', 'overall', 'players', 'pride', 're', 'real',
        'sometime', 'steve', 'subject', 'tesla', 'that', 'the', 'time',
        'to', 'wanna', 'writes', 'yankee', 'you'], dtype='<U12')]

In [9]:
print(dir(count_vectorizer))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_char_ngrams', '_char_wb_ngrams', '_check_stop_words_consistency', '_check_vocabulary', '_count_vocab', '_get_param_names', '_get_tags', '_limit_features', '_more_tags', '_sort_features', '_stop_words_id', '_validate_custom_analyzer', '_validate_params', '_validate_vocabulary', '_warn_for_unused_params', '_white_spaces', '_word_ngrams', 'analyzer', 'binary', 'build_analyzer', 'build_preprocessor', 'build_tokenizer', 'decode', 'decode_error', 'dtype', 'encoding', 'fit', 'fit_transform', 'fixed_vocabulary_', 'get_feature_names', 'get_params', 'get_stop_words', 'input', 'inverse_transform', 'lowercase', 'max_df', 'max

In [10]:
count_vectorizer.analyzer

'word'

In [11]:
analyzer = count_vectorizer.build_analyzer()
analyzer

functools.partial(<function _analyze at 0x7fc961b9d320>, ngrams=<bound method _VectorizerMixin._word_ngrams of CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)>, tokenizer=<built-in method findall of re.Pattern object at 0x7fc95f1719f0>, preprocessor=functools.partial(<function _preprocess at 0x7fc961bd1d40>, accent_function=None, lower=True), decoder=<bound method _VectorizerMixin.decode of CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                

In [12]:
print(analyzer(raw_test.data[0]))

['from', 'luriem', 'alleg', 'edu', 'michael', 'lurie', 'the', 'liberalizer', 'subject', 're', 're', 're', 'all', 'time', 'best', 'players', 'organization', 'allegheny', 'college', 'lines', '20', 'in', 'article', '1993apr21', '120525', 'tesla', 'njit', 'edu', 'drm6640', 'tesla', 'njit', 'edu', 'writes', 'overall', 'career', 'don', 'mattingly', 'don', 'mattingly', 'don', 'mattingly', 'don', 'mattingly', 'don', 'mattingly', 'don', 'mattingly', 'don', 'mattingly', 'don', 'mattingly', 'don', 'mattingly', '10', 'don', 'mattingly', '11', 'don', 'mattingly', 'wanna', 'go', 'to', 'game', 'sometime', 'jesus', 'christ', 'boy', 'have', 'you', 'not', 'heard', 'of', 'the', 'real', 'all', 'time', 'best', 'steve', 'balboni', 'now', 'that', 'yankee', 'pride']


In [13]:
print(count_vectorizer.get_feature_names())

['10', '11', '120525', '1993apr21', '20', 'all', 'alleg', 'allegheny', 'article', 'balboni', 'best', 'boy', 'career', 'christ', 'college', 'don', 'drm6640', 'edu', 'from', 'game', 'go', 'have', 'heard', 'in', 'jesus', 'liberalizer', 'lines', 'lurie', 'luriem', 'mattingly', 'michael', 'njit', 'not', 'now', 'of', 'organization', 'overall', 'players', 'pride', 're', 'real', 'sometime', 'steve', 'subject', 'tesla', 'that', 'the', 'time', 'to', 'wanna', 'writes', 'yankee', 'you']


### For a particular document, we want to map the token to its index in the original text

In [14]:
from collections import defaultdict

analyzed_text = count_vectorizer.build_analyzer()(raw_test.data[0])
dict_token_idxes = defaultdict(list)
dict_idx_to_token = dict()
for (idx, key) in enumerate(analyzed_text):
    dict_token_idxes[key].append(idx)
    dict_idx_to_token[idx] = key
    
dict_token_idxes

defaultdict(list,
            {'from': [0],
             'luriem': [1],
             'alleg': [2],
             'edu': [3, 27, 31],
             'michael': [4],
             'lurie': [5],
             'the': [6, 72],
             'liberalizer': [7],
             'subject': [8],
             're': [9, 10, 11],
             'all': [12, 74],
             'time': [13, 75],
             'best': [14, 76],
             'players': [15],
             'organization': [16],
             'allegheny': [17],
             'college': [18],
             'lines': [19],
             '20': [20],
             'in': [21],
             'article': [22],
             '1993apr21': [23],
             '120525': [24],
             'tesla': [25, 29],
             'njit': [26, 30],
             'drm6640': [28],
             'writes': [32],
             'overall': [33],
             'career': [34],
             'don': [35, 37, 39, 41, 43, 45, 47, 49, 51, 54, 57],
             'mattingly': [36, 38, 40, 42, 44, 46, 48,

### Now sampling: we randomly draw out 1's from the document

In [15]:
vec = count_vectorizer.transform(raw_test.data[:1]).toarray()
vec = vec / vec
num_samples = 1000
dim = vec.shape[1]

mask = np.ones((num_samples, dim))
mask.shape

(1000, 53)

In [16]:
num_words_mask = np.random.randint(1, dim, size=num_samples)
num_words_mask[0] = dim
num_words_mask

array([53, 33, 10, 14, 39, 34, 38, 22,  4, 17, 48, 35, 13, 45, 36, 45, 12,
       35, 28, 15, 39, 33, 12, 10, 10,  5,  1, 15, 49, 11, 46, 14,  3, 27,
        9, 52, 22, 26,  2, 18, 29, 37, 50, 15, 42, 47, 17, 20, 44, 39, 23,
        3, 50, 36,  9, 32, 22, 17, 26,  3, 40,  1, 32, 48, 21, 45, 27, 20,
       39, 48, 20,  4,  2,  9,  8,  6, 20, 49, 38, 39,  6, 15, 23,  5, 23,
        1, 29, 12, 32, 44,  9, 36, 11, 25, 14, 40, 38, 42,  6, 33,  1, 11,
       21, 30, 24, 25, 38, 33, 32,  3, 39, 23, 24,  1, 12, 43, 25, 18, 17,
       37,  7,  6, 31, 49, 21, 20, 49, 38, 44, 42, 10, 24, 47, 41,  8, 20,
       24, 13, 15, 12, 30, 31, 28, 31, 16, 19,  7, 45, 33,  9, 28,  5, 10,
        6,  7, 41, 23, 40, 43, 10, 39, 19, 35, 42, 35, 41, 43, 45,  1, 43,
       19, 20, 30, 22, 14, 38, 24, 12, 36, 40, 36, 15,  4,  5, 37, 20, 11,
       22, 29, 28,  2, 22, 40,  4,  5, 30, 45,  9, 28,  9, 22, 23,  2, 44,
       50, 24, 48, 11, 12, 10, 10, 33,  6, 29, 14, 40, 30, 37, 13,  2, 23,
       47, 48,  9, 45, 46

In [17]:
def create_mask(x):
    return np.random.binomial(1, x/dim, size=dim)

In [18]:
mask = np.array(list(map(create_mask, num_words_mask)))
mask

array([[1, 1, 1, ..., 1, 1, 1],
       [0, 1, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [1, 0, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0]])

In [19]:
mask.shape

(1000, 53)

In [20]:
mask.sum(axis=1)

array([53, 30, 14, 16, 42, 33, 39, 20,  2, 18, 49, 33,  9, 45, 43, 48, 11,
       35, 24, 14, 39, 32,  8,  6, 10,  7,  0, 14, 49, 10, 44, 13,  2, 18,
        8, 53, 20, 28,  2, 22, 30, 36, 51, 14, 37, 46, 14, 19, 46, 38, 26,
        7, 50, 41, 11, 29, 19, 16, 29,  7, 38,  3, 32, 47, 24, 45, 23, 21,
       42, 51, 19,  3,  7, 12, 11,  8, 18, 48, 40, 38,  5, 15, 23,  9, 20,
        2, 29, 17, 34, 46,  8, 36, 11, 15, 10, 43, 37, 44,  7, 29,  1, 18,
       22, 31, 29, 22, 37, 33, 39,  4, 36, 30, 22,  0,  9, 44, 23, 21, 20,
       34, 11,  6, 32, 49, 18, 26, 47, 40, 41, 46, 14, 22, 50, 37,  7, 20,
       32, 12,  9, 10, 33, 29, 24, 25, 18, 18,  6, 48, 32, 11, 26,  6, 10,
        7, 12, 43, 22, 43, 44,  9, 40, 25, 34, 48, 36, 43, 45, 43,  1, 46,
       17, 20, 31, 22, 12, 41, 23, 12, 32, 41, 31, 17,  3,  5, 36, 17, 21,
       18, 24, 29,  0, 26, 35,  2,  5, 21, 47, 12, 29,  6, 22, 29,  0, 48,
       49, 22, 49, 13, 14,  8, 12, 34,  8, 33, 12, 38, 33, 36, 12,  3, 25,
       49, 50,  9, 47, 42

### Now create the synthetic raw text (original document) based on mask

In [21]:
word_arr = np.array(count_vectorizer.get_feature_names())
word_arr

array(['10', '11', '120525', '1993apr21', '20', 'all', 'alleg',
       'allegheny', 'article', 'balboni', 'best', 'boy', 'career',
       'christ', 'college', 'don', 'drm6640', 'edu', 'from', 'game', 'go',
       'have', 'heard', 'in', 'jesus', 'liberalizer', 'lines', 'lurie',
       'luriem', 'mattingly', 'michael', 'njit', 'not', 'now', 'of',
       'organization', 'overall', 'players', 'pride', 're', 'real',
       'sometime', 'steve', 'subject', 'tesla', 'that', 'the', 'time',
       'to', 'wanna', 'writes', 'yankee', 'you'], dtype='<U12')

In [22]:
word_arr[mask.astype(bool)[0]]

array(['10', '11', '120525', '1993apr21', '20', 'all', 'alleg',
       'allegheny', 'article', 'balboni', 'best', 'boy', 'career',
       'christ', 'college', 'don', 'drm6640', 'edu', 'from', 'game', 'go',
       'have', 'heard', 'in', 'jesus', 'liberalizer', 'lines', 'lurie',
       'luriem', 'mattingly', 'michael', 'njit', 'not', 'now', 'of',
       'organization', 'overall', 'players', 'pride', 're', 'real',
       'sometime', 'steve', 'subject', 'tesla', 'that', 'the', 'time',
       'to', 'wanna', 'writes', 'yankee', 'you'], dtype='<U12')

In [23]:
mask_bool = mask.astype(bool)
mask_bool

array([[ True,  True,  True, ...,  True,  True,  True],
       [False,  True,  True, ..., False, False,  True],
       [ True, False, False, ..., False, False,  True],
       ...,
       [ True, False,  True, ...,  True,  True,  True],
       [ True,  True,  True, ..., False, False,  True],
       [False, False, False, ..., False,  True, False]])

In [24]:
list_words = [word_arr[mask_bool_row] for mask_bool_row in mask_bool]
list_words[:2]

[array(['10', '11', '120525', '1993apr21', '20', 'all', 'alleg',
        'allegheny', 'article', 'balboni', 'best', 'boy', 'career',
        'christ', 'college', 'don', 'drm6640', 'edu', 'from', 'game', 'go',
        'have', 'heard', 'in', 'jesus', 'liberalizer', 'lines', 'lurie',
        'luriem', 'mattingly', 'michael', 'njit', 'not', 'now', 'of',
        'organization', 'overall', 'players', 'pride', 're', 'real',
        'sometime', 'steve', 'subject', 'tesla', 'that', 'the', 'time',
        'to', 'wanna', 'writes', 'yankee', 'you'], dtype='<U12'),
 array(['11', '120525', '1993apr21', 'all', 'article', 'balboni', 'boy',
        'don', 'game', 'heard', 'in', 'liberalizer', 'lurie', 'mattingly',
        'michael', 'njit', 'not', 'now', 'of', 'organization', 'overall',
        'players', 'real', 'steve', 'subject', 'tesla', 'that', 'time',
        'to', 'you'], dtype='<U12')]

In [25]:
analyzed_text = np.array(analyzed_text)
print(analyzed_text)

['from' 'luriem' 'alleg' 'edu' 'michael' 'lurie' 'the' 'liberalizer'
 'subject' 're' 're' 're' 'all' 'time' 'best' 'players' 'organization'
 'allegheny' 'college' 'lines' '20' 'in' 'article' '1993apr21' '120525'
 'tesla' 'njit' 'edu' 'drm6640' 'tesla' 'njit' 'edu' 'writes' 'overall'
 'career' 'don' 'mattingly' 'don' 'mattingly' 'don' 'mattingly' 'don'
 'mattingly' 'don' 'mattingly' 'don' 'mattingly' 'don' 'mattingly' 'don'
 'mattingly' 'don' 'mattingly' '10' 'don' 'mattingly' '11' 'don'
 'mattingly' 'wanna' 'go' 'to' 'game' 'sometime' 'jesus' 'christ' 'boy'
 'have' 'you' 'not' 'heard' 'of' 'the' 'real' 'all' 'time' 'best' 'steve'
 'balboni' 'now' 'that' 'yankee' 'pride']


In [26]:
# Turn list_word into a list of indices
list_raw_synthetic = []
for list_word in list_words:
    a = []
    for word in list_word:
        a.extend(dict_token_idxes[word])
    a = sorted(a)
    list_raw_synthetic.append(' '.join([dict_idx_to_token[idx] for idx in a]))

In [27]:
list_raw_synthetic[3]

'edu michael the 20 1993apr21 120525 njit edu njit edu 10 11 game jesus christ boy have heard of the'

# Get model predictions

In [28]:
model_pred = predict_fn(list_raw_synthetic)
model_pred.shape

(1000, 3)

# Get distances between original sample and synthetic ones

In [29]:
mask.shape

(1000, 53)

In [30]:
distances = pairwise_distances(mask[0].reshape((1, -1)), mask, metric='cosine').ravel()
distances.shape

(1000,)

In [31]:
def kernel_fn(distances, kernel_width):
    return np.sqrt(np.exp(-(distances ** 2) / kernel_width ** 2))

weights = kernel_fn(distances, kernel_width=0.75 * np.sqrt(dim))
weights.shape

(1000,)

# Solve

In [32]:
label = 0
num_features = 10
solver = Ridge(alpha=1, fit_intercept=True)
solver.fit(mask, model_pred[:, label], sample_weight=weights)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [33]:
importances = solver.coef_
explanations = sorted(list(zip(word_arr, importances)), 
                      key=lambda x: x[1], reverse=True)[:num_features]

In [34]:
explanations

[('mattingly', 0.20229552944903878),
 ('yankee', 0.07616704150331953),
 ('luriem', 0.07327664229296545),
 ('players', 0.07302627352955146),
 ('liberalizer', 0.07155918387960071),
 ('allegheny', 0.0707025581364165),
 ('lurie', 0.06929677019260583),
 ('game', 0.05867673442091009),
 ('alleg', 0.05447759700085077),
 ('career', 0.049264448378751505)]

# Put it all together

In [42]:
def explain_instance(text_instance, predict_fn, label, num_samples=5000, num_features=10):
    if type(text_instance) is str:
        text_instance = [text_instance]
    
    # Use Count Vectorizer to vectorize text as BoW
    count_vectorizer = CountVectorizer()
    bow_vec = count_vectorizer.fit_transform(text_instance).toarray()
    bow_vec = bow_vec / bow_vec
    word_dim = bow_vec.shape[1]
    
    # Build word analyzer
    analyzer = count_vectorizer.build_analyzer()
    
    # Map the token to its indices and each index to token
    analyzed_text = analyzer(text_instance[0])
    dict_token_idxes = defaultdict(list)
    for (idx, key) in enumerate(analyzed_text):
        dict_token_idxes[key].append(idx)
        
    # Now create the samples
    bin_samples = np.ones((num_samples, word_dim))
    
    # For each sample, choose how many features we will keep
    num_words_keep = np.random.randint(1, word_dim, size=num_samples)
    
    # First row is original sample, hence we keep all words
    num_words_keep[0] = word_dim
    
    # Sample binary data
    bin_samples = np.array(list(map(lambda x: np.random.binomial(1, x/word_dim, size=word_dim), 
                                    num_words_keep))).astype(bool)
    
    # Now create synthetic raw text
    features = np.array(count_vectorizer.get_feature_names())
    
    # boolean index the words
    list_words = [features[row] for row in bin_samples]
    
    # Get the synthetic raw text
    ### TODO: this is probably the most expensive part - can we optimize this?
    def unravel_text(list_word):
        a = []
        for word in list_word:
            a.extend(dict_token_idxes[word])
        a = sorted(a)
        return ' '.join(np.array(analyzed_text)[a])
    
    list_raw_synthetic = list(map(unravel_text, list_words))

    # Get model predictions
    model_pred = predict_fn(list_raw_synthetic)
    
    # Get distances between original sample and synthetic ones
    distances = pairwise_distances(bin_samples[0].reshape((1, -1)), bin_samples, metric='cosine').ravel()
    
    # Get weights
    def kernel_fn(distances, kernel_width):
        return np.sqrt(np.exp(-(distances ** 2) / kernel_width ** 2))

    weights = kernel_fn(distances, kernel_width=0.75 * np.sqrt(word_dim))
    
    # Solve
    solver = Ridge(alpha=1, fit_intercept=True)
    solver.fit(bin_samples, model_pred[:, label], sample_weight=weights)
    
    # Get explanation
    importances = solver.coef_
    explanations = sorted(list(zip(features, importances)), 
                          key=lambda x: x[1], reverse=True)[:num_features]
    
    return explanations

In [43]:
explain_instance(raw_test.data[0], predict_fn, label=0)

[('mattingly', 0.20780741991599813),
 ('luriem', 0.07959609932238476),
 ('lurie', 0.07517863517897007),
 ('liberalizer', 0.07399448731148721),
 ('players', 0.0710768744329371),
 ('yankee', 0.06994912722979445),
 ('allegheny', 0.06675727458508018),
 ('alleg', 0.05867803849984341),
 ('game', 0.05825432891236671),
 ('career', 0.05472549515534699)]

# Compare with LIME

In [44]:
lime_text_explainer = LimeTextExplainer(
    class_names=categories
)

exp = lime_text_explainer.explain_instance(
    text_instance=raw_test.data[0],
    classifier_fn=predict_fn,
    labels=(0,)
)

list(sorted(exp.as_list(0), key=lambda x: x[1], reverse=True))

[('Mattingly', 0.15814561673299102),
 ('Liberalizer', 0.0472819598152857),
 ('luriem', 0.04433302535343226),
 ('Yankee', 0.04127184243710446),
 ('Allegheny', 0.0399793866876642),
 ('Lurie', 0.03991892553345141),
 ('PLAYERS', 0.03664036266282317),
 ('game', 0.03593865911584771),
 ('tesla', -0.05375952068502894),
 ('njit', -0.061585384929271174)]

In [45]:
%timeit explain_instance(raw_test.data[0], predict_fn, label=0)

386 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [46]:
%timeit exp = lime_text_explainer.explain_instance(text_instance=raw_test.data[0], classifier_fn=predict_fn, labels=(0,))

630 ms ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
