### Exploring NLP Models with Skater/LIME

In this example, we'll train a couple types of models, and use Skater, LIME, and ipywidgets to interactively explore model behavior.

### Install Deps

In [2]:
! pip install --upgrade np_utils
! pip install --upgrade theano
! pip install --upgrade tensorflow
! pip install keras==2.0.6
! pip install spacy



Collecting np_utils
  Downloading np_utils-0.5.3.4.tar.gz (56kB)
[K    100% |████████████████████████████████| 61kB 329kB/s ta 0:00:01
[?25hCollecting numpy>=1.0 (from np_utils)
  Downloading numpy-1.13.1-cp36-cp36m-manylinux1_x86_64.whl (17.0MB)
[K    100% |████████████████████████████████| 17.0MB 43kB/s eta 0:00:01  2% |█                               | 491kB 2.0MB/s eta 0:00:09    5% |█▋                              | 870kB 2.2MB/s eta 0:00:08    28% |█████████                       | 4.8MB 2.3MB/s eta 0:00:06    45% |██████████████▌                 | 7.7MB 1.2MB/s eta 0:00:08    51% |████████████████▋               | 8.8MB 746kB/s eta 0:00:11    58% |██████████████████▉             | 10.0MB 1.4MB/s eta 0:00:05    60% |███████████████████▎            | 10.2MB 1.1MB/s eta 0:00:07    75% |████████████████████████▏       | 12.8MB 1.7MB/s eta 0:00:03    93% |█████████████████████████████▉  | 15.8MB 1.9MB/s eta 0:00:01
[?25hCollecting future>=0.16 (from np_utils)
  Downloading future

In [15]:
### Restart kernel
from __future__ import absolute_import

! python -m spacy download en

### Load SpaCy Language Model and Dataset

In [4]:
import spacy
import warnings
from sklearn.datasets import fetch_20newsgroups
import numpy as np
warnings.filterwarnings('ignore')
nlp = spacy.load('en')

from sklearn.model_selection import train_test_split
#gimme data
dataset = fetch_20newsgroups()
docs = dataset.data
y = dataset.target

docs_train, docs_test, y_train, y_test = train_test_split(docs, y, test_size = .3)

### Model 1: Pretrained Word Embeddings

We will use SpaCy's pretrained word embeddings as document representations, and feed these representations into a gradient boosting classifier.

In [7]:
#gimme vectors
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from spacy.tokens.doc import Doc
from sklearn.metrics import classification_report
import six
def doc2vec(x):
    if isinstance(x, (six.binary_type, six.string_types)):
        return nlp(x, parse = False, entity = False, tag = False).vector
    
    elif type(x) in [list, tuple, np.ndarray]:
        return np.array([doc2vec(six.text_type(doc)) for doc in x])
    
    else:
        raise ValueError("Unrecognized Input") 

# build a pipeline of text -> vector (transformer), vector -> predictions (model)
model = GradientBoostingClassifier(n_estimators = 50)
model = LogisticRegression()

transformer = FunctionTransformer(func = doc2vec, validate=False)
pipeline = make_pipeline(transformer, model)
pipeline.fit(docs_train, y_train)       

#Classification Report on Holdout
print(
    classification_report(y_test, 
                          pipeline.predict(docs_test), 
                          target_names=dataset.target_names)
)

                          precision    recall  f1-score   support

             alt.atheism       0.59      0.46      0.52       151
           comp.graphics       0.62      0.52      0.56       196
 comp.os.ms-windows.misc       0.56      0.63      0.59       161
comp.sys.ibm.pc.hardware       0.48      0.48      0.48       167
   comp.sys.mac.hardware       0.62      0.38      0.47       176
          comp.windows.x       0.58      0.71      0.64       180
            misc.forsale       0.65      0.78      0.71       174
               rec.autos       0.68      0.60      0.64       178
         rec.motorcycles       0.60      0.72      0.65       176
      rec.sport.baseball       0.63      0.66      0.65       186
        rec.sport.hockey       0.74      0.79      0.76       184
               sci.crypt       0.70      0.82      0.76       171
         sci.electronics       0.62      0.58      0.60       175
                 sci.med       0.75      0.86      0.81       177
         

### Model 2: CNN 
In this model, we convert text to a list of padded lists of word IDs, to be used in an embedding lookup table. The embeddings will be trained as part of a CNN implemented with Keras.

In [8]:

from sklearn.datasets import fetch_20newsgroups
from spacy.tokens import Doc
import spacy
from spacy.matcher import Matcher
from spacy.attrs import ORTH, IS_PUNCT
from collections import Counter
from functools import partial

class TextProcesser(object):
    def __init__(self, corpus, nlp=None, max_len=200, max_vocab_size=20000):
        """
        corpus: list of strings
            Documents used to initialize vocabulary.
            
        nlp: Spacy language model
            If none then will build one in __init__
            
        max_len: int
            Maximum length of a document sequence. Balance information with scale of data.
            
        max_vocab_size: int
        
        """
        self.max_vocab_size = max_vocab_size
        self.max_len = max_len
        self.nlp = nlp or spacy.load('en')
        self.PADDING_VAL = 1
        self.MISSING_VAL = 2
        self.START_VAL = 3
        self.END_VAL = 4
        self.vocab = {}
        self.vocab_counts = Counter()#Counter(['PADDING_VAL','MISSING_VAL','START_VAL','END_VAL'])
        self.build_vocab(corpus)

        
    def pad(self, obj):
        n_pads = max(self.max_len - len(obj) - 2, 0)
        we_can_take = self.max_len - 2
        result = [self.START_VAL] + obj[:we_can_take] + [self.END_VAL] + [self.PADDING_VAL] * n_pads
        return result
        
    def get_current_vocab_size(self):
        return len(self.vocab)
        
    def update(self, words):
        for word in words:
            self.vocab_counts.update([word])

    def build_vocab(self, corpus):
        self.vocab = {}
        self.vocab_counts = Counter()
        
        for doc in nlp.tokenizer.pipe(map(six.text_type, corpus)):
            self.update(map(self._process_token, doc))
            
        for i, (word, count) in enumerate(self.vocab_counts.most_common(self.max_vocab_size)):
            self.vocab[word] = i
        
    def _process_token(self, token):
        if token.is_space:
            return "SPACE"
        elif token.is_punct:
            return "PUNCT"       
        elif token.like_url:
            return "URL"
        elif token.like_email:
            return "EMAIL"
        elif token.like_num:
            return "NUM"
        else:
            return token.lower_

    def process_token(self, token):
        return self.vocab.get(self._process_token(token), self.MISSING_VAL)

    def process(self, texts):
        docs = []
        for doc in self.nlp.tokenizer.pipe(list(texts)):
            docs.append(self.pad(list(map(self.process_token, doc))))
        return np.array(docs)
            
    def __call__(self, texts):
        return self.process(texts)
    

In [9]:
#convolutional model: https://arxiv.org/abs/1408.5882
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout,  Input, Dense, Activation, Flatten
from keras.models import Sequential, Model, Sequential
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import TimeDistributed
from keras.layers.merge import Concatenate
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import label_binarize


def model_factory(seq_len, 
                  vocab_size, 
                  embedding_size, 
                  n_classes, 
                  model_type='sequential',
                  loss='categorical_crossentropy', 
                  metrics=['acc'], 
                  optimizer='rmsprop'):
    
    def create_sequential_model():
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_size, input_length=seq_len))
        model.add(Conv1D(64, 3, strides=1, padding='valid'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(GlobalMaxPooling1D())
        model.add(Activation('relu'))
        model.add(Dense(n_classes,  activation='softmax'))
        return model
        
    def create_non_sequential_model():
        _input = Input(shape=(seq_len,), dtype='int32')
        _embedding = Embedding(vocab_size, embedding_size, input_length=seq_len)(_input)

        # each filter is (3 x 300 ) array of weights
        # window (kernel_size) is 3
        # so number of weights is (3 * 300 * 64)
        # each filter outputs a (200 / strides) x 1 transformation
        # padding is how we handle boundaries. include + pad, ignore, etc
        _conv_1 = Conv1D(64, 3, strides=1, padding='valid')(_embedding)

        # Cuts the size of the output in half, maxing over every 2 inputs
        _pool_1 = MaxPooling1D(pool_size=2)(_conv_1)
        _conv_2 = Conv1D(64, 3, padding='valid')(_pool_1)
        _pool_2 = GlobalMaxPooling1D()(_conv_2) 
        _activation = Activation('relu')(_pool_2)
        output = Dense(n_classes,  activation='softmax')(_activation)
        model = Model(inputs=_input, outputs=output)
        return model
        

    def create_model():
        if model_type=='sequential':
            model = create_sequential_model()
        elif model_type == 'non-sequential':
            model = create_non_sequential_model()        
        else:
            raise ValueError("Unrecognized model type {}".format(model_type))

        model.compile(loss=loss,
                     optimizer=optimizer,
                     metrics=metrics)
        return model

    return create_model
    
seq_len = 350
vocab_size = 25000
embedding_size = 300
epochs = 8
batch_size = 100
n_classes = len(np.unique(y))

model_build = model_factory(seq_len, vocab_size, embedding_size, n_classes)
model2 = KerasClassifier(build_fn=model_build, epochs=epochs, batch_size=batch_size, verbose=1)
processor = FunctionTransformer(TextProcesser(docs_train, nlp=nlp, max_len=seq_len), validate=False)
pipeline2 = make_pipeline(processor, model2)

# need to one hot encode y labels
y2_train = label_binarize(y_train, classes=range(len(np.unique(dataset.target_names))))
pipeline2.fit(docs_train, y2_train)

# make model silent after training
params = model2.get_params()
params = {key: value for key, value in params.items() if key != 'build_fn'}
params['verbose'] = 0
model2.set_params(**params)

# Model Performance on Holdout
print(
    classification_report(y_test, 
                          pipeline2.predict(docs_test), 
                          target_names=dataset.target_names)
)

Using TensorFlow backend.


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
                          precision    recall  f1-score   support

             alt.atheism       0.90      0.87      0.88       151
           comp.graphics       0.84      0.80      0.82       196
 comp.os.ms-windows.misc       0.84      0.81      0.83       161
comp.sys.ibm.pc.hardware       0.62      0.80      0.70       167
   comp.sys.mac.hardware       0.85      0.84      0.84       176
          comp.windows.x       0.88      0.83      0.85       180
            misc.forsale       0.75      0.74      0.74       174
               rec.autos       0.82      0.92      0.87       178
         rec.motorcycles       0.93      0.88      0.90       176
      rec.sport.baseball       0.92      0.91      0.91       186
        rec.sport.hockey       0.96      0.94      0.95       184
               sci.crypt       0.96      0.93      0.95       171
         sci.electronics       0.80      0.79      0.79      

### Model explanations
Here, we'll wrap each pipeline into a Skater model object. We'll use this model object to generate LIME explanations in HTML to help better understand how each model makes predictions. We'll wrap this functionality into an ipywidget to allow the user the (a) modify the text and (b) toggle between models.

In [18]:
## You may need to enable ipywidgets
! pip install ipywidgets
!jupyter nbextension enable --py --user widgetsnbextension

In [13]:
#Create the explorer app.
from warnings import filterwarnings
filterwarnings('ignore')
from ipywidgets import Button, Textarea, Layout, Box, Label, Text, Output, RadioButtons, HBox
from IPython.display import display, HTML, clear_output
from skater.core.local_interpretation.lime.lime_text import LimeTextExplainer

class TextExplainer(object):
    def __init__(self, models, init_pattern=""):
        """
        Display box for LIME results.
        
        models: dictionary of skater of models.
            Keys correspond to user-defined model names, used for radio buttons.
            Values are skater models used to generate predictions.
    
        """
        self.status = "Ready"
        self.explainer = LimeTextExplainer(class_names=dataset.target_names)
        self.models = models
        self.model_names = list(self.models.keys())
        self.text_field = Textarea(init_pattern, layout=Layout(height='200px', width='500px'))
        self.text_box = Box([Label(value='Text Box'), self.text_field])
        
        self.status_field = Label(self.status, layout=Layout(height='50px', width='100px'))        
        self.status_box = Box([Label(value='Status'), self.status_field])

        self.match_button = Button(description='Explain', )
        self.match_button.on_click(self.match_pattern)
        
        self.model_selectors = RadioButtons(
            options = self.model_names,
            description = "Use Model"
        )
        
        self.inputs_box = HBox([self.text_box, self.model_selectors])        
        
        self.explanation_area = Output()
        display(self.inputs_box)       
        display(self.match_button)
        display(self.status_box)
        display(self.explanation_area)
        
    @property
    def model(self):
        return self.models[self.model_selectors.value]
            
    @property
    def text(self):
        return self.text_field.value    
    
    def match_pattern(self, b):
        self.status_field.value = 'loading'
        with self.explanation_area:
            clear_output()
            display(HTML(self.get_explanation_as_html(self.text)))
        self.status_field.value = 'ready'

    def get_explanation_as_html(self, text):
        
        # generate most likely class to confine LIME results

        explanation = self.explainer.explain_instance(text, 
                                                      self.model, 
                                                      top_labels=1)

        return explanation.as_html()


In [14]:
models = {"CNN": pipeline2.predict_proba, ' GBC-Pretrain': pipeline.predict_proba}
r = TextExplainer(models, docs_test[3])

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget