https://shap.readthedocs.io/en/latest/example_notebooks/text_examples/question_answering/Explaining%20a%20Question%20Answering%20Transformers%20Model.html

In [10]:
import numpy as np
import torch
import transformers

import shap

# Load the question-answering pipeline. The checkpoint and revision are pinned
# explicitly (they are the ones transformers resolved by default for this task)
# so that re-running the notebook always explains the same model.
pmodel = transformers.pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    revision="564e9b5",
)
# Tokenized question/context pair; assigned once the example data is tokenized
# and read by f_start / f_end at call time.
tokenized_qs = None


# define two predictions, one that outputs the logits for the range start,
# and the other for the range end
def f(questions, tokenized_qs, start):
    """Run the QA model on masked token-id sequences and collect per-token logits.

    Args:
        questions: Iterable of token-id sequences, one per masked sample. Each
            sequence is expected to line up position-by-position with
            ``tokenized_qs["input_ids"]``.
        tokenized_qs: Tokenizer output for the original question/context pair.
            It is used as a read-only template and is never modified.
        start: If truthy, return the span-start logits; otherwise the span-end
            logits.

    Returns:
        A list holding one flat NumPy array of logits per entry in ``questions``.

    Raises:
        IndexError: If the template ids contain no separator token.
    """
    questions = list(questions)
    if not questions:
        return []

    template_ids = list(tokenized_qs["input_ids"])
    # Position of the first separator token; this code assumes that there is
    # only one sentence in data.
    sep_idx = np.argwhere(np.array(template_ids) == pmodel.tokenizer.sep_token_id)[0, 0]
    # Inputs other than input_ids do not depend on the sample, so build them once.
    static_inputs = {
        key: torch.tensor(tokenized_qs[key]).reshape(1, -1)
        for key in tokenized_qs
        if key != "input_ids"
    }

    outs = []
    with torch.no_grad():  # inference only; no autograd graph is needed
        for q in questions:
            # Work on a fresh copy so the caller's template is not overwritten.
            ids = list(template_ids)
            # Take every position from the masked sample except the separator,
            # which is kept from the template.
            ids[:sep_idx] = q[:sep_idx]
            ids[sep_idx + 1 :] = q[sep_idx + 1 :]
            out = pmodel.model(input_ids=torch.tensor(ids).reshape(1, -1), **static_inputs)
            logits = out.start_logits if start else out.end_logits
            outs.append(logits.reshape(-1).detach().numpy())
    return outs


def tokenize_data(data):
    """Tokenize "question[SEP]context" strings with the pipeline's tokenizer.

    Args:
        data: Iterable of strings, each of which splits into exactly two parts
            (question, context) on the literal "[SEP]" marker.

    Returns:
        The tokenizer output for the last entry of ``data``; this code assumes
        that there is only one sentence in data.

    Raises:
        ValueError: If ``data`` is empty, or if an entry does not split into
            exactly two parts on "[SEP]".
    """
    tokenized_data = None
    for q in data:
        question, context = q.split("[SEP]")
        tokenized_data = pmodel.tokenizer(question, context)
    if tokenized_data is None:
        # Without this guard an empty input fails with an UnboundLocalError.
        raise ValueError("tokenize_data() needs at least one 'question[SEP]context' string")
    return tokenized_data


def f_start(questions):
    """Return the span-start logits for each masked sample in ``questions``.

    Delegates to ``f`` using the module-level ``tokenized_qs`` as it is at
    call time.
    """
    return f(questions, tokenized_qs, start=True)


def f_end(questions):
    """Return the span-end logits for each masked sample in ``questions``.

    Delegates to ``f`` using the module-level ``tokenized_qs`` as it is at
    call time.
    """
    return f(questions, tokenized_qs, start=False)


# attach a dynamic output_names property to the models so we can plot the tokens at each output position
def out_names(inputs):
    """Return one decoded token string per position of the tokenized input.

    ``inputs`` is a single "question[SEP]context" string. It is split on the
    "[SEP]" marker, tokenized as a pair, and each resulting input id is decoded
    on its own.
    """
    question, context = inputs.split("[SEP]")
    token_ids = pmodel.tokenizer(question, context)["input_ids"]
    decode = pmodel.tokenizer.decode
    names = []
    for token_id in token_ids:
        names.append(decode([token_id]))
    return names


# Both model functions share the same output-name callback.
for _model_fn in (f_start, f_end):
    _model_fn.output_names = out_names
del _model_fn  # keep the notebook namespace unchanged

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use cpu


In [11]:
# A single "question[SEP]context" example; this code assumes that there is only
# one sentence in data.
data = ["What is on the table?[SEP]When I got home today I saw my cat on the table, and my frog on the floor."]
# f_start reads this module-level variable, so it has to be set before explaining.
tokenized_qs = tokenize_data(data)

# output_type="ids" so the model function is handed token ids rather than strings.
text_masker = shap.maskers.Text(tokenizer=pmodel.tokenizer, output_type="ids")
explainer_start = shap.Explainer(f_start, text_masker)
shap_values_start = explainer_start(data)

shap.plots.text(shap_values_start)

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [00:37, 37.43s/it]               
