In [2]:
import os
import pickle as pkl
import numpy as np
import shap
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline


# Project root is taken as the parent of the current working directory
# (os.getcwd() + os.pardir), so every path below depends on where the kernel was started.
PROJECT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Directory holding the pickled SHAP values (test.pkl is read from here).
SHAP_DIR = os.path.join(PROJECT_DIR, "classification/shap_values/coqa")
# Directory passed to `load_from_disk` to restore the dataset splits.
SPLITS_DIR = os.path.join(PROJECT_DIR, "classification/split_datasets/coqa")
# Directory containing the model checkpoints loaded with `from_pretrained`.
MODELS_DIR = os.path.join(PROJECT_DIR, "classification/models")

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [3]:
''' Load SHAP values '''
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by this project or another trusted source.
with open(os.path.join(SHAP_DIR, "test.pkl"), "rb") as f:
    shap_values = pkl.load(f) # shape (1000, None, 2) (n_samples, n_features (not fixed), n_classes)

In [4]:
# Inspect the explanation stored at index 2 as a quick check of the loaded object.
shap_values[2]

.values =
array([[-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [-1.13922907e-04,  1.13922422e-04],
       [ 9.92486351e-05, -9.92494096e-05],
       [ 9.92486351e-05, -9.92494096e-05],
       [ 9.92486351e-05, -9.92494096e-05],
       [ 9.92486351e-05, -9.92494096e-05],
       [ 9.92486351e-05, -9.92494096e-05],
 

In [5]:
''' Load data splits '''
# SPLITS_DIR is already the complete directory path, so it is handed to
# load_from_disk as-is (os.path.join with a single argument returns it unchanged).
dataset = load_from_disk(SPLITS_DIR)
# Only the "text" column of the "test" split is kept.
texts = dataset["test"]["text"]

In [6]:
# Checkpoint directory under MODELS_DIR; both the tokenizer and the model are loaded from it.
path_to_model = os.path.join(MODELS_DIR, "distilbert-base-uncased_13091207")

# NOTE(review): the GPU index is hard-coded to 2. CPU is used only when CUDA is
# unavailable altogether, so confirm the target machine actually exposes "cuda:2".
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(path_to_model)
model = AutoModelForSequenceClassification.from_pretrained(path_to_model)
model.to(device)

pipe = TextClassificationPipeline(
    tokenizer=tokenizer,
    model=model,
    top_k=None, # get confidence scores for predictions
    # `return_all_scores` is now deprecated; for similar functionality use `top_k=None` instead of `return_all_scores=True`, or `top_k=1` instead of `return_all_scores=False`.
)
# The pipeline is constructed without a `device` argument, so its `device`
# attribute is overwritten here to match the device the model was moved to.
# NOTE(review): presumably this keeps the pipeline's inputs on the model's device;
# passing `device=device` to the constructor may be the cleaner route — confirm
# against the installed transformers version.
pipe.device = device

In [7]:
# Two toy sentences used to try the explainer out on the classification pipeline.
samples = ["I am in Finland", "seahorses are cool in Finland"]
# NOTE(review): seed=1 is presumably there to make the explanation reproducible —
# confirm against the shap version in use.
explainer = shap.Explainer(pipe, seed=1)
sample_shap_values = explainer(samples)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [8]:
# Display the explanation object for the two samples (.values, .base_values, .data).
sample_shap_values

.values =
array([array([[ 0.        ,  0.        ],
              [-0.05176722,  0.05176717],
              [-0.0800604 ,  0.0800604 ],
              [ 0.01499557, -0.01499558],
              [-0.00300445,  0.00300446],
              [ 0.        ,  0.        ]]),
       array([[ 0.        ,  0.        ],
              [ 0.03928477, -0.03928478],
              [-0.03535185,  0.03535186],
              [-0.05292681,  0.05292687],
              [-0.02458153,  0.02458155],
              [-0.06960601,  0.06960601],
              [ 0.06535334, -0.06535332],
              [-0.00453931,  0.0045393 ],
              [ 0.        ,  0.        ]])], dtype=object)

.base_values =
array([[0.26197895, 0.73802108],
       [0.25797108, 0.74202889]])

.data =
(array(['', 'I ', 'am ', 'in ', 'Finland', ''], dtype=object), array(['', 'sea', 'horse', 's ', 'are ', 'cool ', 'in ', 'Finland', ''],
      dtype=object))

In [9]:
# Class scores returned by the pipeline itself for the same two samples.
pipe(samples)

[[{'label': 'LABEL_1', 'score': 0.8578575253486633},
  {'label': 'LABEL_0', 'score': 0.14214245975017548}],
 [{'label': 'LABEL_1', 'score': 0.8243963718414307},
  {'label': 'LABEL_0', 'score': 0.17560367286205292}]]

## Ngrams

In [12]:
def create_ngrams(tokens, n):
    """Return every run of `n` consecutive tokens, each joined by single spaces.

    Args:
        tokens: Sliceable sequence of strings (e.g. a list of tokens).
        n: Number of tokens per n-gram.

    Returns:
        A list of strings in order of appearance; the i-th entry is built from
        tokens[i:i+n]. The list is empty when `n` is not positive or when `n`
        exceeds the number of tokens.
    """
    if n <= 0:
        # No window size to slide: nothing is produced.
        return []

    window_count = len(tokens) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]

# Normalise the first sample's tokens (strip surrounding whitespace, lower-case)
# and build its bigrams as a quick check of create_ngrams.
tokens = [t.strip().lower() for t in sample_shap_values.data[0]]
bigrams = create_ngrams(tokens, 2)

In [13]:
# Display the bigrams built from the first sample.
bigrams

[' i', 'i am', 'am in', 'in finland', 'finland ']

In [18]:
ngrams_dict = {}
n = 2

# Tally how often each n-gram occurs across all explained samples.
for instance in sample_shap_values.data:
    # Same normalisation as elsewhere: strip surrounding whitespace, lower-case.
    tokens = [token.strip().lower() for token in instance]
    ngrams = create_ngrams(tokens, n)

    for ngram in ngrams:
        # A missing key counts as zero, so first and later sightings share one path.
        ngrams_dict[ngram] = ngrams_dict.get(ngram, 0) + 1

ngrams_dict


{' i': 1,
 'i am': 1,
 'am in': 1,
 'in finland': 2,
 'finland ': 2,
 ' sea': 1,
 'sea horse': 1,
 'horse s': 1,
 's are': 1,
 'are cool': 1,
 'cool in': 1}

In [63]:
# adding shap values for ngrams
#
# Builds `ngrams_dict`: n-gram -> {'neg': [...], 'pos': [...]}, holding one signed
# value per occurrence of the n-gram. Occurrences that lean towards class 0 go to
# 'neg' (stored as negative numbers), those leaning towards class 1 go to 'pos'.

ngrams_dict = {}
n = 2

for instance in sample_shap_values:
    tokens = [token.strip().lower() for token in instance.data]
    ngrams = create_ngrams(tokens, n) # [' i', 'i am', 'am in', 'in finland', 'finland ']
    # Deliberately not named `shap_values`: that name holds the explanations
    # loaded from disk earlier in the notebook and must not be overwritten here.
    instance_shap_values = instance.values
    last_idx = len(tokens) - 1

    # The i-th n-gram is built from tokens[i:i+n], so its position yields the
    # matching rows directly. Looking tokens up by value (tokens.index) returns
    # the first occurrence only and selects the wrong rows when a token repeats.
    for start, ngram in enumerate(ngrams):
        end = start + n - 1

        # The explanation data shown above begins and ends with an empty token;
        # label those so boundary n-grams are readable. Empty tokens elsewhere
        # in the text are not treated as boundaries.
        if start == 0 and tokens[start] == "": # at the start of the text
            ngram = "<BOS>" + ngram
        if end == last_idx and tokens[end] == "": # at the end of the text
            ngram = ngram + "<EOS>"

        # get shap values for ngram (one row per token, one column per class)
        ngram_shap_values = np.mean(instance_shap_values[start:end + 1], axis=0) # sum or mean?

        # The entry is created even if this occurrence ends up contributing nothing.
        ngram_entry = ngrams_dict.setdefault(ngram, {'neg': [], 'pos': []})

        contribution_to_0 = ngram_shap_values[0]
        contribution_to_1 = ngram_shap_values[1]
        # Magnitude is taken from the class-0 column; in the outputs shown the two
        # columns are (near) mirror images of each other.
        abs_shap_value = abs(contribution_to_0)

        if contribution_to_0 > contribution_to_1:
            # ngram contributes to prediction of class 0 (negative class)
            # => store negative contribution
            ngram_entry['neg'].append(-abs_shap_value)
        elif contribution_to_0 < contribution_to_1:
            # ngram contributes to prediction of class 1 (positive class)
            # => store positive contribution
            ngram_entry['pos'].append(abs_shap_value)
        # equal contributions: the ngram favours neither class, nothing is stored

ngrams_dict

{'<BOS> i': {'neg': [], 'pos': [0.02588360756635666]},
 'i am': {'neg': [], 'pos': [0.06591380666941404]},
 'am in': {'neg': [], 'pos': [0.03253241255879402]},
 'in finland': {'neg': [-0.005995559506118298, -0.03040701523423195],
  'pos': []},
 'finland <EOS>': {'neg': [],
  'pos': [0.0015022270381450653, 0.0022696563974022865]},
 '<BOS> sea': {'neg': [-0.01964238379150629], 'pos': []},
 'sea horse': {'neg': [-0.0019664596766233444], 'pos': []},
 'horse s': {'neg': [], 'pos': [0.04413933027535677]},
 's are': {'neg': [], 'pos': [0.03875417169183493]},
 'are cool': {'neg': [], 'pos': [0.047093771398067474]},
 'cool in': {'neg': [], 'pos': [0.002126334235072136]}}