**Install required packages**

In [None]:
!pip install transformers
!pip install datasets
!pip install shap

**Imports**

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline

**Import data**

In [None]:
# Test split; its 'text' column supplies the ad-hoc examples used below.
data = pd.read_csv('/content/drive/MyDrive/test (1).csv')
text = data['text'].tolist()

# Sample scored by the metric cells. It is loaded here because those cells
# reference `metrics_text` above the cell that originally defined it, which
# raises NameError when the notebook is run top to bottom on a fresh kernel.
metrics_sample = pd.read_csv('drive/MyDrive/metrics_sample.csv')
metrics_text = metrics_sample['text'].values

**Initialize model and tokenizer**

In [None]:
# The model and the tokenizer are both loaded from the same checkpoint directory.
checkpoint_dir = '/content/drive/MyDrive/checkpoint-5000'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

# Score every label and truncate inputs to at most 512 tokens.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    truncation=True,
    max_length=512,
)

In [None]:
def predict_proba(sentence):
    """Return the pipeline's per-label scores for `sentence` as a 1-D array.

    The pipeline is called on `sentence` and only the first element of its
    output (a list of dicts with a 'score' key) is used; the scores are kept
    in the order the pipeline reports them.
    """
    label_entries = pipe(sentence)[0]
    return np.array([entry['score'] for entry in label_entries])

In [None]:
# Sanity check on the first text: one score per label.
predict_proba(text[0])

array([9.99979615e-01, 2.03548752e-05])

In [None]:
# Build a SHAP explainer directly on top of the classification pipeline.
explainer = shap.Explainer(pipe)

In [None]:
# Explain a single sample; the explainer is called with a list, so the one
# text is wrapped in a list of length 1.
word_attr = explainer([metrics_text[10]])

  0%|          | 0/248 [00:00<?, ?it/s]

Partition explainer: 2it [04:46, 286.95s/it]              


In [None]:
# Shape of the explanation; presumably (samples, tokens, classes) — TODO confirm.
word_attr.shape

(1, 1332, 2)

In [None]:
# Same shape check again; the recorded output differs from the cell above,
# so word_attr was presumably recomputed for another text in between.
word_attr.shape

(1, 77, 2)

In [None]:
def faithfulness_metric(predict_proba, cls_explainer, text, tokenizer, max_length=512):
    """Correlation-based faithfulness of token attributions for one text.

    Every token (excluding the first and last id, i.e. the special tokens the
    tokenizer adds at both ends) is replaced in turn by token id 0, the ids
    are decoded back to text, and the probability of the originally predicted
    class is recorded. The score is the negated Pearson correlation between
    the attributions and those probabilities: it is high when removing the
    tokens with large attributions lowers the predicted-class probability
    the most.

    Args:
        predict_proba: callable mapping a string to a 1-D array of class
            probabilities.
        cls_explainer: callable that takes a list of texts and returns an
            object whose ``.values[0]`` is a (tokens, classes) array.
        text: the string to evaluate.
        tokenizer: callable as ``tokenizer(text, truncation=True,
            max_length=...)`` returning a mapping with ``'input_ids'``, and
            providing ``decode(ids)``.
        max_length: maximum number of token ids (special tokens included)
            that are considered; defaults to the previously hard-coded 512.

    Returns:
        The negated correlation coefficient, or NaN when fewer than two
        tokens remain (a correlation is undefined there).

    Raises:
        ValueError: if the explainer and the tokenizer disagree on the number
            of tokens, since attributions could then not be aligned to ids.
    """
    pred_class = np.argmax(predict_proba(text))

    # Attributions towards the predicted class. Slicing to max_length is a
    # no-op for short inputs, so no separate branch is needed; [1:-1] drops
    # the entries at both ends to line up with the token ids below.
    attributions = cls_explainer([text]).values[0]
    coefs = np.asarray(attributions[:, pred_class][:max_length][1:-1])

    # Tokenize once; the same ids serve as the reference and as the template
    # for every perturbed copy.
    token_ids = np.array(
        tokenizer(text, truncation=True, max_length=max_length)['input_ids']
    )[1:-1]

    if coefs.shape[0] != token_ids.shape[0]:
        raise ValueError(
            "attribution/token length mismatch: "
            f"{coefs.shape[0]} attributions vs {token_ids.shape[0]} token ids"
        )
    if token_ids.shape[0] < 2:
        return np.nan

    # Each perturbation touches a single position and its result is stored at
    # that position, so the visiting order does not affect the outcome.
    pred_probs = np.zeros(token_ids.shape[0])
    for position in range(token_ids.shape[0]):
        masked_ids = token_ids.copy()
        # Baseline is token id 0 (presumably [PAD] for this vocabulary — TODO confirm).
        masked_ids[position] = 0
        masked_text = tokenizer.decode(masked_ids)
        pred_probs[position] = predict_proba(masked_text)[pred_class]

    return -np.corrcoef(coefs, pred_probs)[0, 1]

In [None]:
def monotonicity_metric(predict_proba, cls_explainer, text, tokenizer, max_length=512):
    """Check whether revealing tokens by increasing attribution never lowers the score.

    Starting from a sequence in which every token is replaced by token id 0,
    the original tokens are restored one at a time in order of increasing
    attribution towards the predicted class. After each step the ids are
    decoded and the predicted-class probability is recorded. The metric is
    True when that sequence of probabilities is non-decreasing.

    Args:
        predict_proba: callable mapping a string to a 1-D array of class
            probabilities.
        cls_explainer: callable that takes a list of texts and returns an
            object whose ``.values[0]`` is a (tokens, classes) array.
        text: the string to evaluate.
        tokenizer: callable as ``tokenizer(text, truncation=True,
            max_length=...)`` returning a mapping with ``'input_ids'``, and
            providing ``decode(ids)``.
        max_length: maximum number of token ids (special tokens included)
            that are considered; defaults to the previously hard-coded 512.

    Returns:
        A NumPy boolean; True for sequences with fewer than two tokens, since
        there is no step that could decrease.

    Raises:
        ValueError: if the explainer and the tokenizer disagree on the number
            of tokens, since attributions could then not be aligned to ids.
    """
    pred_class = np.argmax(predict_proba(text))

    # Attributions towards the predicted class. Slicing to max_length is a
    # no-op for short inputs; [1:-1] drops the entries at both ends to line
    # up with the token ids below.
    attributions = cls_explainer([text]).values[0]
    coefs = np.asarray(attributions[:, pred_class][:max_length][1:-1])

    token_ids = np.array(
        tokenizer(text, truncation=True, max_length=max_length)['input_ids']
    )[1:-1]

    if coefs.shape[0] != token_ids.shape[0]:
        raise ValueError(
            "attribution/token length mismatch: "
            f"{coefs.shape[0]} attributions vs {token_ids.shape[0]} token ids"
        )

    # Positions ordered from least to most important token.
    order = np.argsort(coefs)

    # Baseline sequence of token id 0 (presumably [PAD] for this vocabulary —
    # TODO confirm); tokens are restored cumulatively.
    revealed_ids = np.zeros(token_ids.shape[0], dtype=int)
    probs_in_order = []
    for position in order:
        revealed_ids[position] = token_ids[position]
        revealed_text = tokenizer.decode(revealed_ids)
        probs_in_order.append(predict_proba(revealed_text)[pred_class])

    return np.all(np.diff(probs_in_order) >= 0)

In [None]:
# Try the monotonicity metric on a single example from the test split.
monotonicity_metric(predict_proba, explainer, text[2], tokenizer)

  0%|          | 0/248 [00:00<?, ?it/s]

Partition explainer: 2it [00:21, 21.69s/it]               


.values =
array([[[-2.06758523e-06,  2.06632433e-06],
        [-1.94575567e-03,  1.94575099e-03],
        [-8.86357710e-01,  8.86357728e-01],
        [-1.10540951e-01,  1.10540954e-01],
        [ 1.52690245e-05, -1.52715504e-05],
        [ 4.96730881e-05, -4.96840464e-05],
        [ 4.54521432e-06, -4.55323038e-06],
        [ 5.71398050e-06, -5.71728037e-06],
        [-2.41280019e-05,  2.41345637e-05],
        [ 5.82940459e-05, -5.82908411e-05],
        [ 2.54351229e-05, -2.54487289e-05],
        [ 6.87233114e-06, -6.87206773e-06],
        [ 4.18267155e-06, -4.17842045e-06],
        [ 2.71863246e-06, -2.71655813e-06],
        [ 9.24591368e-05, -9.24398087e-05],
        [-2.04254124e-04,  2.04249053e-04],
        [-2.50128724e-05,  2.50220182e-05],
        [-2.31840386e-06,  2.33265928e-06],
        [-2.86491799e-04,  2.86500483e-04],
        [ 4.83940436e-04, -4.83937390e-04],
        [-1.31125060e-04,  1.31127762e-04],
        [-1.09279958e-04,  1.09287475e-04],
        [ 0.00000000e+

False

In [None]:
# Score every text of the metrics sample; tqdm shows progress.
monotonicity_array = [
    monotonicity_metric(predict_proba, explainer, sample, tokenizer)
    for sample in tqdm(metrics_text)
]

m = np.array(monotonicity_array)

print(np.mean(m))

In [None]:
import pickle

In [None]:
# Persist the per-sample monotonicity results under drive/MyDrive.
with open('drive/MyDrive/monotonicity_shap.pkl', 'wb') as f:
  pickle.dump(m, f)

In [None]:
# Mean of the per-sample results, i.e. the share of samples judged monotonic.
print(m.mean())

0.03


In [None]:
# Display the raw text at index 2 of the test split.
text[2]

'You liar you were involved in the whole 9 11 debacle I kid of course bounties'

In [None]:
# Faithfulness score for one sample of the metrics set.
faithfulness_metric(predict_proba, explainer, metrics_text[10], tokenizer)

  0%|          | 0/248 [00:00<?, ?it/s]

Partition explainer: 2it [04:33, 273.81s/it]              


0.20714245687995592

In [None]:
# Load the sample of texts on which the metrics are computed.
metrics_sample = pd.read_csv('drive/MyDrive/metrics_sample.csv')

In [None]:
# NumPy array holding the 'text' column of the metrics sample.
metrics_text = metrics_sample['text'].values

In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm

In [None]:
# Score every text of the metrics sample; tqdm shows progress.
faithfulness_array = [
    faithfulness_metric(predict_proba, explainer, sample, tokenizer)
    for sample in tqdm(metrics_text)
]

l = np.array(faithfulness_array)

In [None]:
import pickle

In [None]:
# Persist the per-sample faithfulness scores.
# NOTE(review): this path is relative to the working directory, whereas the
# monotonicity results are written under drive/MyDrive — confirm this is intended.
with open('faithfulness_shap.pkl', 'wb') as f:
  pickle.dump(l, f)

In [None]:
# Mean faithfulness over the metrics sample.
print(l.mean())

0.5507082943475533


In [None]:
# Re-create the explainer on the pipeline (same construction as earlier in the notebook).
explainer = shap.Explainer(pipe) 

In [None]:
# Explain the first text; a single text is passed as a list of length 1.
shap_values = explainer([text[0]])

  0%|          | 0/248 [00:00<?, ?it/s]

Partition explainer: 2it [00:44, 44.39s/it]               


In [None]:
# Shape of the explanation; presumably (samples, tokens, classes) — TODO confirm.
shap_values.shape

(1, 30, 2)

In [None]:
# Attribution of every token of the first sample towards class index 0.
shap_values.values[0][:,0]

array([ 3.52784991e-06,  2.12974846e-05,  1.15009025e-04,  4.71752137e-05,
        3.34437937e-05,  1.91647559e-05, -1.23806298e-04,  5.88446856e-05,
        2.36090273e-05,  2.81874090e-05,  6.93649054e-06,  3.16351652e-05,
        2.95396894e-05,  4.51486558e-05,  1.35786831e-05,  2.77310610e-05,
        7.80653208e-05, -5.01666218e-05,  8.50111246e-06, -7.93486834e-07,
       -8.40853900e-05, -1.42138451e-05, -6.12065196e-06,  5.74141741e-05,
        2.57177278e-04,  2.12183222e-04,  2.09754333e-04, -1.42399222e-05,
       -8.85665417e-04, -5.96046448e-08])

In [None]:
# Number of input ids the tokenizer produces for the same text, for
# comparison with the token axis of the explanation above.
len(tokenizer(text[0])['input_ids'])

30

**Initialize and run explainer**

In [None]:
# Explainer built on the classification pipeline.
explainer = shap.Explainer(pipe) 

In [None]:
# `text` is already a list of strings; wrapping it in another list handed the
# explainer a nested list, which is what raised the TypeError here.
# NOTE(review): this explains every row of `text`; single samples took tens of
# seconds in the runs above, so slice the list if only a subset is needed.
shap_values = explainer(text)

TypeError: ignored

In [None]:
def score_and_visualize(text):
  """Print the prediction for `text`, plot its SHAP explanation and return it.

  Args:
    text: a single string to classify and explain.

  Returns:
    The explanation object produced by the SHAP explainer for `[text]`.
  """
  # The pipeline is called with a one-element batch; print that element's scores.
  prediction = pipe([text])
  print(prediction[0])

  explainer = shap.Explainer(pipe)
  shap_values = explainer([text])

  shap.plots.text(shap_values)
  # Callers assign the result (`shap_values = score_and_visualize(...)`), so
  # return the explanation instead of implicitly returning None.
  return shap_values

In [None]:
# Score and plot the text at index 1250 of the test split.
shap_values = score_and_visualize(text[1250])

[{'label': 'LABEL_0', 'score': 0.0009607221581973135}, {'label': 'LABEL_1', 'score': 0.9990392923355103}]


  0%|          | 0/248 [00:00<?, ?it/s]

Partition explainer: 2it [01:10, 70.09s/it]               


In [None]:
# Score and plot the text at index 1174 of the test split.
shap_values = score_and_visualize(text[1174])

[{'label': 'LABEL_0', 'score': 0.0012127074878662825}, {'label': 'LABEL_1', 'score': 0.9987873435020447}]


  0%|          | 0/248 [00:00<?, ?it/s]

Partition explainer: 2it [00:40, 40.30s/it]               


In [None]:
# Score and plot the text at index 2863 of the test split.
shap_values = score_and_visualize(text[2863])

[{'label': 'LABEL_0', 'score': 0.0015150128165259957}, {'label': 'LABEL_1', 'score': 0.9984850287437439}]


  0%|          | 0/248 [00:00<?, ?it/s]

Partition explainer: 2it [00:38, 38.60s/it]               


In [None]:
# Score and plot the text at index 317 of the test split.
shap_values = score_and_visualize(text[317])

[{'label': 'LABEL_0', 'score': 0.9999827146530151}, {'label': 'LABEL_1', 'score': 1.7326206943835132e-05}]


  0%|          | 0/210 [00:00<?, ?it/s]

Partition explainer: 2it [00:14, 14.68s/it]               


In [None]:
# Score and plot the text at index 325 of the test split.
shap_values = score_and_visualize(text[325])

[{'label': 'LABEL_0', 'score': 0.9999799728393555}, {'label': 'LABEL_1', 'score': 2.0082736227777787e-05}]


  0%|          | 0/110 [00:00<?, ?it/s]