In [None]:
!pip install lime
!pip install transformers
!pip install pytreebank
!pip install datasets==1.5.0

import os
import random

import pandas as pd
import numpy as np
import scipy as sp
import torch
import spacy
from torch.utils.data import \
    TensorDataset, \
    DataLoader
from transformers import \
    BertTokenizer, \
    BertForSequenceClassification, \
    AdamW, \
    BertConfig, \
    get_linear_schedule_with_warmup


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
cd '/content/drive/MyDrive/NLP Project'

/content/drive/MyDrive/NLP Project


In [4]:
# Seed every random source once, up front, so the np.random.choice draws in
# the explanation cells select the same rows on every Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# BERT Example

**Load your data**

In [16]:
# Project-local loader module; presumably importable because of the working
# directory set by the earlier cd cell - TODO confirm.
import src.data.dataload
sst=src.data.dataload.load_sst()
# train_val_test unpacks into the three SST splits; `test` is sampled from in
# the SST explanation cell below.
train, val, test = sst.train_val_test

In [24]:
# Load AG News through the same project loader module used for SST.
ag_news=src.data.dataload.load_agnews()
# The middle (validation) element of train_val_test is discarded here.
train_ag, _, test_ag = ag_news.train_val_test

Using custom data configuration default


Downloading and preparing dataset ag_news/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/17ec33e23df9e89565131f989e0fdf78b0cc4672337b582da83fc3c9f79fe34d...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=11045148.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=751209.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/17ec33e23df9e89565131f989e0fdf78b0cc4672337b582da83fc3c9f79fe34d. Subsequent calls will reuse this data.


**Set up your model & tokenizer**

In [None]:

# Load the classifier weights saved on Drive.
bert_model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/NLP Project/BERT Model')
# predict_proba_BERT sends its input tensors to `device`, so the model has to
# live on the same device or the forward pass fails on a GPU runtime.
bert_model.to(device)
# Inference only: make sure dropout is disabled.
bert_model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Class names ordered by their integer id, so position i names class i.
id2label = bert_model.config.id2label
label2id = bert_model.config.label2id
labels = sorted(label2id, key=label2id.get)

**Explanations on SST**

In [None]:
# NOTE: Explainer and predict_proba_BERT are defined in the last cell of this
# notebook, so that cell has to be executed before this one.
from lime.lime_text import LimeTextExplainer
LIME_explainer = Explainer(predict_proba_BERT,labels,'LIME')

# Draw 20 distinct row positions from the SST test split (replace=False).
indices=np.random.choice(len(test), 20, replace=False)
instance_array = test['sentence'].iloc[indices]

# One list of tokens and one list of weights per sampled sentence.
top_tokens_SST,top_values_SST=LIME_explainer.explain_instances(instance_array)

In [None]:
# Display the SST explanations computed in the previous cell (the bare names
# top_tokens/top_values are never defined at notebook scope).
top_tokens_SST,top_values_SST

**Explanations on AG News**

In [None]:
# NOTE - the BERT model hasn't been fine-tuned on AG News so this is just to show the mechanics on a different dataset
LIME_explainer = Explainer(predict_proba_BERT,labels,'LIME')
# Sample positions and rows from the SAME split: the positions are drawn from
# len(test_ag), so they must index test_ag rather than train_ag.
indices=np.random.choice(len(test_ag), 20, replace=False)
# NOTE(review): another cell in this notebook reads test_ag['text'] instead of
# test_ag['sentence'] - confirm which column the AG News loader exposes.
instance_array = test_ag['sentence'].iloc[indices]
top_tokens_AG,top_values_AG=LIME_explainer.explain_instances(instance_array)

# BCN Example

**Set up your model & tokenizer**

In [None]:
cd /content/drive/MyDrive/NLP Project/AllenNLP

/content/drive/MyDrive/NLP Project/AllenNLP


In [6]:
!pip install allennlp==2.1.0 allennlp-models==2.1.0
import spacy
nlp = spacy.load('en_core_web_sm')
import allennlp
import allennlp_models
from allennlp.models.archival import load_archive
from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor
from allennlp.data.fields import LabelField
from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
from typing import List, Dict
from overrides import overrides

#BCN_model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/NLP Project/BERT Model')
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

#id2label = bert_model.config.id2label
#label2id = bert_model.config.label2id
#labels = sorted(label2id, key=label2id.get)

In [12]:
# importing the dataset reader
import tagging
# importing the BCN model
import BCN_model
archive = load_archive("./BCN_output/model.tar.gz")
BCN_model = archive.model
vocab = BCN_model.vocab
BCN_predictor = Predictor.from_archive(archive, 'ag_text_classifier')

In [27]:
from lime.lime_text import LimeTextExplainer
# NOTE(review): these names are only used as LIME class names; their order has
# to match the index order of the model's class_probabilities - confirm against
# the label namespace of `vocab`.
labels_BCN = ['Sci/Tech', 'Sports','World','Business']
LIME_explainer = Explainer(predict_proba_BCN,labels_BCN,'LIME')

# Draw 20 distinct row positions from the AG News test split.
indices=np.random.choice(len(test_ag), 20, replace=False)
# NOTE(review): a later cell reads test_ag['text'] instead - confirm the column name.
instance_array = test_ag['sentence'].iloc[indices]
top_tokens_AG,top_values_AG=LIME_explainer.explain_instances(instance_array)

In [None]:
# Display the AG News explanations produced by the BCN cell directly above
# (this section never computes SST results).
top_tokens_AG,top_values_AG

In [None]:
# Same sampling as the previous BCN cell, but reading the 'text' column.
# NOTE(review): only one of 'sentence' / 'text' can be the right column name
# for test_ag - confirm and drop the other variant.
indices=np.random.choice(len(test_ag), 20, replace=False)
instance_array = test_ag['text'].iloc[indices]
# Reuses the LIME_explainer object created in an earlier cell.
top_tokens_AG,top_values_AG=LIME_explainer.explain_instances(instance_array)

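**Functions and class definitions - run this cell before the explanation cells above, which depend on it. Once completed, all of this can be moved into a separate .py module and imported at the top.**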

In [None]:
class Explainer():
  '''
  Small wrapper that explains text classifier predictions and returns the
  most important tokens together with their importance weights.
  '''

  def __init__(self, predict_proba,labels, explainer_type='LIME'):
    '''
    predict_proba - predict function which will depend on model type; it is
                    handed to the underlying explainer unchanged
    labels - class names passed to the explainer
    explainer_type - only 'LIME' is supported

    raises ValueError for any other explainer_type, instead of leaving the
    object without an explainer and failing later with an AttributeError
    '''
    if explainer_type != 'LIME':
      raise ValueError(
          "Unsupported explainer_type %r; only 'LIME' is available" % (explainer_type,))

    self.exp = LimeTextExplainer(class_names=labels)
    self.predict_proba=predict_proba

  def explain_instance(self,x, num_features=10, top_labels=5, num_samples=50):
    '''
    x - 1 input instance
    num_features, top_labels, num_samples - forwarded to the explainer's
        explain_instance call; the defaults are the values this notebook
        has always used

    returns - list of top tokens/importance weights for the predicted label
    '''
    explanation = self.exp.explain_instance(
        x,
        self.predict_proba,
        num_features=num_features,
        top_labels=top_labels,
        num_samples=num_samples)

    # Explain the class the model actually predicts for this instance.
    pred_label = int(np.argmax(explanation.predict_proba))

    # Fetch the (token, weight) pairs once and split them into two lists.
    token_weights = explanation.as_list(label=pred_label)
    top_tokens = [token for token, _ in token_weights]
    top_values = [weight for _, weight in token_weights]

    return top_tokens,top_values

  def explain_instances(self,X, **explain_kwargs):
    '''
    X - array of input sentences
    explain_kwargs - optional keyword arguments forwarded to explain_instance

    returns - (list of token lists, list of weight lists), one entry per
              element of X in iteration order
    '''

    top_tokens_list=[]
    top_values_list = []

    for sentence in X:
      top_tokens,top_values = self.explain_instance(sentence, **explain_kwargs)

      top_tokens_list.append(top_tokens)
      top_values_list.append(top_values)

    return top_tokens_list,top_values_list


def predict_proba_BERT(x, batch_size=32):

  '''
  this depends on the model, will be passed to explainer

  x - a single string or an iterable of strings
  batch_size - number of texts sent through the model per forward pass, so a
               large set of perturbed samples does not have to fit in memory
               in one go

  returns - numpy array with one row of softmax class probabilities per text

  Relies on the notebook globals `tokenizer` and `bert_model`.
  '''

  if isinstance(x,str):
    x=[x]
  texts = list(x)

  # Send the inputs to wherever the model's weights actually are, rather than
  # assuming the model was moved to the global `device`.
  model_device = next(bert_model.parameters()).device

  # The attention mask is derived from the padding id; fall back to 0 (the
  # value this function always assumed) if the tokenizer does not define one.
  pad_id = tokenizer.pad_token_id
  if pad_id is None:
    pad_id = 0

  batch_scores = []
  with torch.no_grad():
    for start in range(0, len(texts), batch_size):
      batch = texts[start:start + batch_size]
      # Every text is padded/truncated to exactly 128 token ids so the rows stack.
      tv = torch.tensor([tokenizer.encode(v, padding='max_length', max_length=128,truncation=True) for v in batch]).to(model_device)
      attention_mask = (tv!=pad_id).type(torch.int64)
      outputs = bert_model(tv,attention_mask=attention_mask)
      # outputs[0] is treated as the logits; softmax over dim 1 gives per-class probabilities.
      scores = torch.softmax(outputs[0],dim=1)
      batch_scores.append(scores.cpu().numpy())

  if not batch_scores:
    # No texts: return an empty array with the right number of class columns.
    return np.empty((0, bert_model.config.num_labels))

  return np.concatenate(batch_scores, axis=0)

def predict_proba_BCN(x):
  '''
  Probability function for the BCN model, will be passed to explainer.

  x - a single string or an iterable of strings

  returns - numpy array built from the 'class_probabilities' entry of each
            prediction, one row per input text

  Relies on the notebook global `BCN_predictor`.
  '''

  # Accept a bare string like predict_proba_BERT does; without this a string
  # would be iterated character by character.
  if isinstance(x,str):
    x=[x]

  #predict only on the sentence
  title = ' '

  predictions = BCN_predictor.predict_batch_json([
      dict(title=title, Description=s) for s in x
  ])

  class_probs=np.array([p['class_probabilities'] for p in predictions])
  return class_probs

# exist_ok=True lets this notebook cell be re-executed in the same kernel
# without the registry rejecting the already-registered name.
@Predictor.register('ag_text_classifier', exist_ok=True)
class AGNewsClassifier(Predictor):
    """
    Predictor for a model that takes in the "Description" text of a record
    and returns class probabilities for it.  Modelled on the predictor used
    with the [`BasicClassifier`](../models/basic_classifier.md) model.

    Registered under the name 'ag_text_classifier'.
    """

    def predict(self, sentence: str) -> JsonDict:
        """Run the model on one raw text, wrapped as `{"Description": sentence}`."""
        return self.predict_json({"Description": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like `{"Description": "..."}`; any other keys
        in `json_dict` are not read here.

        If the dataset reader exposes no tokenizer (neither `tokenizer` nor
        `_tokenizer`), the text is tokenized with a `SpacyTokenizer` before
        being handed to the reader's `text_to_instance`.
        """
        sentence = json_dict["Description"]
        reader = self._dataset_reader
        reader_has_tokenizer = (
            getattr(reader, "tokenizer", None) is not None
            or getattr(reader, "_tokenizer", None) is not None
        )
        if not reader_has_tokenizer:
            # Local name chosen so it cannot be confused with the notebook's
            # global BERT `tokenizer`.
            spacy_tokenizer = SpacyTokenizer()
            sentence = spacy_tokenizer.tokenize(sentence)
        return reader.text_to_instance(sentence)

    @overrides
    def predictions_to_labeled_instances(
        self, instance: Instance, outputs: Dict[str, np.ndarray]
    ) -> List[Instance]:
        """
        Return a one-element list holding a copy of `instance` with a "label"
        field set to the argmax of `outputs["class_probabilities"]`.
        """
        new_instance = instance.duplicate()
        label = np.argmax(outputs["class_probabilities"])
        # skip_indexing=True: the label is stored as the integer index itself.
        new_instance.add_field("label", LabelField(int(label), skip_indexing=True))
        return [new_instance]