# Set-up

Change the working directory to the one where you saved your files

In [None]:
cd /content/drive/MyDrive/COMP0087/allenNLP/BCN

/content/drive/MyDrive/COMP0087/allenNLP/BCN


In [None]:
from torch import nn
import torch.optim as optim
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')
import pandas as pd

Change the path below as needed.

In [None]:
test_set = pd.read_json('/content/drive/MyDrive/COMP0087/data/test.jsonl', orient='records', lines=True)

In [None]:
%%shell
pip install torch==1.7.1

In [None]:
%%shell
pip install allennlp==2.1.0 allennlp-models==2.1.0

In [None]:
# imports from allennlp
from allennlp.models.archival import load_archive
from allennlp.common.util import JsonDict
from allennlp.data import Instance
from allennlp.predictors.predictor import Predictor
from allennlp.data.fields import LabelField
from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer

from typing import List, Dict

from overrides import overrides

from allennlp.interpret.attackers import Attacker, InputReduction

from allennlp.interpret.saliency_interpreters import SimpleGradient

In [None]:
%%shell
pip install checklist==0.0.10

In [None]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from scipy.stats import kendalltau
from scipy.stats import spearmanr

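Background about the model (note: this links to the AllenNLP v0.9.0 documentation, whereas this notebook installs AllenNLP 2.1.0): http://docs.allennlp.org/v0.9.0/api/allennlp.models.biattentive_classification_network.html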

In [None]:
# importing the dataset reader
import tagging
# importing the BCN model
import BCN_model

# Training

In [None]:
# training model
# here, the output will be saved to a new folder called 'BCN_output'. You will get an error message if such a directory already exists.
# !pwd; allennlp train --include-package tagging -s BCN_output config_BCN.jsonnet

# Predictions

In [None]:
# `exist_ok=True` keeps this cell re-runnable: without it, executing the cell a
# second time fails because the name 'ag_text_classifier' is already registered.
@Predictor.register('ag_text_classifier', exist_ok=True)
class AGNewsClassifier(Predictor):
    """
    Predictor for a model that takes in a single piece of text and returns
    a single class for it.

    Inputs are JSON dictionaries of the form `{"Description": "..."}`.
    """

    def predict(self, sentence: str) -> JsonDict:
        """Wrap `sentence` as `{"Description": sentence}` and return `predict_json` on it."""
        return self.predict_json({"Description": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like `{"Description": "..."}` and converts it
        into an `Instance` via the dataset reader's `text_to_instance`.
        """
        sentence = json_dict["Description"]
        reader_has_tokenizer = (
            getattr(self._dataset_reader, "tokenizer", None) is not None
            or getattr(self._dataset_reader, "_tokenizer", None) is not None
        )
        # When the reader exposes no tokenizer, tokenize here with a default
        # SpacyTokenizer and pass tokens; otherwise pass the raw string through.
        if not reader_has_tokenizer:
            tokenizer = SpacyTokenizer()
            sentence = tokenizer.tokenize(sentence)
        return self._dataset_reader.text_to_instance(sentence)

    @overrides
    def predictions_to_labeled_instances(
        self, instance: Instance, outputs: Dict[str, np.ndarray]
    ) -> List[Instance]:
        """
        Return a one-element list holding a copy of `instance` with a `label`
        field set to the argmax of `outputs["class_probabilities"]`.
        """
        new_instance = instance.duplicate()
        label = np.argmax(outputs["class_probabilities"])
        # The label is already an integer index, hence `skip_indexing=True`.
        new_instance.add_field("label", LabelField(int(label), skip_indexing=True))
        return [new_instance]

In [None]:
# Load the trained model archive from the 'BCN_output' folder (the output
# directory named in the training command above), relative to the working directory.
archive = load_archive("./BCN_output/model.tar.gz")
model = archive.model
# Vocabulary attached to the loaded model.
vocab = model.vocab

In [None]:
predictor = Predictor.from_archive(archive, 'ag_text_classifier')

In [None]:
predictor.predict(sentence="Canadian Press - VANCOUVER (CP) - The sister of a man who died after a violent confrontation with police has demanded the city's chief constable resign for defending the officer involved.")

{'class_probabilities': [2.405789976955841e-11,
  4.568966868257265e-15,
  1.0,
  3.932658886041107e-12],
 'label': '1',
 'logits': [2.251049280166626,
  -6.3178839683532715,
  26.70160675048828,
  0.43990111351013184]}

# Interpret

In [None]:
# `exist_ok=True` keeps this cell re-runnable: without it, executing the cell a
# second time fails because the name is already registered.
@Predictor.register('ag_text_classifier_with_input_red', exist_ok=True)
class InputReductionTextClassifierPredictor(AGNewsClassifier):
    """
    Predictor that returns the plain classification of a text together with
    the result of an `InputReduction` attack on the same text.
    """

    def predict_json(self, json_dict: JsonDict) -> JsonDict:
        """
        Classify `json_dict["Description"]` and run input reduction on it.

        Returns a dictionary with two keys: `'prediction'` (the output of a
        plain `AGNewsClassifier`) and `'input_reduction_output'` (the output of
        `InputReduction.attack_from_json`).
        """
        # A plain AGNewsClassifier is built on the same model and reader; its
        # `predict_json` is not this override, so `predict` yields the bare
        # model output rather than this method's combined dictionary.
        predictor = AGNewsClassifier(self._model, self._dataset_reader)
        prediction = predictor.predict(sentence=json_dict['Description'])

        attacker = InputReduction(predictor)
        attack = attacker.attack_from_json(inputs=json_dict,
                                           input_field_to_attack='tokens',
                                           grad_input_field='grad_input_1',
                                           ignore_tokens=None)

        return {'prediction': prediction, 'input_reduction_output': attack}

In [None]:
predictor_with_input_red = Predictor.from_archive(archive, 'ag_text_classifier_with_input_red')

In [None]:
# Example sentence for the input-reduction predictor.
# NOTE(review): the name `input` shadows Python's builtin `input()` from this cell onwards.
input = "The Cleveland Indians pulled within one game of the AL Central lead by beating the Minnesota Twins, 7-1, Saturday night with home runs by Travis Hafner and Victor Martinez."

In [None]:
predictor_with_input_red.predict(input)

{'input_reduction_output': {'final': [['pulled', 'Hafner']],
  'original': ['The',
   'Cleveland',
   'Indians',
   'pulled',
   'within',
   'one',
   'game',
   'of',
   'the',
   'AL',
   'Central',
   'lead',
   'by',
   'beating',
   'the',
   'Minnesota',
   'Twins',
   ',',
   '7',
   '-',
   '1',
   ',',
   'Saturday',
   'night',
   'with',
   'home',
   'runs',
   'by',
   'Travis',
   'Hafner',
   'and',
   'Victor',
   'Martinez',
   '.']},
 'prediction': {'class_probabilities': [0.000256297062151134,
   0.9993062019348145,
   0.0002524169394746423,
   0.00018510186055209488],
  'label': '2',
  'logits': [-1.1653821468353271,
   7.103096961975098,
   -1.1806373596191406,
   -1.4908137321472168]}}

# Visualisation

Source: https://adataanalyst.com/machine-learning/highlight-text-using-weights/

In [None]:
import html
import random
from IPython.core.display import display, HTML

In [None]:
# Escape special characters such as & and < so that the browser displays them literally instead of interpreting them as markup.
def html_escape(text):
    """Return `text` with HTML-special characters replaced by their entities."""
    escaped = html.escape(text)
    return escaped

In [None]:
def visualise_weights(tokens, gradients, max_alpha = 0.4):
  """
  Display `tokens` as one line of HTML, shading each token by its weight.

  Each token is wrapped in a <span> whose background colour is
  rgba(135,206,250,a) with a = gradients[i] / max_alpha, and the spans are
  joined with single spaces.

  tokens:    sequence of token strings (HTML-escaped before rendering).
  gradients: sequence of numeric weights, indexed in step with `tokens`.
  max_alpha: divisor applied to every weight to obtain the alpha value.

  Returns None; the HTML is rendered as a side effect.
  """
  highlighted_text = ' '.join(
      '<span style="background-color:rgba(135,206,250,' + str(gradients[i] / max_alpha) + ');">'
      + html_escape(token)
      + '</span>'
      for i, token in enumerate(tokens)
  )
  # `display` renders the HTML itself and returns None, so wrapping it in
  # print() only emitted a stray "None" line after every visualisation.
  display(HTML(highlighted_text))

# Checklist

In [None]:
# Take element 3 of each row tuple for the first 1000 rows of the test set
# (itertuples() puts the index at position 0 by default).
sample = [row[3] for row in test_set.head(1000).itertuples()]

In [None]:
tokenizer = SpacyTokenizer()
# Build the saliency interpreter once and reuse it for every sentence instead
# of re-creating it on each loop iteration.
simple_gradient = SimpleGradient(predictor)

# Show the gradient-based saliency of the first 10 sampled sentences.
for sentence in sample[:10]:
  tokens = [str(x) for x in tokenizer.tokenize(sentence)]
  grad = simple_gradient.saliency_interpret_from_json({'Description':sentence})
  gradient_list = grad['instance_1']['grad_input_1']
  visualise_weights(tokens, gradient_list, 0.4)

None


None


None


None


None


None


None


None


None


None


In [None]:
# Run the spaCy pipeline over every sampled sentence and keep the parsed documents.
pdata = [doc for doc in nlp.pipe(sample)]

In [None]:
# Build the saliency interpreter once and reuse it for every sentence.
simple_gradient = SimpleGradient(predictor)

# Slicing (rather than indexing with range(100)) also copes with a sample
# that holds fewer than 100 sentences.
for sentence in sample[:100]:
  # Evaluate the perturbation once per sentence; an empty result means there
  # is no contraction perturbation to compare against, so the sentence is skipped.
  perturbations = Perturb.contractions(sentence)
  if perturbations != []:

    # Saliency of the original sentence.
    tokens = [str(x) for x in tokenizer.tokenize(sentence)]

    grad_orig = simple_gradient.saliency_interpret_from_json({'Description':sentence})
    gradient_list_orig = np.array(grad_orig['instance_1']['grad_input_1'])

    visualise_weights(tokens, gradient_list_orig)

    # Saliency of the first perturbed variant.
    perturbed_sentence = perturbations[0]

    tokens = [str(x) for x in tokenizer.tokenize(perturbed_sentence)]

    grad_pert = simple_gradient.saliency_interpret_from_json({'Description': perturbed_sentence})
    gradient_list_pert = np.array(grad_pert['instance_1']['grad_input_1'])

    visualise_weights(tokens, gradient_list_pert)

None


None


None


None


None


None


None


None


None


None


None


None


None


None


None


None


None


None


None


None


None


None


# Manual perturbations

In [None]:
# Select the sentence at index 12 of the sample for manual perturbation;
# the bare `sentence` on the last line shows it as the cell output.
sentence = sample[12]
sentence

" AMSTERDAM (Reuters) - Free Record Shop, a Dutch music  retail chain, beat Apple Computer Inc. to market on Tuesday  with the launch of a new download service in Europe's latest  battleground for digital song services."

In [None]:
# Build the saliency interpreter once and reuse it for both sentences.
simple_gradient = SimpleGradient(predictor)

# Predicted label and saliency of the original sentence.
tokens = [str(x) for x in tokenizer.tokenize(sentence)]

print(predictor.predict(sentence=sentence)['label'])
grad_orig = simple_gradient.saliency_interpret_from_json({'Description':sentence})
gradient_list_orig = np.array(grad_orig['instance_1']['grad_input_1'])

visualise_weights(tokens, gradient_list_orig)

# example: changing week day from Tuesday to Wednesday
# The perturbed text is derived from `sentence` so that the weekday is the only
# difference; the previously hand-typed copy had also lost the leading space.
perturbed_sentence = sentence.replace("Tuesday", "Wednesday")
assert perturbed_sentence != sentence, "perturbation did not change the sentence"

# Predicted label and saliency of the perturbed sentence.
tokens = [str(x) for x in tokenizer.tokenize(perturbed_sentence)]
print(predictor.predict(sentence=perturbed_sentence)['label'])

grad_pert = simple_gradient.saliency_interpret_from_json({'Description': perturbed_sentence})
gradient_list_pert = np.array(grad_pert['instance_1']['grad_input_1'])

visualise_weights(tokens, gradient_list_pert)

In [None]:
# Spearman rank correlation between the saliency scores of the original and the perturbed sentence.
spearmanr(gradient_list_orig, gradient_list_pert)

SpearmanrResult(correlation=0.9682486631016043, pvalue=3.1728391771495316e-20)