In [1]:
%load_ext autoreload
%autoreload 2

import checklist
import spacy
import itertools

import checklist.editor
import checklist.text_generation
from checklist.test_types import MFT, INV, DIR
from checklist.expect import Expect
import numpy as np
import spacy
from checklist.test_suite import TestSuite
from checklist.perturb import Perturb
from transformers import pipeline

In [None]:
# use pipeline sentimental analysis
model = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.





In [3]:
# used to decrease size of chunk
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

# use to process the data with the sentimental model
def batch_predict(model, data, batch_size=128):
    ret = []
    for d in chunks(data, batch_size):
        ret.extend(model(d))
    return ret

In [4]:
import numpy as np

# Function to check the score
# I am not sure about the score that author has calculated.
def pred_and_conf(data):
    # change format to softmax, make everything in [0.33, 0.66] range be predicted as neutral
    preds = batch_predict(model, data)
    pr = np.array([x['score'] if x['label'] == 'POSITIVE' else 1 - x['score'] for x in preds])
    pp = np.zeros((pr.shape[0], 3))
    margin_neutral = 1/3.
    mn = margin_neutral / 2.
    neg = pr < 0.5 - mn
    pp[neg, 0] = 1 - pr[neg]
    pp[neg, 2] = pr[neg]
    pos = pr > 0.5 + mn
    pp[pos, 0] = 1 - pr[pos]
    pp[pos, 2] = pr[pos]
    neutral_pos = (pr >= 0.5) * (pr < 0.5 + mn)
    pp[neutral_pos, 1] = 1 - (1 / margin_neutral) * np.abs(pr[neutral_pos] - 0.5)
    pp[neutral_pos, 2] = 1 - pp[neutral_pos, 1]
    neutral_neg = (pr < 0.5) * (pr > 0.5 - mn)
    pp[neutral_neg, 1] = 1 - (1 / margin_neutral) * np.abs(pr[neutral_neg] - 0.5)
    pp[neutral_neg, 0] = 1 - pp[neutral_neg, 1]
    preds = np.argmax(pp, axis=1)
    return preds, pp
    

In [5]:
suite_path = 'sentiment_suite.pkl' #define suite path name
suite = TestSuite.from_file(suite_path)

In [6]:
suite.run(pred_and_conf, n=500) # run the test suite with 500 samples and check score with `pred_and_conf` function

Running single positive words
Predicting 34 examples
Running single negative words
Predicting 35 examples
Running single neutral words
Predicting 13 examples
Running Sentiment-laden words in context
Predicting 500 examples
Running neutral words in context
Predicting 500 examples
Running intensifiers
Predicting 1000 examples
Running reducers
Predicting 1000 examples
Running change neutral words with BERT
Predicting 5092 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5500 examples
Running add random urls and handles
Predicting 11000 examples
Running punctuation
Predicting 1154 examples
Running typos
Predicting 1000 examples
Running 2 typos
Predicting 1000 examples
Running contractions
Predicting 1039 examples
Running change names
Predicting 3311 examples
Running change locations
Predicting 5500 examples
Running used to, but now
Predicting 500 examples
Running "used to" should reduce
Predicting 1000 examples
Running protected: race
P

In [7]:
suite.summary() #summary the test case

Vocabulary

single positive words
Test cases:      34
Fails (rate):    0 (0.0%)


single negative words
Test cases:      35
Fails (rate):    1 (2.9%)

Example fails:
0.3 0.0 0.7 average
----


single neutral words
Test cases:      13
Fails (rate):    13 (100.0%)

Example fails:
0.0 0.0 1.0 found
----
0.0 0.0 1.0 Indian
----
0.0 0.0 1.0 see
----


Sentiment-laden words in context
Test cases:      8658
Test cases run:  500
Fails (rate):    7 (1.4%)

Example fails:
0.0 0.0 1.0 It is an average food.
----
0.5 0.5 0.0 It is a creepy service.
----
0.0 0.0 1.0 It was a weird food.
----


neutral words in context
Test cases:      1716
Test cases run:  500
Fails (rate):    485 (97.0%)

Example fails:
0.0 0.0 1.0 It was a British staff.
----
0.0 0.0 1.0 That was an Israeli cabin crew.
----
0.0 0.0 1.0 This flight was Israeli.
----


intensifiers
Test cases:      2000
Test cases run:  500
After filtering: 499 (99.8%)
Fails (rate):    5 (1.0%)

Example fails:
0.2 0.0 0.8 This was a creepy flight.
