# Creating checklist test suite for SQuAD

Source: code from https://github.com/marcotcr/checklist/blob/115f123de47ab015b2c3a6baebaffb40bab80c9f/notebooks/SQuAD.ipynb with the following changes:
- data: use squad dataset from huggingface
- model: use a trained model based on ELECTRA-small
- some other small function changes

In [1]:
%load_ext autoreload
%autoreload 2

import checklist
import spacy
import itertools

import checklist.editor
import checklist.text_generation
from checklist.test_types import MFT, INV, DIR
from checklist.expect import Expect
from checklist.test_suite import TestSuite
import numpy as np
import spacy
from checklist.perturb import Perturb


In [2]:
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser
from transformers import pipeline 

model = pipeline('question-answering')
model({
    'context': 'A new strain of flu that has the potential to become a pandemic has been identified in China by scientists.',
    'question': 'What has been discovered by scientists from China ?'
})


No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)
  fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}


{'score': 0.3915085792541504,
 'start': 0,
 'end': 19,
 'answer': 'A new strain of flu'}

In [3]:
model_name = "trained_model_squad1/"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = pipeline('question-answering', model=model, tokenizer=tokenizer)
model({
    'context': 'A new strain of flu that has the potential to become a pandemic has been identified in China by scientists.',
    'question': 'What has been discovered by scientists from China ?'
})


{'score': 0.5558424592018127,
 'start': 0,
 'end': 19,
 'answer': 'A new strain of flu'}

In [4]:
editor = checklist.editor.Editor()
editor.tg

<checklist.text_generation.TextGenerator at 0x7fa343ef3760>

In [5]:
def predconfs(context_question_pairs, qa_model=None):
    """Run the question-answering pipeline over (context, question) pairs.

    Adapted from
    https://github.com/marcotcr/checklist/blob/115f123de47ab015b2c3a6baebaffb40bab80c9f/notebooks/tutorials/5.%20Testing%20transformer%20pipelines.ipynb

    Args:
        context_question_pairs: iterable of ``(context, question)`` tuples.
        qa_model: optional callable invoked as
            ``qa_model(question=..., context=..., truncation=True)`` that
            returns a mapping with ``'answer'`` and ``'score'`` keys.
            Defaults to the notebook-level ``model`` pipeline.

    Returns:
        ``(preds, confs)``: a list of predicted answers and a numpy array of
        scores, with exactly one entry per input pair, in input order. A pair
        whose prediction raises is reported as answer ``' '`` with
        confidence ``1``.
    """
    if qa_model is None:
        qa_model = model
    preds = []
    confs = []
    for c, q in context_question_pairs:
        try:
            p = qa_model(question=q, context=c, truncation=True)
        except Exception:
            print('Failed', q)
            preds.append(' ')
            confs.append(1)
            # Skip the normal append below: otherwise a failed example would
            # add a second (stale or undefined) prediction and misalign the
            # outputs with the inputs.
            continue
        preds.append(p['answer'])
        confs.append(p['score'])
    return preds, np.array(confs)

In [6]:
def format_squad_with_context(x, pred, conf, label=None, *args, **kwargs):
    """Render one example as text, including its context.

    ``x`` is a ``(context, question)`` pair. The output has a ``C:`` line, a
    ``Q:`` line, an ``A:`` line when ``label`` is given, and a ``P:`` line for
    the prediction. ``conf`` and any extra arguments are accepted but unused.
    """
    context, question = x
    pieces = ['C: %s\nQ: %s\n' % (context, question)]
    if label is not None:
        pieces.append('A: %s\n' % label)
    pieces.append('P: %s\n' % pred)
    return ''.join(pieces)

In [7]:
def format_squad(x, pred, conf, label=None, *args, **kwargs):
    """Render one example as text, omitting its context.

    ``x`` is a ``(context, question)`` pair; only the question is shown. The
    output has a ``Q:`` line, an ``A:`` line when ``label`` is given, and a
    ``P:`` line for the prediction. ``conf`` and any extra arguments are
    accepted but unused.
    """
    _context, question = x
    pieces = ['Q: %s\n' % (question)]
    if label is not None:
        pieces.append('A: %s\n' % label)
    pieces.append('P: %s\n' % pred)
    return ''.join(pieces)

In [8]:
suite = TestSuite()

## Vocabulary

In [9]:
print(', '.join(editor.suggest('{first_name} is {mask} than {first_name2}.')[:60]))

smarter, better, older, younger, taller, worse, different, stronger, cooler, nicer, tougher, shorter, bigger, hotter, more, darker, happier, smaller, faster, richer, wiser, thinner, less, weaker, larger, quieter, cleaner, closer, healthier, heavier, colder, slower, harder, wealthier, safer, quicker, longer, higher, cheaper, thicker, louder, sharper, lighter, warmer, brighter, greater, deeper, lower, easier, softer, smoother, poorer, other, stranger, newer, stricter, simpler, clearer, superior, tighter


In [10]:
adj = [
    'old', 'smart', 'tall', 'young', 'strong', 'short', 'tough', 'cool',
    'fast', 'nice', 'small', 'dark', 'wise', 'rich', 'great', 'weak',
    'high', 'slow', 'strange', 'clean',
]
# Pair each adjective with a stem that has its trailing 'e' removed, so the
# templates can build the comparative as "<stem>er" (e.g. 'nic' -> 'nicer').
adj = list(zip((word.rstrip('e') for word in adj), adj))


In [11]:
adj[2]

('tall', 'tall')

In [12]:
t = editor.template(
    [(
    '{first_name} is {adj[0]}er than {first_name1}.',
    'Who is {adj[0]}er?'
    )
    ],
    labels = ['{first_name}'],
    adj=adj,
    remove_duplicates=True,
    nsamples=500,
    save=True
    )
name = 'A is COMP than B. Who is more COMP?'
description = ''
test = MFT(**t, name=name, description=description, capability='Vocabulary')
suite.add(test)

In [13]:
test.run(predconfs, n=100, overwrite=True)

Predicting 100 examples


In [14]:
test.summary(format_example_fn=format_squad_with_context)

Test cases:      499
Test cases run:  100
Fails (rate):    3 (3.0%)

Example fails:
C: Alice is richer than Joseph.
Q: Who is richer?
A: Alice
P: Joseph

----
C: Howard is higher than Bob.
Q: Who is higher?
A: Howard
P: Bob

----
C: Kim is younger than Laura.
Q: Who is younger?
A: Kim
P: Laura

----


In [15]:
t = editor.template(
    [(
    '{first_name} is {adj[0]}er than {first_name1}.',
    'Who is less {adj[1]}?'
    )
    ],
    labels = ['{first_name1}'],
    adj=adj,
    remove_duplicates=True,
    nsamples=500,
    save=True
    )
name = 'A is COMP than B. Who is less COMP?'
description = ''
test = MFT(**t, name=name, description=description, capability='Vocabulary')
suite.add(test)

In [16]:
test.run(predconfs, n=100)

Predicting 100 examples


In [17]:
test.summary(format_example_fn=format_squad_with_context)

Test cases:      497
Test cases run:  100
Fails (rate):    99 (99.0%)

Example fails:
C: Victoria is taller than Philip.
Q: Who is less tall?
A: Philip
P: Victoria

----
C: Richard is faster than Charles.
Q: Who is less fast?
A: Charles
P: Richard

----
C: Thomas is nicer than Larry.
Q: Who is less nice?
A: Larry
P: Thomas

----


In [18]:
def crossproduct(t):
    """Expand each template entry into every (context, question) combination.

    Each item of ``t.data`` is expected to hold a ``'contexts'`` list and a
    ``'qas'`` list of ``(question, answer)`` entries. ``t.data`` is replaced
    by one list of ``(context, question)`` pairs per item and ``t.labels`` by
    the matching lists of answers. ``t`` is modified in place and returned.
    """
    expanded_data = []
    expanded_labels = []
    for entry in t.data:
        contexts = entry['contexts']
        qas = entry['qas']
        pairs = []
        answers = []
        for context, qa in itertools.product(contexts, qas):
            pairs.append((context, qa[0]))
            answers.append(qa[1])
        expanded_data.append(pairs)
        expanded_labels.append(answers)
    t.data = expanded_data
    t.labels = expanded_labels
    return t


In [19]:
state = editor.suggest('John is very {mask} about the project.')[:20]
print(', '.join(editor.suggest('John is {mask} {state} about the project.', state=state)[:30]))
very = ['very', 'extremely', 'really', 'quite', 'incredibly', 'particularly', 'highly', 'super']
somewhat = ['a little', 'somewhat', 'slightly', 'mildly']

very, pretty, extremely, also, still, quite, more, really, not, clearly, fairly, incredibly, particularly, now, understandably, rather, cautiously, surprisingly, certainly, feeling, so, especially, definitely, generally, most, highly, super, reportedly, being, obviously


In [20]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {very} {s} about the project. {first_name1} is {s} about the project.',
            '{first_name1} is {s} about the project. {first_name} is {very} {s} about the project.',
            '{first_name} is {s} about the project. {first_name1} is {somewhat} {s} about the project.',
            '{first_name1} is {somewhat} {s} about the project. {first_name} is {s} about the project.',
            '{first_name} is {very} {s} about the project. {first_name1} is {somewhat} {s} about the project.',
            '{first_name1} is {somewhat} {s} about the project. {first_name} is {very} {s} about the project.',
        ],
        'qas': [
            (
                'Who is most {s} about the project?',
                '{first_name}'
            ), 
            (
                'Who is least {s} about the project?',
                '{first_name1}'
            ), 
            
        ]
        
    },
    s = state,
    very=very,
    somewhat=somewhat,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'Intensifiers (very, super, extremely) and reducers (somewhat, kinda, etc)?'
desc = ''
test = MFT(**t, name=name, description=desc, capability='Vocabulary')
suite.add(test)


In [21]:
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)


Predicting 1200 examples
Test cases:      498
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Jack is excited about the project. Marie is mildly excited about the project.
Q: Who is least excited about the project?
A: Marie
P: Jack

C: Jack is extremely excited about the project. Marie is excited about the project.
Q: Who is least excited about the project?
A: Marie
P: Jack

C: Jack is extremely excited about the project. Marie is mildly excited about the project.
Q: Who is least excited about the project?
A: Marie
P: Jack


----
C: Stephanie is positive about the project. Al is a little positive about the project.
Q: Who is most positive about the project?
A: Stephanie
P: Al

C: Al is a little positive about the project. Stephanie is positive about the project.
Q: Who is most positive about the project?
A: Stephanie
P: Al

C: Stephanie is particularly positive about the project. Al is a little positive about the project.
Q: Who is most positive about the project?

## Taxonomy

### Size, shape, color, age, material

In [22]:
import munch
# Property types in the order they are paired; only types listed here are
# used, so 'material' below is defined but not combined.
order = ['size', 'shape', 'age', 'color']
properties = {
    'color': ['red', 'blue', 'yellow', 'green', 'pink', 'white', 'black', 'orange', 'grey', 'purple', 'brown'],
    'size': ['big', 'small', 'tiny', 'enormous'],
    'age': ['old', 'new'],
    'shape': ['round', 'oval', 'square', 'triangular'],
    'material': ['iron', 'wooden', 'ceramic', 'glass', 'stone'],
}
# For every pair of property types (earlier in `order` first), emit one record
# per combination of their values.
props = [
    munch.Munch({'p1': p1, 'p2': p2, 'v1': v1, 'v2': v2})
    for p1, p2 in itertools.combinations(order, 2)
    for v1, v2 in itertools.product(properties[p1], properties[p2])
]


In [23]:
print(', '.join(editor.suggest('There is {a:p.v1} {p.v2} {mask} in the room.', p=props, verbose=False)[:30]))
objects = ['box', 'clock', 'table', 'object', 'toy', 'painting', 'sculpture', 'thing', 'figure']


sofa, couch, wall, carpet, chair, table, light, lamp, door, clock, mirror, desk, bed, TV, bar, television, window, box, tree, painting, curtain, fan, fridge, screen, wallpaper, piano, rug, shelf, camera, candle


In [24]:
t = crossproduct(editor.template(
    {
        'contexts': [
            'There is {a:p.v1} {p.v2} {obj} in the room.',
            'There is {a:obj} in the room. The {obj} is {p.v1} and {p.v2}.',
        ],
        'qas': [
            (
                'What {p.p1} is the {obj}?',
                '{p.v1}'
            ), 
            (
                'What {p.p2} is the {obj}?',
                '{p.v2}'
            ), 
            
        ]
        
    },
    obj=objects,
    p=props,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'size, shape, age, color'
desc = ''
test = MFT(**t, name=name, description=desc, capability='Taxonomy')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      500
Test cases run:  100
Fails (rate):    98 (98.0%)

Example fails:
C: There is a table in the room. The table is round and brown.
Q: What shape is the table?
A: round
P: round and brown

C: There is a round brown table in the room.
Q: What shape is the table?
A: round
P: round brown


----
C: There is a figure in the room. The figure is new and purple.
Q: What age is the figure?
A: new
P: new and purple

C: There is a new purple figure in the room.
Q: What age is the figure?
A: new
P: purple


----
C: There is a painting in the room. The painting is oval and grey.
Q: What shape is the painting?
A: oval
P: oval and grey

C: There is an oval grey painting in the room.
Q: What shape is the painting?
A: oval
P: oval grey


----


### Professions vs nationalities

In [25]:
# Build the profession vocabulary from the first 30 suggestions for each of
# two prompts (first name only, and first + last name).
professions = editor.suggest('{first_name} works as {a:mask}.')[:30]
professions += editor.suggest('{first_name} {last_name} works as {a:mask}.')[:30]
# De-duplicate; note that going through a set does not preserve suggestion order.
professions = list(set(professions))
# 'translator' is excluded from the vocabulary when it was suggested.
if 'translator' in professions:
    professions.remove('translator')

In [26]:
def clean(string):
    """Normalize an answer string before comparing prediction and label.

    Removes leading whitespace, any leading run of the whole words
    ``a``, ``an``, ``the``, ``in`` or ``at`` (each followed by whitespace),
    and trailing periods.

    Whole words are matched on purpose: ``str.lstrip`` with a string argument
    strips a *set of characters*, which would also eat the start of real
    answers (e.g. turning ``'teacher'`` into ``'cher'``).
    """
    import re  # local import: `re` is only imported later in the notebook

    without_prefix = re.sub(r'^\s*(?:(?:a|an|the|in|at)\s+)*', '', string)
    return without_prefix.rstrip('.')

In [27]:
def expect_squad(x, pred, conf, label=None, meta=None):
    """Return True when prediction and label are equal after `clean`.

    ``x``, ``conf`` and ``meta`` are accepted but not used.
    """
    normalized_pred = clean(pred)
    normalized_label = clean(label)
    return normalized_pred == normalized_label


# Wrap the per-example check with checklist's Expect.single so it can be
# passed as `expect=` to a test.
expect_squad = Expect.single(expect_squad)

In [28]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {a:nat} {prof}.',
            '{first_name} is {a:prof}. {first_name} is {nat}.',
            '{first_name} is {nat}. {first_name} is {a:prof}.',
            '{first_name} is {nat} and {a:prof}.',
            '{first_name} is {a:prof} and {nat}.',
        ],
        'qas': [
            (
                'What is {first_name}\'s job?',
                '{prof}'
            ), 
            (
                'What is {first_name}\'s nationality?',
                '{nat}'
            ), 
            
        ]
        
    },
    nat = editor.lexicons['nationality'][:10],
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    save=True,
    ))
name = 'Profession vs nationality'
test = MFT(**t, name=name, expect=expect_squad, description='',  capability='Taxonomy')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 1000 examples
Test cases:      500
Test cases run:  100
Fails (rate):    52 (52.0%)

Example fails:
C: Martha is a Brazilian architect.
Q: What is Martha's job?
A: architect
P: Brazilian architect


----
C: John is an Indian assistant.
Q: What is John's job?
A: assistant
P: Indian assistant

C: John is an assistant and Indian.
Q: What is John's job?
A: assistant
P: assistant and Indian


----
C: Arthur is a Russian administrator.
Q: What is Arthur's job?
A: administrator
P: Russian administrator


----


### Animal vs vehicle

In [30]:
animals = ['dog', 'cat', 'bull', 'cow', 'fish', 'serpent', 'snake', 'lizard', 'hamster', 'rabbit', 'guinea pig', 'iguana', 'duck']
vehicles = ['car', 'truck', 'train', 'motorcycle', 'bike', 'firetruck', 'tractor', 'van', 'SUV', 'minivan']
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} has {a:animal} and {a:vehicle}.',
            '{first_name} has {a:vehicle} and {a:animal}.',
        ],
        'qas': [
            (
                'What animal does {first_name} have?',
                '{animal}'
            ), 
            (
                'What vehicle does {first_name} have?',
                '{vehicle}'
            ), 
            
        ]
        
    },
    animal=animals,
    vehicle=vehicles,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'Animal vs Vehicle'
test = MFT(**t, name=name, description='', capability='Taxonomy', expect=expect_squad)
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, overwrite=True)


Predicting 400 examples
Test cases:      500
Test cases run:  100
Fails (rate):    57 (57.0%)

Example fails:
C: Alan has an iguana and a bike.
Q: What vehicle does Alan have?
A: bike
P: iguana and a bike

C: Alan has a bike and an iguana.
Q: What animal does Alan have?
A: iguana
P: bike and an iguana


----
C: Kenneth has an iguana and a van.
Q: What vehicle does Kenneth have?
A: van
P: iguana and a van

C: Kenneth has a van and an iguana.
Q: What animal does Kenneth have?
A: iguana
P: a van and an iguana


----
C: Walter has a bull and a train.
Q: What vehicle does Walter have?
A: train
P: bull and a train

C: Walter has a train and a bull.
Q: What vehicle does Walter have?
A: train
P: train and a bull


----


In [31]:
animals = ['dog', 'cat', 'bull', 'cow', 'fish', 'serpent', 'snake', 'lizard', 'hamster', 'rabbit', 'guinea pig', 'iguana', 'duck']
vehicles = ['car', 'truck', 'train', 'motorcycle', 'bike', 'firetruck', 'tractor', 'van', 'SUV', 'minivan']
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} bought {a:animal}. {first_name2} bought {a:vehicle}.',
            '{first_name2} bought {a:vehicle}. {first_name} bought {a:animal}.',
        ],
        'qas': [
            (
                'Who bought an animal?',
                '{first_name}'
            ), 
            (
                'Who bought a vehicle?',
                '{first_name2}'
            ), 
            
        ]
        
    },
    animal=animals,
    vehicle=vehicles,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'Animal vs Vehicle v2'
test = MFT(**t, name=name, description='', capability='Taxonomy', expect=expect_squad)
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, overwrite=True)

Predicting 400 examples
Test cases:      498
Test cases run:  100
Fails (rate):    24 (24.0%)

Example fails:
C: Wendy bought a motorcycle. Emma bought a fish.
Q: Who bought an animal?
A: Emma
P: Wendy


----
C: Laura bought a cat. Elaine bought a firetruck.
Q: Who bought a vehicle?
A: Elaine
P: Laura


----
C: Emma bought a fish. Evelyn bought a van.
Q: Who bought an animal?
A: Emma
P: Evelyn

C: Evelyn bought a van. Emma bought a fish.
Q: Who bought an animal?
A: Emma
P: Evelyn


----


In [34]:
synonyms = [ ('spiritual', 'religious'), ('angry', 'furious'), ('organized', 'organised'),
            ('vocal', 'outspoken'), ('grateful', 'thankful'), ('intelligent', 'smart'),
            ('humble', 'modest'), ('courageous', 'brave'), ('happy', 'joyful'), ('scared', 'frightened'),
           ]

t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is very {s1[0]}. {first_name2} is very {s2[0]}.',
            '{first_name2} is very {s2[0]}. {first_name} is very {s1[0]}.',
        ],
        'qas': [
            (
                'Who is {s1[1]}?',
                '{first_name}'
            ), 
            (
                'Who is {s2[1]}?',
                '{first_name2}'
            ), 
            
        ]
        
    },
    s=synonyms,
    remove_duplicates=True,
    nsamples=250,
    save=True
   ))
t += crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is very {s1[1]}. {first_name2} is very {s2[1]}.',
            '{first_name2} is very {s2[1]}. {first_name} is very {s1[1]}.',
        ],
        'qas': [
            (
                'Who is {s1[0]}?',
                '{first_name}'
            ), 
            (
                'Who is {s2[0]}?',
                '{first_name2}'
            ), 
            
        ]
        
    },
    s=synonyms,
    remove_duplicates=True,
    nsamples=250,
    save=True
    )) 
name = 'Synonyms'
test = MFT(**t, name=name, description='', capability='Taxonomy', expect=expect_squad)
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      444
Test cases run:  100
Fails (rate):    4 (4.0%)

Example fails:
C: Stephanie is very vocal. Angela is very courageous.
Q: Who is outspoken?
A: Stephanie
P: Angela


----
C: Harry is very happy. Dick is very angry.
Q: Who is joyful?
A: Harry
P: Dick


----
C: Kim is very joyful. Marilyn is very thankful.
Q: Who is happy?
A: Kim
P: Marilyn


----


In [35]:
comp_pairs = [('better', 'worse'), ('older', 'younger'), ('smarter', 'dumber'), ('taller', 'shorter'), ('bigger', 'smaller'), ('stronger', 'weaker'), ('faster', 'slower'), ('darker', 'lighter'), ('richer', 'poorer'), ('happier', 'sadder'), ('louder', 'quieter'), ('warmer', 'colder')]
comp_pairs = list(set(comp_pairs))#list(set(comp_pairs + [(x[1], x[0]) for x in comp_pairs]))

In [37]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {comp[0]} than {first_name1}.',
            '{first_name1} is {comp[1]} than {first_name}.',
        ],
        'qas': [
            (
                'Who is {comp[1]}?',
                '{first_name1}',
            ),
            (
                'Who is {comp[0]}?',
                '{first_name}',
            )
            
        ]
        ,
    },
    comp=comp_pairs,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'A is COMP than B. Who is antonym(COMP)? B'
test = MFT(**t, name=name, description='', capability='Taxonomy')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      498
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Patrick is darker than Eleanor.
Q: Who is lighter?
A: Eleanor
P: Patrick

C: Eleanor is lighter than Patrick.
Q: Who is darker?
A: Patrick
P: Eleanor


----
C: Grace is warmer than Kathryn.
Q: Who is colder?
A: Kathryn
P: Grace

C: Kathryn is colder than Grace.
Q: Who is warmer?
A: Grace
P: Kathryn


----
C: Albert is shorter than Matthew.
Q: Who is taller?
A: Matthew
P: Albert

C: Matthew is taller than Albert.
Q: Who is shorter?
A: Albert
P: Matthew


----


In [38]:
antonym_adjs = [('progressive', 'conservative'),('religious', 'secular'),('positive', 'negative'),('defensive', 'offensive'),('rude',  'polite'),('optimistic', 'pessimistic'),('stupid', 'smart'),('negative', 'positive'),('unhappy', 'happy'),('active', 'passive'),('impatient', 'patient'),('powerless', 'powerful'),('visible', 'invisible'),('fat', 'thin'),('bad', 'good'),('cautious', 'brave'), ('hopeful', 'hopeless'),('insecure', 'secure'),('humble', 'proud'),('passive', 'active'),('dependent', 'independent'),('pessimistic', 'optimistic'),('irresponsible', 'responsible'),('courageous', 'fearful')]
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is more {a[0]} than {first_name1}.',
            '{first_name1} is more {a[1]} than {first_name}.',
            '{first_name} is less {a[1]} than {first_name1}.',
            '{first_name1} is less {a[0]} than {first_name}.',
        ],
        'qas': [
            (
                'Who is more {a[0]}?',
                '{first_name}',
            ),
            (
                'Who is less {a[0]}?',
                '{first_name1}',
            ),
            (
                'Who is more {a[1]}?',
                '{first_name1}',
            ),
            (
                'Who is less {a[1]}?',
                '{first_name}',
            ),
        ]
        ,
    },
    a = antonym_adjs,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'A is more X than B. Who is more antonym(X)? B. Who is less X? B. Who is more X? A. Who is less antonym(X)? A.'
test = MFT(**t, name=name, description='', capability='Taxonomy')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 1600 examples
Test cases:      498
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Jason is more independent than Victoria.
Q: Who is less independent?
A: Victoria
P: Jason

C: Jason is more independent than Victoria.
Q: Who is more dependent?
A: Victoria
P: Jason

C: Jason is less dependent than Victoria.
Q: Who is more dependent?
A: Victoria
P: Jason


----
C: Suzanne is more smart than Charlotte.
Q: Who is less smart?
A: Charlotte
P: Suzanne

C: Suzanne is more smart than Charlotte.
Q: Who is more stupid?
A: Charlotte
P: Suzanne

C: Charlotte is less smart than Suzanne.
Q: Who is less stupid?
A: Suzanne
P: Charlotte


----
C: Sally is more visible than Sandra.
Q: Who is more invisible?
A: Sandra
P: Sally

C: Sally is more visible than Sandra.
Q: Who is less visible?
A: Sandra
P: Sally

C: Sally is less invisible than Sandra.
Q: Who is more invisible?
A: Sandra
P: Sally


----


## Robustness

typos

In [None]:
# original code from https://github.com/marcotcr/checklist/blob/115f123de47ab015b2c3a6baebaffb40bab80c9f/notebooks/SQuAD.ipynb

# import pickle
# data, answers =  load_squad()
# spacy_map =  pickle.load(open('/home/marcotcr/tmp/processed_squad.pkl', 'rb'))
# pairs = [(x['passage'], x['question']) for x in data]
# processed_pairs = [(spacy_map[x[0]], spacy_map[x[1]]) for x in pairs]

In [42]:
dataset = datasets.load_dataset('squad')
pairs = [(x['context'], x['question']) for x in dataset['train']]

Reusing dataset squad (/Users/syang/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [90]:
# source: https://github.com/marcotcr/checklist/blob/115f123de47ab015b2c3a6baebaffb40bab80c9f/notebooks/QQP.ipynb
# all_questions = list(all_questions)
# parsed_questions = list(nlp.pipe(all_questions))
# spacy_map = dict([(x, y) for x, y in zip(all_questions, parsed_questions)])

nlp = spacy.load('en_core_web_sm')

# Collect every distinct question and context string from the training split.
all_questions = set()
for example in dataset['train']:
    all_questions.update((example['question'], example['context']))

# Parse all texts once (this takes a while to run) and index the parsed
# documents by their original text.
parsed_questions = list(nlp.pipe(all_questions))
spacy_map = dict(zip(all_questions, parsed_questions))

# import pickle
# pickle.dump(spacy_map, open('processed_squad.pkl', 'wb'))
# pickle.dump(parsed_questions, open('processed_questions.pkl', 'wb'))
# spacy_map =  pickle.load(open('processed_squad.pkl', 'rb'))


In [91]:
processed_pairs = [(spacy_map[x[0]], spacy_map[x[1]]) for x in pairs]

In [99]:
# check
spacy_map[pairs[0][0]].ents

(Architecturally,
 Catholic,
 Atop the Main Building's,
 the Virgin Mary,
 the Main Building,
 the Main Building,
 Grotto,
 Marian,
 Lourdes,
 France,
 Saint Bernadette Soubirous,
 1858,
 3,
 the Gold Dome)

In [100]:
def question_typo(x):
    """Return the pair with a typo injected into the question only.

    ``x[0]`` is the context (returned unchanged) and ``x[1]`` is the
    question, which is passed through ``Perturb.add_typos``.
    """
    context, question = x[0], x[1]
    return (context, Perturb.add_typos(question))
t = Perturb.perturb(pairs, question_typo, nsamples=500)
test = INV(**t, name='Question typo', capability='Robustness', description='')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad)
suite.add(test, overwrite=True)

Predicting 200 examples
Test cases:      500
Test cases run:  100
Fails (rate):    17 (17.0%)

Example fails:
Q: What part of production does she do?
P: melodies and ideas

Q: What parto f production does she do?
P: comes up with melodies and ideas during production, sharing them with producers


----
Q: What truck feature is occasionally used to keep asphalt warm?
P: hot engine exhaust

Q: What truck efature is occasionally used to keep asphalt warm?
P: dump trucks


----
Q: Who commanded the defense of the Estonia to surrender?
P: the Estonian government

Q: Who cmomanded the defense of the Estonia to surrender?
P: the orders of the Estonian government


----


Contractions

In [108]:
def contractions(x):
    """Return one (context, question) pair per variant from Perturb.contractions.

    ``x[0]`` is the context (kept as-is); ``x[1]`` is the question handed to
    ``Perturb.contractions``.
    """
    perturbed = []
    for variant in Perturb.contractions(x[1]):
        perturbed.append((x[0], variant))
    return perturbed
t = Perturb.perturb(pairs, contractions, nsamples=500)
test = INV(**t, name='Question contractions', capability='Robustness', description='')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad)
suite.add(test)

Predicting 201 examples
Test cases:      500
Test cases run:  100
Fails (rate):    1 (1.0%)

Example fails:
Q: What is a requirement for equipment for the CAF?
P: suitable for a mixed-gender force

Q: What's a requirement for equipment for the CAF?
P: All equipment must be suitable for a mixed-gender force


----


Add random sentence

In [109]:
# Pool of distinct sentences taken from every parsed context; used as the
# source of distractor sentences.
random_sentences = list({
    sentence.text
    for context_doc, _ in processed_pairs
    for sentence in context_doc.sents
})

In [116]:
# check
for y in spacy_map[pairs[0][0]].sents:
    print(y.text)

Architecturally, the school has a Catholic character.
Atop the Main Building's gold dome is a golden statue of the Virgin Mary.
Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".
Next to the Main Building is the Basilica of the Sacred Heart.
Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.
It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.
At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


In [117]:
len(random_sentences)

91957

In [119]:
def add_random_sentence(x, **kwargs):
    """Create two perturbed copies of a pair by adding an unrelated sentence.

    A sentence is drawn from the notebook-level ``random_sentences`` pool
    (re-drawn while it already occurs in the context), normalized to end with
    ``'. '``, and added once at the end and once at the beginning of the
    context.

    Args:
        x: ``(context, question)`` pair of strings.
        **kwargs: accepted and ignored (presumably so ``Perturb.perturb`` can
            forward options such as ``meta=True`` -- confirm against checklist).

    Returns:
        ``(perturbed, meta)`` where ``perturbed`` is
        ``[(context + sentence, question), (sentence + context, question)]``
        and ``meta`` holds the matching description strings.
    """
    random_s = np.random.choice(random_sentences)
    while random_s in x[0]:
        random_s = np.random.choice(random_sentences)
    random_s = random_s.strip('.') + '. '
    meta = ['add to end: %s' % random_s, 'add to beg: %s' % random_s]
    # When appending, make sure the added sentence is separated from the
    # context's last sentence; plain concatenation would produce
    # "...last word.Added sentence. ".
    separator = '' if (not x[0] or x[0][-1].isspace()) else ' '
    appended = x[0] + separator + random_s
    prepended = random_s + x[0]
    return [(appended, x[1]), (prepended, x[1])], meta

def format_add(x, pred, conf, label=None, meta=None):
    """Format an example via `format_squad`, adding a 'Perturb:' line when meta is set."""
    formatted = format_squad(x, pred, conf, label, meta)
    if not meta:
        return formatted
    return formatted + 'Perturb: %s\n' % meta

t = Perturb.perturb(pairs, add_random_sentence, nsamples=500, meta=True)
test = INV(**t, name='Add random sentence to context', capability='Robustness', description='')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_add)
suite.add(test)

Predicting 300 examples
Test cases:      500
Test cases run:  100
Fails (rate):    20 (20.0%)

Example fails:
Q: The National Communications Act is expected to create and invironment favorable for what?
P: investment

Q: The National Communications Act is expected to create and invironment favorable for what?
P: more efficient service delivery
Perturb: add to beg: Li's extracurricular activities were described as "not all that outstanding". 


----
Q: How does Alaska finance its state government operations?
P: depends primarily on petroleum revenues and federal subsidies

Q: How does Alaska finance its state government operations?
P: petroleum revenues and federal subsidies
Perturb: add to beg: According to Joël Bellassen (1989), the most complex Chinese character is /𪚥 (U+2A6A5) zhé  listen (help·info), meaning "verbose" and containing sixty-four strokes; this character fell from use around the 5th century. 

Q: How does Alaska finance its state government operations?
P: petroleum rev

## NER

In [126]:
import re
def change_thing(change_fn):
    """Build a perturbation that edits the context and mirrors it in the question.

    Args:
        change_fn: callable invoked as ``change_fn(context, meta=True)``. It
            must return a falsy value when nothing can be changed, or
            ``(changed_contexts, meta)`` where each ``meta`` entry is indexable
            as ``(original_text, replacement_text)``.

    Returns:
        A function ``change_both(cq, **kwargs)`` taking a
        ``(context, question)`` pair, where ``question`` exposes ``.text``.
        It returns ``None`` when ``change_fn`` yields nothing, otherwise
        ``(pairs, meta)`` with one ``(changed_context, new_question)`` pair
        per change.
    """
    def change_both(cq, **kwargs):
        context, question = cq
        result = change_fn(context, meta=True)
        if not result:
            return None
        changed, meta = result
        ret = []
        for c, m in zip(changed, meta):
            pattern = r'\b%s\b' % re.escape(m[0])
            # Use a function replacement so the new text is inserted
            # literally; a plain string would be parsed as a regex template
            # and break on backslashes or group references.
            new_q = re.sub(pattern, lambda _match, repl=m[1]: repl, question.text)
            ret.append((c, new_q))
        return ret, meta
    return change_both
            

In [127]:
def expect_same(orig_pred, pred, orig_conf, conf, labels=None, meta=None):
    """Check that a prediction is unchanged up to the applied substitution.

    Without ``meta`` the two predictions must be identical. With ``meta``
    (indexable as ``(original_text, replacement_text)``), ``pred`` must equal
    ``orig_pred`` with whole-word occurrences of the original text replaced.
    ``orig_conf``, ``conf`` and ``labels`` are accepted but not used.
    """
    if not meta:
        return pred == orig_pred
    pattern = r'\b%s\b' % re.escape(meta[0])
    # Function replacement: insert the new text literally instead of having
    # it parsed as a regex replacement template.
    expected = re.sub(pattern, lambda _match: meta[1], orig_pred)
    return pred == expected

def format_replace(x, pred, conf, label=None, meta=None):
    """Format an example via `format_squad`, adding the 'old -> new' substitution when meta is set."""
    formatted = format_squad(x, pred, conf, label, meta)
    if not meta:
        return formatted
    return formatted + 'Perturb: %s -> %s\n' % meta

def format_replace_context(x, pred, conf, label=None, meta=None):
    """Same as format_replace, but delegates to format_squad_with_context.

    ``meta`` is interpolated into a two-slot format string, so when present it
    must be a 2-tuple (old text, new text).
    """
    base = format_squad_with_context(x, pred, conf, label, meta)
    if not meta:
        return base
    return base + 'Perturb: %s -> %s\n' % meta

In [129]:
# Perturb the (context, question) pairs with Perturb.change_names; change_thing
# wraps it so the same substitution is applied to the question text as well.
t = Perturb.perturb(processed_pairs, change_thing(Perturb.change_names), nsamples=500, meta=True)

# Invariance test: expect_same is applied pairwise, so the prediction is allowed
# to differ only by the substituted name.
test = INV(**t, name='Change name everywhere', capability='NER',
          description='', expect=Expect.pairwise(expect_same))
test.run(predconfs, n=100)
test.summary(3, format_example_fn=format_replace)
# NOTE(review): overwrite=True presumably replaces an existing test with the
# same name so the cell can be re-run -- confirm against TestSuite.add.
suite.add(test, overwrite=True)

Predicting 1100 examples
Test cases:      500
Test cases run:  100
Fails (rate):    8 (8.0%)

Example fails:
Q: Which Executive Order first contained the phrase "affirmative action"?
P: 10925

Q: Which Executive Order first contained the phrase "affirmative action"?
P: President Joseph Peterson Executive Order 10925
Perturb: John F. Kennedy's -> Joseph Peterson


----
Q: For whose cult were the Vestals appointed as priestesses?
P: Livia

Q: For whose cult were the Vestals appointed as priestesses?
P: the deified Leah
Perturb: Livia -> Leah

Q: For whose cult were the Vestals appointed as priestesses?
P: the deified Haley
Perturb: Livia -> Haley


----
Q: What did some commentators think about Darwin changing the phrasing in his book?
P: a concession to religion

Q: What did some commentators think about Luke changing the phrasing in his book?
P: a concession to religion that Luke later regretted
Perturb: Darwin -> Luke


----


In [130]:
# Same setup as the name test, but the substitution comes from
# Perturb.change_location; change_thing mirrors it into the question text.
t = Perturb.perturb(processed_pairs, change_thing(Perturb.change_location), nsamples=500, meta=True)

# Invariance test: expect_same is applied pairwise, so the prediction is allowed
# to differ only by the substituted location.
test = INV(**t, name='Change location everywhere', capability='NER',
          description='', expect=Expect.pairwise(expect_same))
test.run(predconfs, n=100)
test.summary(3, format_example_fn=format_replace)
# NOTE(review): overwrite=True presumably replaces an existing test with the
# same name so the cell can be re-run -- confirm against TestSuite.add.
suite.add(test, overwrite=True)

Predicting 1100 examples
Test cases:      500
Test cases run:  100
Fails (rate):    15 (15.0%)

Example fails:
Q: What was Greece's largest port as measured by good transported in 2010?
P: Aghioi Theodoroi

Q: What was United Kingdom's largest port as measured by good transported in 2010?
P: port of Aghioi Theodoroi
Perturb: Greece -> United Kingdom

Q: What was Malaysia's largest port as measured by good transported in 2010?
P: port of Aghioi Theodoroi
Perturb: Greece -> Malaysia


----
Q: Where are the indigenous peoples referred to as "American Indians"?
P: contiguous United States

Q: Where are the indigenous peoples referred to as "American Indians"?
P: the contiguous Russian Federation
Perturb: United States -> Russian Federation

Q: Where are the indigenous peoples referred to as "American Indians"?
P: the contiguous Turkey
Perturb: United States -> Turkey


----
Q: Is the Suez Canal natural?
P: artificial sea-level waterway

Q: Is the Suez Canal natural?
P: an artificial sea-le

## Temporal

In [131]:
# Temporal MFT: two people shared a profession, then {first_name} switched to
# {prof2}; the question asks who holds {prof2} now, and the label is {first_name}.
# The two contexts differ only in the order the names are first mentioned.
# NOTE(review): crossproduct is defined earlier in the notebook; the recorded
# "Predicting N examples" counts are consistent with it pairing every context
# with every (question, answer) -- confirm.
t = crossproduct(editor.template(
    {
        'contexts': [
            'Both {first_name} and {first_name2} were {prof1}s, but there was a change in {first_name}, who is now {a:prof2}.',
            'Both {first_name2} and {first_name} were {prof1}s, but there was a change in {first_name}, who is now {a:prof2}.',
        ],
        'qas': [
            (
                'Who is {a:prof2}?',
                '{first_name}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'There was a change in profession'
test = MFT(**t, expect=expect_squad, capability='Temporal', name=name, description='' )
# Only 100 of the generated test cases are evaluated.
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 200 examples
Test cases:      476
Test cases run:  100
Fails (rate):    0 (0.0%)


In [132]:
# Temporal MFT: the context states who took up a profession before/after whom;
# the questions ask who was first (label {first_name}) and who was last
# (label {first_name2}).
# The article is written as {a:prof}, like every other template in this
# notebook, so vowel-initial professions render as "an educator" rather than
# the ungrammatical "a educator".
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} became {a:prof} before {first_name2} did.',
            '{first_name2} became {a:prof} after {first_name} did.',
        ],
        'qas': [
            (
                'Who became {a:prof} first?',
                '{first_name}'
            ),
            (
                'Who became {a:prof} last?',
                '{first_name2}'
            ),
        ]

    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Understanding before / after -> first / last.'
test = MFT(**t, expect=expect_squad, capability='Temporal', name=name, description='' )
# Only 100 of the generated test cases are evaluated.
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)


Predicting 400 examples
Test cases:      495
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Julia became a writer before Alexander did.
Q: Who became a writer first?
A: Julia
P: Julia became a writer before Alexander

C: Julia became a writer before Alexander did.
Q: Who became a writer last?
A: Alexander
P: Julia became a writer before Alexander

C: Alexander became a writer after Julia did.
Q: Who became a writer first?
A: Julia
P: Alexander


----
C: Matthew became a producer before Jim did.
Q: Who became a producer first?
A: Matthew
P: Jim

C: Jim became a producer after Matthew did.
Q: Who became a producer last?
A: Jim
P: Matthew


----
C: Hugh became a photographer before Jim did.
Q: Who became a photographer first?
A: Hugh
P: Jim

C: Jim became a photographer after Hugh did.
Q: Who became a photographer first?
A: Hugh
P: Jim


----


## Negation

Negation in the context:

In [133]:
# Negation MFT where the negation appears in the context: {first_name} is NOT
# the profession and {first_name2} is. One question asks the positive form
# (label {first_name2}), the other the negated form (label {first_name}).
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is not {a:prof}. {first_name2} is.',
            '{first_name2} is {a:prof}. {first_name} is not.',
        ],
        'qas': [
            (
                'Who is {a:prof}?',
                '{first_name2}'
            ), 
            (
                'Who is not {a:prof}?',
                '{first_name}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Negation in context, may or may not be in question'
test = MFT(**t, expect=expect_squad, capability='Negation', name=name, description='' )
# Only 100 of the generated test cases are evaluated.
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      498
Test cases run:  100
Fails (rate):    85 (85.0%)

Example fails:
C: Ann is not a DJ. Frank is.
Q: Who is a DJ?
A: Frank
P: Ann


----
C: Diane is not an entrepreneur. Mike is.
Q: Who is an entrepreneur?
A: Mike
P: Diane


----
C: Lawrence is not a secretary. Edward is.
Q: Who is a secretary?
A: Edward
P: Lawrence


----


Negation not in the context (it appears only in the question):

In [134]:

# Negation MFT where the context is purely affirmative: {first_name} has {prof}
# and {first_name2} has {prof2}. Each profession gets a positive question
# (label: its holder) and a negated question (label: the other person).
# NOTE(review): unlike the neighbouring cells, this call does not pass
# save=True -- confirm whether that is intentional.
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {a:prof}. {first_name2} is {a:prof2}.',
            '{first_name2} is {a:prof2}. {first_name} is {a:prof}.',
        ],
        'qas': [
            (
                'Who is {a:prof}?',
                '{first_name}'
            ), 
            (
                'Who is not {a:prof}?',
                '{first_name2}'
            ), 
            (
                'Who is {a:prof2}?',
                '{first_name2}'
            ), 
            (
                'Who is not {a:prof2}?',
                '{first_name}'
            ), 
        ]
        
    },
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Negation in question only.'
test = MFT(**t, expect=expect_squad, capability='Negation', name=name, description='' )
# Only 100 of the generated test cases are evaluated.
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 800 examples
Test cases:      481
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Carol is a model. Jerry is an author.
Q: Who is not a model?
A: Jerry
P: Carol

C: Carol is a model. Jerry is an author.
Q: Who is not an author?
A: Carol
P: Jerry

C: Jerry is an author. Carol is a model.
Q: Who is not a model?
A: Jerry
P: Carol


----
C: Judith is an analyst. Ruth is an interpreter.
Q: Who is not an analyst?
A: Ruth
P: Judith

C: Judith is an analyst. Ruth is an interpreter.
Q: Who is not an interpreter?
A: Judith
P: Ruth

C: Ruth is an interpreter. Judith is an analyst.
Q: Who is not an analyst?
A: Ruth
P: Judith


----
C: Claire is a reporter. Edith is an entrepreneur.
Q: Who is not a reporter?
A: Edith
P: Claire

C: Claire is a reporter. Edith is an entrepreneur.
Q: Who is not an entrepreneur?
A: Claire
P: Edith

C: Edith is an entrepreneur. Claire is a reporter.
Q: Who is not a reporter?
A: Edith
P: Claire


----


## Fairness spinoff

In [135]:
import collections
# Restrict the fairness probe to four professions.
fewer_profs = ['doctor', 'nurse', 'secretary', 'CEO']
# Each template is a flat 5-tuple:
#   (context, question, answer, sex of the correct answer, profession).
# The first template's answer is the {female} name (tagged 'woman'), the
# second's is the {male} name (tagged 'man').
t = editor.template(
    [
        ('{male} is not {a:prof}, {female} is.', 'Who is {a:prof}?', '{female}', 'woman', '{prof}'),
        ('{female} is not {a:prof}, {male} is.', 'Who is {a:prof}?', '{male}', 'man', '{prof}'),
    ],
#     prof=professions + ['doctor'],
    prof=fewer_profs,
    remove_duplicates=True,
    nsamples=1000,
    unroll=True,
    save=True,
    )
# Split each generated 5-tuple into model input, gold label, and
# (sex, profession) metadata used later for the per-group breakdown.
data = [(d[0], d[1]) for d in t.data]
labels = [d[2] for d in t.data]
meta = [(d[3], d[4]) for d in t.data]

test = MFT(data, expect=expect_squad, labels=labels, meta=meta, templates=t.templates,
          name='M/F failure rates should be similar for different professions', capability='Fairness',
          description='Using negation in context.')
# Only 100 of the generated examples are evaluated.
test.run(predconfs, n=100)

def print_fair(test):
    """Print per-profession failure rates for men and women.

    Args:
        test: object exposing ``meta`` (a sequence of hashable
            ``(sex, profession)`` pairs, with sex being ``'man'`` or
            ``'woman'``) and ``fail_idxs()`` (indices into ``meta`` of the
            failing examples).

    Only professions with at least one failure are listed, sorted by the larger
    of the two rates, highest first. The ``(count)`` column is the number of
    ``('man', profession)`` entries in ``meta``.

    NOTE(review): rates are divided by the number of entries in ``meta``, not
    by the number of examples actually run; when the test was run on a sample
    (``n=...``) the rates are presumably underestimated -- confirm.
    """
    c = collections.Counter(test.meta)
    # Index meta directly instead of round-tripping through a numpy array,
    # which would coerce every field to a string before counting.
    fail = collections.Counter(tuple(test.meta[i]) for i in test.fail_idxs())
    profs = {prof for _sex, prof in fail}

    def get_fail(key):
        # A group can be absent from meta altogether (e.g. a profession that
        # only ever appears with one sex); report 0.0 instead of dividing by 0.
        total = c[key]
        return fail[key] / total if total else 0.0

    prof_fail = {
        prof: (get_fail(('man', prof)), get_fail(('woman', prof)))
        for prof in profs
    }
    print('%-13s fail_men fail_women (count)' % 'profession')
    ranked = sorted(prof_fail.items(), key=lambda item: max(item[1]), reverse=True)
    for prof, (fail_m, fail_f) in ranked:
        print('%-13s   %.1f      %.1f     (%d)' % (prof, 100 * fail_m, 100 * fail_f, c[('man', prof)]))
# Print the per-profession men/women failure table for the fairness test just
# run, then register the test in the suite.
print_fair(test)
suite.add(test)

Predicting 100 examples
profession    fail_men fail_women (count)
nurse           5.1      5.1     (275)
secretary       5.1      5.1     (236)
CEO             4.2      3.3     (239)
doctor          3.6      3.6     (250)


## Coref

Basic coref

In [136]:
# Drop the gendered noun 'actress' before building the coref templates, which
# pair professions with both "he" and "she"; the membership check avoids an
# error when it is absent (e.g. when the cell is re-run).
if 'actress' in professions:
    professions.remove('actress')

In [137]:
# Coref MFT: a male and a female name are introduced, then "he" is tied to
# {prof1} and "she" to {prof2}. The four contexts vary the order of the names
# and the order of the pronoun clauses; the labels are {male} for {prof1} and
# {female} for {prof2}.
t = crossproduct(editor.template(
    {
        'contexts': [
            '{male} and {female} are friends. He is {a:prof1}, and she is {a:prof2}.',
            '{female} and {male} are friends. He is {a:prof1}, and she is {a:prof2}.',
            '{male} and {female} are friends. She is {a:prof2}, and he is {a:prof1}.',
            '{female} and {male} are friends. She is {a:prof2}, and he is {a:prof1}.',
        ],
        'qas': [
            (
                'Who is {a:prof1}?',
                '{male}'
            ), 
            (
                'Who is {a:prof2}?',
                '{female}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Basic coref, he / she'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='Coref')
# Only 100 of the generated test cases are evaluated.
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 800 examples
Test cases:      485
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Chris and Elizabeth are friends. He is an educator, and she is an administrator.
Q: Who is an educator?
A: Chris
P: Chris and Elizabeth

C: Chris and Elizabeth are friends. He is an educator, and she is an administrator.
Q: Who is an administrator?
A: Elizabeth
P: Chris and Elizabeth

C: Elizabeth and Chris are friends. He is an educator, and she is an administrator.
Q: Who is an educator?
A: Chris
P: Elizabeth and Chris


----
C: Jeff and Emily are friends. He is a secretary, and she is an engineer.
Q: Who is a secretary?
A: Jeff
P: Jeff and Emily

C: Jeff and Emily are friends. He is a secretary, and she is an engineer.
Q: Who is an engineer?
A: Emily
P: secretary

C: Emily and Jeff are friends. He is a secretary, and she is an engineer.
Q: Who is a secretary?
A: Jeff
P: Emily and Jeff


----
C: Martin and Evelyn are friends. He is an economist, and she is a writer.
Q: W

In [138]:
# Coref MFT on possessive pronouns, built in two halves of 250 samples each.
# First half: "His mom is ..." -- the label is the {male} name, in either name order.
t = crossproduct(editor.template(
    {
        'contexts': [
            '{male} and {female} are friends. His mom is {a:prof}.',
            '{female} and {male} are friends. His mom is {a:prof}.',
        ],
        'qas': [
            (
                'Whose mom is {a:prof}?',
                '{male}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=250,
    ))
# Second half: "Her mom is ..." -- the label is the {female} name. The result
# is appended to the first half with +=.
t += crossproduct(editor.template(
    {
        'contexts': [
            '{male} and {female} are friends. Her mom is {a:prof}.',
            '{female} and {male} are friends. Her mom is {a:prof}.',
        ],
        'qas': [
            (
                'Whose mom is {a:prof}?',
                '{female}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=250,
    ))

name = 'Basic coref, his / her'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='Coref')
# Only 100 of the combined test cases are evaluated.
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 200 examples
Test cases:      500
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Dick and Marie are friends. His mom is an engineer.
Q: Whose mom is an engineer?
A: Dick
P: Dick and Marie

C: Marie and Dick are friends. His mom is an engineer.
Q: Whose mom is an engineer?
A: Dick
P: Marie and Dick


----
C: Mark and Pamela are friends. Her mom is a model.
Q: Whose mom is a model?
A: Pamela
P: Mark and Pamela

C: Pamela and Mark are friends. Her mom is a model.
Q: Whose mom is a model?
A: Pamela
P: Pamela and Mark


----
C: Ryan and Ellen are friends. His mom is an assistant.
Q: Whose mom is an assistant?
A: Ryan
P: Ryan and Ellen

C: Ellen and Ryan are friends. His mom is an assistant.
Q: Whose mom is an assistant?
A: Ryan
P: Ellen and Ryan


----


Former, latter

In [139]:
# Coref MFT on "the former" / "the latter". In all four contexts {first_name}
# is the one holding {prof1}: it is "the former" when mentioned first and
# "the latter" when mentioned second. The last two contexts additionally give
# the other person {prof2} as a distractor.
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} and {first_name2} are friends. The former is {a:prof1}.',
            '{first_name2} and {first_name} are friends. The latter is {a:prof1}.',
            '{first_name} and {first_name2} are friends. The former is {a:prof1} and the latter is {a:prof2}.',
            '{first_name2} and {first_name} are friends. The former is {a:prof2} and the latter is {a:prof1}.',
        ],
        'qas': [
            (
                'Who is {a:prof1}?',
                '{first_name}'
            ), 
        ]
        
    },
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'Former / Latter'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='Coref')
# Only 100 of the generated test cases are evaluated.
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      480
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Ron and Pamela are friends. The former is an architect.
Q: Who is an architect?
A: Ron
P: Ron and Pamela

C: Pamela and Ron are friends. The latter is an architect.
Q: Who is an architect?
A: Ron
P: Pamela and Ron

C: Ron and Pamela are friends. The former is an architect and the latter is an economist.
Q: Who is an architect?
A: Ron
P: an economist


----
C: Ralph and Kathryn are friends. The former is a reporter.
Q: Who is a reporter?
A: Ralph
P: Ralph and Kathryn

C: Kathryn and Ralph are friends. The latter is a reporter.
Q: Who is a reporter?
A: Ralph
P: Kathryn and Ralph

C: Ralph and Kathryn are friends. The former is a reporter and the latter is an assistant.
Q: Who is a reporter?
A: Ralph
P: an assistant


----
C: Helen and Kevin are friends. The former is a secretary.
Q: Who is a secretary?
A: Helen
P: Helen and Kevin

C: Kevin and Helen are friends. The latter i

## SRL

In [141]:
import pattern
import pattern.en
pverb = ['love', 'hate', 'like', 'remember', 'recognize', 'trust', 'deserve', 'understand', 'blame', 'dislike', 'prefer', 'follow', 'notice', 'hurt', 'bother', 'support', 'believe', 'accept', 'attack']
# Take the tense descriptors of two example forms ('loves' and 'stolen') and
# conjugate every verb into both, so each pverb entry becomes a pair:
# v[0] is used in the active sentences (e.g. "hates") and v[1] after "is"
# in the passive ones (e.g. "hated").
a = pattern.en.tenses('loves')[0]
b = pattern.en.tenses('stolen')[0]
pverb = [(pattern.en.conjugate(v, *a), pattern.en.conjugate(v, *b)) for v in pverb]

# SRL MFT: the same relation is stated in active and passive voice; the model
# must return the agent ({first_name}) for the active question and the object
# ({first_name2}) for the passive question.
# NOTE(review): unlike most neighbouring cells, this call does not pass
# save=True -- confirm whether that is intentional.
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} {v[0]} {first_name2}.',
            '{first_name2} is {v[1]} by {first_name}.',
        ],
        'qas': [
            (
                'Who {v[0]}?',
                '{first_name}'
            ), 
            (
                'Who is {v[1]}?',
                '{first_name2}'
            ), 
        ]
        
    },
    v=pverb,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Agent / object distinction'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='SRL')
# Only 100 of the generated test cases are evaluated.
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      499
Test cases run:  100
Fails (rate):    98 (98.0%)

Example fails:
C: Ian hates Stephen.
Q: Who hates?
A: Ian
P: Stephen

C: Stephen is hated by Ian.
Q: Who is hated?
A: Stephen
P: Ian


----
C: Fred understands Edith.
Q: Who is understood?
A: Edith
P: Fred understands Edith

C: Edith is understood by Fred.
Q: Who is understood?
A: Edith
P: Fred


----
C: William is noticed by Edith.
Q: Who is noticed?
A: William
P: Edith


----


In [142]:
# SRL MFT with a chain of three people: {first_name} acts on {first_name2},
# who in turn acts on {first_name3}. The four contexts are the active/passive
# combinations of those two statements; the four questions ask for the agent
# and the object of each link, in active and passive form.
# v[0] / v[1] are the active and passive verb forms from the pverb pairs.
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} {v[0]} {first_name2}. {first_name2} {v[0]} {first_name3}.',
            '{first_name} {v[0]} {first_name2}. {first_name3} is {v[1]} by {first_name2}.',
            '{first_name2} is {v[1]} by {first_name}. {first_name2} {v[0]} {first_name3}.',
            '{first_name2} is {v[1]} by {first_name}. {first_name3} is {v[1]} by {first_name2}.',
        ],
        'qas': [
            (
                'Who {v[0]} {first_name2}?',
                '{first_name}'
            ), 
            (
                'Who {v[0]} {first_name3}?',
                '{first_name2}'
            ), 
            (
                'Who is {v[1]} by {first_name}?',
                '{first_name2}'
            ), 
            (
                'Who is {v[1]} by {first_name2}?',
                '{first_name3}'
            ), 
        ]
        
    },
    save=True,
    v=pverb,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Agent / object distinction with 3 agents'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='SRL')
# Only 100 of the generated test cases are evaluated.
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)


Predicting 1600 examples
Test cases:      492
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Ryan deserves Nicole. Nicole deserves Alfred.
Q: Who deserves Nicole?
A: Ryan
P: Alfred

C: Ryan deserves Nicole. Alfred is deserved by Nicole.
Q: Who deserves Alfred?
A: Nicole
P: Ryan

C: Ryan deserves Nicole. Alfred is deserved by Nicole.
Q: Who is deserved by Ryan?
A: Nicole
P: Alfred


----
C: Sharon dislikes Christopher. Christopher dislikes Deborah.
Q: Who dislikes Christopher?
A: Sharon
P: Deborah

C: Sharon dislikes Christopher. Christopher dislikes Deborah.
Q: Who dislikes Deborah?
A: Christopher
P: Sharon

C: Sharon dislikes Christopher. Deborah is disliked by Christopher.
Q: Who dislikes Deborah?
A: Christopher
P: Sharon


----
C: Grace likes Richard. Sally is liked by Richard.
Q: Who likes Sally?
A: Richard
P: Grace

C: Grace likes Richard. Sally is liked by Richard.
Q: Who is liked by Grace?
A: Richard
P: Richard. Sally

C: Richard is liked by Grace. Richard

In [143]:
# Save the assembled suite to disk under `path`.
path = 'squad_suite.pkl'
suite.save(path)

In [144]:
# Print the summary of every test in the suite, showing up to 3 examples each
# formatted with their context.
suite.summary(n=3, format_example_fn=format_squad_with_context)

Vocabulary

A is COMP than B. Who is more COMP?
Test cases:      499
Test cases run:  100
Fails (rate):    3 (3.0%)

Example fails:
C: Kim is younger than Laura.
Q: Who is younger?
A: Kim
P: Laura

----
C: Alice is richer than Joseph.
Q: Who is richer?
A: Alice
P: Joseph

----
C: Howard is higher than Bob.
Q: Who is higher?
A: Howard
P: Bob

----


A is COMP than B. Who is less COMP?
Test cases:      497
Test cases run:  100
Fails (rate):    99 (99.0%)

Example fails:
C: Kathryn is shorter than Jessica.
Q: Who is less short?
A: Jessica
P: Kathryn

----
C: Kevin is younger than Louise.
Q: Who is less young?
A: Louise
P: Kevin

----
C: Joe is weaker than Linda.
Q: Who is less weak?
A: Linda
P: Joe

----


Intensifiers (very, super, extremely) and reducers (somewhat, kinda, etc)?
Test cases:      498
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Roger is very concerned about the project. Kelly is concerned about the project.
Q: Who is least concerned about the proj

In [145]:
# Re-print the fairness breakdown, this time looking the test up by name in the suite.
print_fair(suite.tests['M/F failure rates should be similar for different professions'])

profession    fail_men fail_women (count)
nurse           5.1      5.1     (275)
secretary       5.1      5.1     (236)
CEO             4.2      3.3     (239)
doctor          3.6      3.6     (250)


In [147]:
import json

In [150]:
# Serialise each example x, indexed as (x[0], x[1]), to a JSON string with
# 'passage' and 'question' keys, and export the suite's raw examples to squad.jsonl.
format_fn = lambda x: json.dumps({'passage': x[0], 'question': x[1]})
suite.to_raw_file('squad.jsonl', format_fn=format_fn)

In [151]:

# Second export: here format_fn returns a plain dict (not a JSON string) and
# the file is written with file_format='squad'.
format_fn = lambda x: {'passage': x[0], 'question': x[1]}
suite.to_raw_file('squad.json', format_fn=format_fn, file_format='squad')

In [185]:
# Flatten every test's raw examples into one list and record, per test name,
# the half-open (start, end) index range its examples occupy in that list.
test_ranges = {}
current_idx = 0
all_examples = []
for name, t in suite.tests.items():
    examples = t.to_raw_examples()
    # Fill the local dict as well as suite.test_ranges: the local was
    # initialised above but previously never populated.
    test_ranges[name] = suite.test_ranges[name] = (current_idx, current_idx + len(examples))
    current_idx += len(examples)
    all_examples.extend(examples)


In [186]:
all_examples

["('Victoria is stranger than Frank.', 'Who is stranger?')",
 "('Ashley is stronger than Alan.', 'Who is stronger?')",
 "('Bobby is nicer than Thomas.', 'Who is nicer?')",
 "('Benjamin is slower than Simon.', 'Who is slower?')",
 "('Sally is older than Melissa.', 'Who is older?')",
 "('Martha is richer than Hugh.', 'Who is richer?')",
 "('Charlie is nicer than Colin.', 'Who is nicer?')",
 "('Ron is nicer than Christopher.', 'Who is nicer?')",
 "('Ken is cooler than Al.', 'Who is cooler?')",
 "('Jane is smarter than Alice.', 'Who is smarter?')",
 "('Angela is older than Amanda.', 'Who is older?')",
 "('Fred is slower than Lucy.', 'Who is slower?')",
 "('Don is faster than Patricia.', 'Who is faster?')",
 "('Judith is richer than Jessica.', 'Who is richer?')",
 "('Samuel is cleaner than Bob.', 'Who is cleaner?')",
 "('Edith is weaker than Lisa.', 'Who is weaker?')",
 "('Kathy is smaller than Walter.', 'Who is smaller?')",
 "('Michael is faster than Rose.', 'Who is faster?')",
 "('Ed is c