**NOTE on Installation of `neuralcoref`:**

First I installed the neuralcoref using the command: `conda install conda-forge::neuralcoref` </br> This will install neuralcoref V4.

Then I ran `import neuralcoref` which downloaded a 41MB file to fix the `spacy.strings.StringStore size changed` problem.

At last, I encountered with the `kernel died` problem which I fixed using `spacy==2.1.0` which was not provided using `conda` so I installed it using `pip`.

Tadaaaa! It's show time.

# Check the Requirements

In [1]:
import spacy
spacy.__version__

'2.1.0'

In [2]:
import neuralcoref
neuralcoref.__version__

'4.0.0'

An example of how `neuralcoref` works:

In [3]:
from IPython.display import clear_output

!python -m spacy download en
clear_output()

In [4]:
nlp = spacy.load('en')
neuralcoref.add_to_pipe(nlp)

doc1 = nlp('My sister has a dog. She loves him.')
print(doc1._.coref_clusters)

doc2 = nlp('Angela lives in Boston. She is quite happy in that city.')
for ent in doc2.ents:
    print(ent._.coref_cluster)

[My sister: [My sister, She], a dog: [a dog, him]]
Angela: [Angela, She]
Boston: [Boston, that city]


In [5]:
doc2._.coref_clusters[0]

Angela: [Angela, She]

# BLIMP Data Preprocessing
The code is provided by ValueZeroing repository.

In [17]:
import numpy as np
from datasets import load_dataset, Dataset, concatenate_datasets
# import spacy
# import neuralcoref
from transformers import AutoTokenizer
import sys, os
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(sys.modules[__name__].__file__), "../..")))
# from utils.utils import MODEL_PATH

Expanding the contractions of the dataset:

In [7]:
def expand_contraction_process(example):
    example['sentence_good'] = example['sentence_good'].replace("wouldn't", "would not")
    example['sentence_bad'] = example['sentence_bad'].replace("wouldn't", "would not")

    example['sentence_good'] = example['sentence_good'].replace("couldn't", "could not")
    example['sentence_bad'] = example['sentence_bad'].replace("couldn't", "could not")

    example['sentence_good'] = example['sentence_good'].replace("shouldn't", "should not")
    example['sentence_bad'] = example['sentence_bad'].replace("shouldn't", "should not")

    example['sentence_good'] = example['sentence_good'].replace("won't", "will not")
    example['sentence_bad'] = example['sentence_bad'].replace("won't", "will not")

    example['sentence_good'] = example['sentence_good'].replace("can't", "cannot")
    example['sentence_bad'] = example['sentence_bad'].replace("can't", "cannot")

    example['sentence_good'] = example['sentence_good'].replace("don't", "do not")
    example['sentence_bad'] = example['sentence_bad'].replace("don't", "do not")

    example['sentence_good'] = example['sentence_good'].replace("doesn't", "does not")
    example['sentence_bad'] = example['sentence_bad'].replace("doesn't", "does not")

    example['sentence_good'] = example['sentence_good'].replace("didn't", "did not")
    example['sentence_bad'] = example['sentence_bad'].replace("didn't", "did not")

    example['sentence_good'] = example['sentence_good'].replace("isn't", "is not")
    example['sentence_bad'] = example['sentence_bad'].replace("isn't", "is not")

    example['sentence_good'] = example['sentence_good'].replace("aren't", "are not")
    example['sentence_bad'] = example['sentence_bad'].replace("aren't", "are not")

    example['sentence_good'] = example['sentence_good'].replace("wasn't", "was not")
    example['sentence_bad'] = example['sentence_bad'].replace("wasn't", "was not")

    example['sentence_good'] = example['sentence_good'].replace("weren't", "were not")
    example['sentence_bad'] = example['sentence_bad'].replace("weren't", "were not")

    example['sentence_good'] = example['sentence_good'].replace("hasn't", "has not")
    example['sentence_bad'] = example['sentence_bad'].replace("hasn't", "has not")

    example['sentence_good'] = example['sentence_good'].replace("haven't", "have not")
    example['sentence_bad'] = example['sentence_bad'].replace("haven't", "have not")

    example['sentence_good'] = example['sentence_good'].replace("hadn't", "had not")
    example['sentence_bad'] = example['sentence_bad'].replace("hadn't", "had not")

    return example


The Subject-Verb Agreement processing step:

In [8]:
# feed each subset (task) into the processing function
def sva_process(raw_data):
    data = {}
    data['sentence_good'] = [] 
    data['sentence_bad'] = []
    data['good_word'] = [] # the target word in good sentence (correct word)
    data['bad_word'] = [] # the target word in bad sentence (wrong word)
    data['target_index'] = [] # the [MASK] token which we want to predict, this is the difference between two sentences
    
    # the index of cue word, ex: for Subject-Verb agreement it's the subject word which help us to predict the verb correctly
    data['cue_indices'] = [] 
    data['labels'] = []
    # for each pair of data in the specific task
    for example in raw_data:
        # creating a spaCy document object by processing each sentence
        doc_good = nlp(example['sentence_good'])
        doc_bad = nlp(example['sentence_bad'])
        
        cue_index = -1
        sentence_good = []
        sentence_bad = []
        # loop for number of tokens in the sentence
        for i in range(len(doc_good)):
            # add the str of each token to lists
            sentence_good.append(doc_good[i].text)
            sentence_bad.append(doc_bad[i].text)
            # "nsubj" -> nominal subject: It's a grammatical dependency relation that indicates the noun phrase
            # (or other nominal expression) that functions as the subject of the sentence.
            if doc_good[i].dep_ == "nsubj" and cue_index == -1:
                cue_index = i
            if doc_good[i].text != doc_bad[i].text:
                target_index = i
                good_word = doc_good[i].text
                bad_word = doc_bad[i].text
                
                tag = doc_good[target_index].tag_
                if tag == 'VBZ':
                    tag = 'singular'
                elif tag == 'VBP':
                    tag = 'plural'

                # fix wrong tags of SpaCy
                elif tag == "NN": # it's vice versa becuse spacy consideres verb as a noun, so those plural nouns are actually a singular verb e.g., works
                    tag = "plural"
                elif tag == "NNS":
                    tag = "singular"
                elif doc_good[target_index].tag_ not in ['VBZ', 'VBP'] and doc_bad[target_index].tag_ in ['VBZ', 'VBP']:
                    if doc_bad[target_index].tag_ == "VBZ":
                        tag = "plural"
                    elif doc_bad[target_index].tag_ == "VBP":
                        tag = "singular"

                elif tag not in ['VBZ', 'VBP']: #correcting exceptions
                    if doc_good[target_index].text in ["was", "upsets", "hurts", "bores", "vanishes", "distracts", "kisses", "boycotts", "scares"]:
                        tag = "singular"
                    elif doc_good[target_index].text in ["were", "upset", "hurt", "bore", "vanish", "distract", "kiss", "boycott", "scare"]:
                        tag = "plural"
                    else:
                        print(doc_good[target_index].text, doc_bad[target_index].text)


        if target_index == -1 or cue_index == -1:
            continue
        # in case the candidate [MASK] does not not consists of a single token (for the specific tokenizer)
        if len(tokenizer_bert.tokenize(good_word)) > 1 or len(tokenizer_bert.tokenize(bad_word)) > 1:
            continue

        data['sentence_good'].append(sentence_good) # list of lists contains of tokens in each sentence parsed by spacy
        data['sentence_bad'].append(sentence_bad)
        data['target_index'].append(target_index) 
        data['cue_indices'].append([cue_index])
        data['good_word'].append(good_word)
        data['bad_word'].append(bad_word)
        data['labels'].append(tag) # plurality/singularity of target word for the good sentence

    return Dataset.from_dict(data)


The Determiner-Noun Agreement processing step:

In [9]:
def det_process(raw_data):
    data = {}
    data['sentence_good'] = []
    data['sentence_bad'] = []
    data['good_word'] = []
    data['bad_word'] = []
    data['target_index'] = []
    data['cue_indices'] = []
    data['labels'] = []
    for ex, example in enumerate(raw_data):
        doc_good = nlp(example['sentence_good'])
        doc_bad = nlp(example['sentence_bad'])
        edges = [] # a list contains all edges in the graph of dependency tree
        # iterate over words in each doc and find its children. Then create a tuple of index of the word and the indec of the child
        for w in doc_good:
            edges.extend([(w.i, child.i) for child in w.children])

        target_index = -1
        cue_index = -1
        sentence_good = []
        sentence_bad = []
        for i in range(len(doc_good)):
            sentence_good.append(doc_good[i].text)
            sentence_bad.append(doc_bad[i].text)
            if doc_good[i].text != doc_bad[i].text: # doc_good[i].dep_ == "det" and
                target_index = i
                good_word = doc_good[i].text
                bad_word = doc_bad[i].text
            
            # Search over all edges in the sentence to find the edge which points at the target word (the determiner),
            # so the parent of this edge is our cue word. (In dependency tree the NOUN points at DET)
            for s, d in edges:
                if d == target_index:
                    cue_index = s
                    break

        if target_index == -1 or cue_index == -1:
            print(ex)
            continue
        if len(tokenizer_bert.tokenize(good_word)) > 1 or len(tokenizer_bert.tokenize(bad_word)) > 1:
            continue

        if good_word in ['this', 'that']:
            tag = "singular"
        elif good_word in ['these', 'those']:
            tag = "plural"
        else:
            print(good_word)

        data['sentence_good'].append(sentence_good)
        data['sentence_bad'].append(sentence_bad)
        data['target_index'].append(target_index)
        data['cue_indices'].append([cue_index])
        data['good_word'].append(good_word)
        data['bad_word'].append(bad_word)
        data['labels'].append(tag)

    return Dataset.from_dict(data)

The Anaphor Number Agreement processing step:

In [10]:
def number_process(raw_data):
    data = {}
    data['sentence_good'] = []
    data['sentence_bad'] = []
    data['good_word'] = []
    data['bad_word'] = []
    data['target_index'] = []
    data['cue_indices'] = []
    data['labels'] = []
    for ex, example in enumerate(raw_data):
        doc_good = nlp(example['sentence_good'])
        doc_bad = nlp(example['sentence_bad'])
        if not doc_good._.has_coref:
            continue
        # coref_clusters returns all the coref clusters. Here we take the first cluster ([0]) and it contains of the
        # [coref_1 (entity), coref_2]. So, the first word is our cue.
        cue_words, _ = doc_good._.coref_clusters[0]
        # The cue may have more than one word.
        cue_words = cue_words.text.split(" ")
        target_index = -1
        cue_indices = [] # In contrast with previous functions here we have a list of indices for cue
        sentence_good = []
        sentence_bad = []
        for i in range(len(doc_good)):
            sentence_good.append(doc_good[i].text)
            sentence_bad.append(doc_bad[i].text)
            if doc_good[i].text != doc_bad[i].text:
                target_index = i
                good_word = doc_good[i].text
                bad_word = doc_bad[i].text

            if doc_good[i].text in cue_words:
                cue_indices.append(i)

        if target_index == -1 or not cue_indices:
            continue
        if len(tokenizer_bert.tokenize(good_word)) > 1 or len(tokenizer_bert.tokenize(bad_word)) > 1:
            continue

        if good_word in ['itself', 'himself', 'herself']:
            tag = "singular"
        elif good_word in ['themselves']:
            tag = "plural"
        else:
            print(good_word)

        data['sentence_good'].append(sentence_good)
        data['sentence_bad'].append(sentence_bad)
        data['target_index'].append(target_index)
        data['cue_indices'].append(cue_indices)
        data['good_word'].append(good_word)
        data['bad_word'].append(bad_word)
        data['labels'].append(tag)

    return Dataset.from_dict(data)


In [11]:
TASK_UID = {
    'anaphor_number_agreement': 'ana',
    'determiner_noun_agreement_2': 'dna',
    'determiner_noun_agreement_with_adj_2': 'dnaa',
    'distractor_agreement_relational_noun': 'darn',
    'regular_plural_subject_verb_agreement_1': 'rpsv',
}

UID_PROCESSOR = {
    'ana': number_process,
    'dna': det_process,
    'dnaa': det_process,
    'rpsv': sva_process,
    'darn': sva_process,
}

In [None]:
SEED = 12

# Load Tokenizer
# MODEL_NAME = "bert" # "bert", "roberta", "electra"
MODEL_NAME = "roberta" # "bert", "roberta", "electra"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH[MODEL_NAME])
# tokenizer_bert = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base', use_fast=False)

# Load spacy
# nlp = spacy.load('en') # Loading the English Language Model
# neuralcoref.add_to_pipe(nlp)

In [27]:
# for each task we load the train split of the data from hugging face
for task, uid in TASK_UID.items():
    raw_data = load_dataset("blimp", task)['train']
    raw_data = raw_data.map(expand_contraction_process) # expanding the contractions
    
    # performing processing by type of task
    data = UID_PROCESSOR[uid](raw_data)
    data = data.shuffle(seed=SEED)

    if uid == "rpsv": # balancing class labels
        plur_indices = np.where(np.array(data['labels']) == 'plural')[0]
        sing_indices = np.where(np.array(data['labels']) == 'singular')[0]
        sing_indices = np.random.choice(sing_indices, len(plur_indices))
        plur_data = data.select(plur_indices)
        sing_data = data.select(sing_indices)
        data = concatenate_datasets([plur_data, sing_data])
        data = data.shuffle(seed=SEED)

    # aggregate datasets
    if uid == "ana": # "ana" is the first uid, after that always the eslse statement will be run
        number_dataset = data
    else:
        number_dataset = concatenate_datasets([number_dataset, data])
        number_dataset = number_dataset.shuffle(seed=SEED)

number_dataset = number_dataset.shuffle(seed=SEED)
number_dataset = number_dataset.train_test_split(test_size=0.5)

Reusing dataset blimp (/home/s_abbasi/.cache/huggingface/datasets/blimp/anaphor_number_agreement/0.1.0/0c65b833b8653dc81bbd517025f8248bcc8a94407cfc06a390abfd213d7cc13c)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

Reusing dataset blimp (/home/s_abbasi/.cache/huggingface/datasets/blimp/determiner_noun_agreement_2/0.1.0/0c65b833b8653dc81bbd517025f8248bcc8a94407cfc06a390abfd213d7cc13c)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

Reusing dataset blimp (/home/s_abbasi/.cache/huggingface/datasets/blimp/determiner_noun_agreement_with_adj_2/0.1.0/0c65b833b8653dc81bbd517025f8248bcc8a94407cfc06a390abfd213d7cc13c)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

Reusing dataset blimp (/home/s_abbasi/.cache/huggingface/datasets/blimp/distractor_agreement_relational_noun/0.1.0/0c65b833b8653dc81bbd517025f8248bcc8a94407cfc06a390abfd213d7cc13c)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

long longs
depart departs


Reusing dataset blimp (/home/s_abbasi/.cache/huggingface/datasets/blimp/regular_plural_subject_verb_agreement_1/0.1.0/0c65b833b8653dc81bbd517025f8248bcc8a94407cfc06a390abfd213d7cc13c)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

heal heals


In [28]:
number_dataset.save_to_disk(f"./BLIMP Dataset/{MODEL_NAME}/")

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
from datasets import load_from_disk
ds = load_from_disk(f"./BLIMP Dataset/{MODEL_NAME}/")

In [None]:
1610 ana ['James', 'can', 'kiss', 'himself', '.'],
1614 ana ['James', 'impressed', 'himself', '.']

In [65]:
for i in range(len(ds['train'])):
    if 'sister' in ds['train'][i]['sentence_good']:
        print(i, ds['train'][i]['sentence_good'])

57 ['A', 'sister', 'of', 'these', 'cashiers', 'is', 'slumping', 'over', '.']
507 ['The', 'sister', 'of', 'the', 'Impressionists', 'appears', 'to', 'exit', 'this', 'lake', '.']
1291 ['The', 'sister', 'of', 'doctors', 'writes', '.']
1322 ['The', 'sister', 'of', 'these', 'dancers', 'has', 'swallowed', '.']
1633 ['The', 'sister', 'of', 'a', 'lot', 'of', 'drivers', 'responds', '.']
1712 ['The', 'sister', 'of', 'all', 'cashiers', 'forces', 'Andrea', "'s", 'best', 'friend', 'to', 'cry', '.']
1905 ['The', 'sister', 'of', 'patients', 'sits', 'down', '.']
2006 ['A', 'sister', 'of', 'all', 'girls', 'does', 'not', 'implore', 'Diana', 'to', 'fall', 'asleep', '.']
2100 ['The', 'sister', 'of', 'those', 'men', 'does', 'not', 'scare', 'Grace', '.']


In [14]:
number_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence_good', 'sentence_bad', 'good_word', 'bad_word', 'target_index', 'cue_indices', 'labels'],
        num_rows: 2137
    })
    test: Dataset({
        features: ['sentence_good', 'sentence_bad', 'good_word', 'bad_word', 'target_index', 'cue_indices', 'labels'],
        num_rows: 2137
    })
})

In [18]:
type(number_dataset)

datasets.dataset_dict.DatasetDict

In [31]:
number_dataset['train'][0]

{'sentence_good': ['The',
  'granddaughters',
  'of',
  'Thomas',
  'have',
  'messed',
  'up',
  'this',
  'high',
  'school',
  '.'],
 'sentence_bad': ['The',
  'granddaughters',
  'of',
  'Thomas',
  'has',
  'messed',
  'up',
  'this',
  'high',
  'school',
  '.'],
 'good_word': 'have',
 'bad_word': 'has',
 'target_index': 4,
 'cue_indices': [1],
 'labels': 'plural'}

In [16]:
number_dataset['train'][0]

{'sentence_good': ['Every',
  'pedestrian',
  'can',
  'not',
  'hurt',
  'that',
  'unconvinced',
  'girl',
  '.'],
 'sentence_bad': ['Every',
  'pedestrian',
  'can',
  'not',
  'hurt',
  'those',
  'unconvinced',
  'girl',
  '.'],
 'good_word': 'that',
 'bad_word': 'those',
 'target_index': 5,
 'cue_indices': [7],
 'labels': 'singular'}