In [1]:
# default_exp core

# core

> Core Functionality for spaCy Data Debug that helps clean annotated NER data. All functions assume you have data in the Prodigy annotation format. For example:
```json
{
    "text":"The 300-year-old restored building attracts thousands of visitors every year.",
    "spans":[
        {
            "text":"300-year-old",
            "start":4,
            "end":16,
            "label":"AGE"
        },
        {
            "text":"building",
            "start":26,
            "end":34,
            "label":"LOCATION"
        },
        {
            "text":"visitor",
            "start":57,
            "end":64,
            "label":"PERSONTYPE"
        },
        {
            "text":"every year",
            "start":66,
            "end":76,
            "label":"SET"
        }
    ],
    "meta":{
        "source":"Cognitive Services Training Set"
    },
    "answer":"accept"
}
```

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
#export
from collections import defaultdict
import copy
import json
from pathlib import Path
from typing import Dict, List

import srsly
import spacy
from spacy.pipeline import EntityRuler
from spacy.language import Language

In [4]:
#export
def fix_annotations_format(data): 
    for e in data:
        if 'meta' not in e:
            e['meta'] = {}
        if isinstance(e['meta'], list):
            e['meta'] = {
                'source': e['meta']
            }

        for s in e['spans']:
            if 'text' not in s:
                s['text'] = e['text'][s['start']:s['end']]
            s['label'] = s['label'].upper()
    return data

In [5]:
train = list(srsly.read_jsonl("../CognitiveServices/API-TextAnalytics-NER.CloudServices/data/2020-01-23/cs_train.jsonl"))
dev = list(srsly.read_jsonl("../CognitiveServices/API-TextAnalytics-NER.CloudServices/data/2020-01-23/cs_dev.jsonl"))
test = list(srsly.read_jsonl("../CognitiveServices/API-TextAnalytics-NER.CloudServices/data/2020-01-23/cs_test.jsonl"))

In [6]:
train = fix_annotations_format(train)
dev = fix_annotations_format(dev)
test = fix_annotations_format(test)

In [7]:
#export 
def dataset_stats(data: List[Dict[str, object]], serialize=False):
    labels = defaultdict(int)
    examples = defaultdict(list)
    n_examples_no_entities = 0
    for e in data:
        if not e['spans']:
            n_examples_no_entities += 1
            examples['NONE'].append(e)
        else:
            for s in e['spans']:
                label = s['label']
                labels[label] += 1
                examples[label].append(e)
                 
    res = {
        'n_examples': len(data),
        'n_examples_no_entities': n_examples_no_entities,
        'ents_per_type': labels
    }
    if serialize:
        return srsly.json_dumps(res, indent=4)
    else:
        res['examples_with_type'] = examples
        return res

In [8]:
print(dataset_stats(test, serialize=True))

{
    "n_examples":10000,
    "n_examples_no_entities":2083,
    "ents_per_type":{
        "PERSON":2203,
        "AGE":83,
        "GPE":2543,
        "NUMBER":1497,
        "PERSONTYPE":4511,
        "ORGANIZATION":3327,
        "LOCATION":1908,
        "DATERANGE":1187,
        "PERCENTAGE":385,
        "NUM_RANGE":41,
        "DURATION":584,
        "EVENT":1493,
        "CURRENCY":305,
        "DATE":287,
        "PRODUCT":3446,
        "ADDRESS":27,
        "TIMERANGE":54,
        "SET":145,
        "TIME":14,
        "DATETIMERANGE":11
    }
}


In [9]:
stats = dataset_stats(dev)
print(len(stats['examples_with_type']['PERSON']), "Examples with a PERSON label.")
print("Examples with Label")
for k, v in stats['examples_with_type'].items():
    print(f"{k}: {len(v)}")

7074 Examples with a PERSON label.
Examples with Label
EVENT: 1565
PERSON: 7074
GPE: 6959
LOCATION: 4062
PERSONTYPE: 5246
ORGANIZATION: 4345
NUMBER: 1421
PRODUCT: 1337
DATERANGE: 3229
TIME: 59
DATE: 1062
DURATION: 620
AGE: 232
TIMERANGE: 65
CURRENCY: 379
ADDRESS: 205
SET: 126
PERCENTAGE: 264
DATETIMERANGE: 45
NUM_RANGE: 24


In [30]:
#export
def ents_by_label(data: List[Dict[str, object]]):
    """Get a dictionary of unique text spans by label for your data"""
    annotations = defaultdict(set)

    for e in data:
        for s in e['spans']:
            lower_text = s.get('text', e['text'][s['start']:s['end']])
            annotations[s['label']].add(lower_text)
            
    for label in annotations.keys():
        annotations[label] = sorted(annotations[label])
        
    return annotations

This function creates a dictionary of the unique text spans by label for your dataset.

In [36]:
ents_by_label(test)['AGE']

['1,500-year-old',
 '11',
 '11-year-old',
 '12-year-old',
 '13 years old',
 '15',
 '15-36 months old',
 '15-year old',
 '150-year-old',
 '16',
 '16 years',
 '17',
 '18',
 '18 years',
 '18 years of age',
 '2 years',
 '2 years of age',
 '2-year-old',
 '21-year-old',
 '23',
 '23-year-old',
 '24',
 '26',
 '27 years old',
 '29',
 '30-year-old',
 '33',
 '35',
 '37',
 '38',
 '40',
 '40-year-old',
 '40s',
 '42',
 '45',
 '47',
 '48',
 '50s',
 '52',
 '53',
 '54-year-old',
 '60',
 '600 to 1,000 years old',
 '61-year old',
 '65',
 '65 years',
 '66',
 '70',
 '70 years',
 '71-year-old',
 '72-year-old',
 '73-year-old',
 '75',
 '8-year-old',
 '82-year-old',
 '84-year-old',
 '88-year-old',
 '90',
 'AGE 26',
 'age of 7',
 'age three',
 'ages 18-49',
 'computer',
 'early 20s',
 'early 50s',
 'five-year-old',
 'over 70',
 'six-year-old',
 'teenage',
 'twenty',
 'under 35',
 'year-old']

In [32]:
#export
def get_label_disparities(data: List[Dict[str, object]], label1: str, label2: str):
    """Identify annotated spans that have different labels in different examples"""
    annotations = ents_by_label(data)
    if label1 and label2:
        return set(annotations[label1]).intersection(set(annotations[label2]))

In [34]:
get_label_disparities(test, "NUMBER", "AGE")

{'11',
 '15',
 '16',
 '17',
 '18',
 '23',
 '24',
 '29',
 '33',
 '35',
 '37',
 '38',
 '40',
 '42',
 '47',
 '48',
 '52',
 '60',
 '66',
 '70',
 '75',
 '90'}

In [38]:
#hide
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
