In [1]:
# default_exp core

# pipeline

> A `Pipeline` keeps track of your train/dev/test splits and provides the same debugging functionality for each individual dataset and for all of the data combined.

In [15]:
#export

from typing import Any, Callable, Dict, List, Optional
from spacy_data_debug.core import *


class Pipeline:
    """Keep the train/dev/test splits together and run one function over each.

    `self.datasets` maps a split name to its list of examples:
    'train', 'dev', 'all' (every provided split concatenated, in the order
    train, dev, test) and, when a test split is provided, 'test'.
    """

    def __init__(
        self,
        train: List[Dict[str, object]],
        dev: List[Dict[str, object]],
        test: Optional[List[Dict[str, object]]] = None,
    ) -> None:
        """Register the splits.

        Args:
            train: Training examples.
            dev: Development examples.
            test: Optional test examples. Omit it (or pass `None`) when there
                is no test split; an empty list is still registered as 'test'.
        """
        combined = train + dev
        self.datasets = {
            'train': train,
            'dev': dev,
            'all': combined
        }
        # Compare against None instead of relying on truthiness so that an
        # explicitly supplied (even empty) test split is not silently dropped.
        if test is not None:
            self.datasets['test'] = test
            # Reuse the train + dev concatenation rather than rebuilding it.
            self.datasets['all'] = combined + test

    def apply(self, func: Callable[..., Any], *args, **kwargs) -> Dict[str, Any]:
        """Apply an existing function to all datasets.

        Args:
            func: Callable invoked as `func(dataset, *args, **kwargs)` once
                per entry of `self.datasets`.
            *args: Extra positional arguments forwarded to `func`.
            **kwargs: Extra keyword arguments forwarded to `func`.

        Returns:
            A dict with the same keys as `self.datasets`, each mapped to the
            value `func` returned for that dataset.
        """
        return {
            name: func(dataset, *args, **kwargs)
            for name, dataset in self.datasets.items()
        }

In [13]:
import srsly
# Read each JSONL split fully into memory as a list, in train/dev/test order.
train, dev, test = (
    list(srsly.read_jsonl(split_path))
    for split_path in (
        "../CognitiveServices/API-TextAnalytics-NER.CloudServices/data/2020-01-23/cs_train.jsonl",
        "../CognitiveServices/API-TextAnalytics-NER.CloudServices/data/2020-01-23/cs_dev.jsonl",
        "../CognitiveServices/API-TextAnalytics-NER.CloudServices/data/2020-01-23/cs_test.jsonl",
    )
)

In [17]:
# Compute stats for every split in one call, then show the training split
# next to the combined data.
pipeline = Pipeline(train, dev, test)
res = pipeline.apply(dataset_stats, serialize=True)
for split_name in ('train', 'all'):
    print(res[split_name])

{
    "n_examples":130000,
    "n_examples_no_entities":7,
    "ents_per_type":{
        "AGE":2999,
        "LOCATION":52346,
        "PERSONTYPE":69290,
        "SET":1505,
        "GPE":89942,
        "DATERANGE":42247,
        "EVENT":20088,
        "NUMBER":18604,
        "ORGANIZATION":56557,
        "PERSON":92811,
        "PRODUCT":18102,
        "DATE":13304,
        "PERCENTAGE":3698,
        "DURATION":8584,
        "ADDRESS":2938,
        "CURRENCY":4355,
        "TIME":746,
        "TIMERANGE":654,
        "DATETIMERANGE":647,
        "NUM_RANGE":340
    }
}
{
    "n_examples":150000,
    "n_examples_no_entities":2090,
    "ents_per_type":{
        "AGE":3314,
        "LOCATION":58316,
        "PERSONTYPE":79047,
        "SET":1776,
        "GPE":99444,
        "DATERANGE":46663,
        "EVENT":23146,
        "NUMBER":21522,
        "ORGANIZATION":64229,
        "PERSON":102088,
        "PRODUCT":22885,
        "DATE":14653,
        "PERCENTAGE":4347,
        "DURATION":9

In [18]:
#hide
# Run nbdev's notebook-to-script conversion; the cell output lists each
# notebook that was converted.
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted 01_pipeline.ipynb.
Converted index.ipynb.
