In [1]:
from datasets import load_dataset

# Training splits only: the "en_ewt" configuration of universal_dependencies
# (read later via its "upos" column) and conll2003 (read via "ner_tags").
pos_dataset = load_dataset(path="universal_dependencies", name="en_ewt", split="train")
ner_dataset = load_dataset(path="conll2003", split="train")

In [2]:
# Label vocabularies taken from the dataset schemas; the integer ids stored in
# each row are used as indices into these name lists further down.
pos_labels, ner_labels = (
    pos_dataset.features["upos"].feature.names,
    ner_dataset.features["ner_tags"].feature.names,
)

In [3]:
import spacy
# Ask spaCy to run on the GPU before any pipeline is built; the recorded cell
# output shows the call returned True.
spacy.require_gpu()

True

In [4]:
# Tokenizer options forwarded to the transformer component.
# NOTE(review): truncation / max_length / padding are supplied through
# tokenizer_config here -- confirm they take effect at this level rather than
# needing to be passed when the tokenizer is called.
tokenizer_settings = {
    "use_fast": True,
    "do_lower_case": True,
    "truncation": True,
    "max_length": 512,
    "padding": "max_length",
}

# Model section of the transformer component's config.
transformer_model = {
    "@architectures": "spacy-transformers.TransformerModel.v3",
    "name": "tvocoder/bert_fake_news_ft",
    "tokenizer_config": tokenizer_settings,
}

# Blank English pipeline: transformer first, then the two trainable heads.
nlp = spacy.blank("en")
nlp.add_pipe("transformer", config={"model": transformer_model})
# NOTE(review): tagger and ner are added with their default configs. Verify
# that they actually consume the transformer output -- presumably that needs
# an explicit listener layer in each component's model config. TODO confirm.
nlp.add_pipe("tagger")
nlp.add_pipe("ner")

<spacy.pipeline.ner.EntityRecognizer at 0x1bb06699690>

In [5]:
from spacy.tokens import Doc
from spacy.training import Example

from spacy.training.iob_utils import iob_to_biluo, biluo_tags_to_offsets

In [6]:
def pos_training_examples(pos_dataset):
    """Turn every row of a UPOS-annotated dataset into a spaCy ``Example``.

    Args:
        pos_dataset: Iterable of rows with a ``"tokens"`` list and a parallel
            ``"upos"`` list of integer class ids. It must also expose
            ``features["upos"].feature.names`` for mapping ids to tag names.

    Returns:
        A list with one ``Example`` per row, carrying the tag names under the
        ``"tags"`` annotation key. Docs are built on the module-level
        ``nlp.vocab``.
    """
    upos_names = pos_dataset.features["upos"].feature.names

    def to_example(row):
        # Pre-tokenized input: the Doc is created directly from the word list.
        sentence = Doc(nlp.vocab, words=row["tokens"])
        tag_names = [upos_names[tag_id] for tag_id in row["upos"]]
        return Example.from_dict(sentence, {"tags": tag_names})

    return [to_example(row) for row in pos_dataset]

In [7]:
def ner_training_examples(ner_dataset):
    """Turn every row of an IOB-annotated NER dataset into a spaCy ``Example``.

    Args:
        ner_dataset: Iterable of rows with a ``"tokens"`` list and a parallel
            ``"ner_tags"`` list of integer class ids. It must also expose
            ``features["ner_tags"].feature.names`` for mapping ids to tag names.

    Returns:
        A list with one ``Example`` per row, carrying the entity spans under
        the ``"entities"`` annotation key. Docs are built on the module-level
        ``nlp.vocab``.
    """
    tag_names = ner_dataset.features["ner_tags"].feature.names
    converted = []
    for row in ner_dataset:
        sentence = Doc(nlp.vocab, words=row["tokens"])
        # ids -> IOB strings -> BILUO strings -> entity offsets on the Doc
        biluo_tags = iob_to_biluo([tag_names[tag_id] for tag_id in row["ner_tags"]])
        annotations = {"entities": biluo_tags_to_offsets(sentence, biluo_tags)}
        converted.append(Example.from_dict(sentence, annotations))
    return converted

In [8]:
pos_labels = pos_dataset.features["upos"].feature.names
print(list(pos_labels))

# Fetch the tagger once, then register every UPOS name as one of its labels.
tagger = nlp.get_pipe("tagger")
for upos_name in pos_labels:
    tagger.add_label(upos_name)

['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX']


In [9]:
# Tag names exactly as stored in the dataset (IOB scheme: "O", "B-PER", ...).
ner_labels = ner_dataset.features["ner_tags"].feature.names
print(list(ner_labels))

# The entity recognizer is given bare entity types ("PER", "ORG", ...), which
# is also what the training examples carry after the IOB -> offsets conversion.
# Registering the raw IOB tags would add spurious labels such as "B-PER" and
# "O". Strip the "B-"/"I-" prefix, drop "O", and de-duplicate while keeping
# first-seen order.
entity_types = list(dict.fromkeys(
    tag.split("-", 1)[1] for tag in ner_labels if "-" in tag
))

ner = nlp.get_pipe("ner")
for entity_type in entity_types:
    ner.add_label(entity_type)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [10]:
# Materialise both training sets as lists of Example objects (POS first, then
# NER, matching the order in which the helpers were defined).
pos_train, ner_train = (
    pos_training_examples(pos_dataset),
    ner_training_examples(ner_dataset),
)

In [11]:
import random
from spacy.util import minibatch

In [None]:
batch_size = 32

# One combined list serves two purposes:
#  * initialize() is called once and sees both annotation types (a second
#    initialize() call would re-initialize the pipeline and discard the first);
#  * the per-epoch shuffle reorders the very list that is batched. Shuffling
#    the expression `pos_train + ner_train` only shuffles a throwaway copy.
train_examples = pos_train + ner_train

# initialize() returns the optimizer it creates for the pipeline.
optimizer = nlp.initialize(lambda: train_examples)

for epoch in range(10):
    losses = {}
    # In-place shuffle so POS and NER examples are interleaved differently
    # each epoch instead of always running all POS batches before NER.
    random.shuffle(train_examples)

    for batch in minibatch(train_examples, size=batch_size):
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print(f"[Epoch {epoch+1}] Losses: {losses}")

In [13]:
# Persist the pipeline to a directory relative to the notebook's working dir.
nlp.to_disk("./spacy_pipeline")