In [1]:
from pathlib import Path

from spacy.vocab import Vocab
import spacy
from spacy.kb import KnowledgeBase


# The two candidate entities sharing the name "Russ Cochran".
# Maps KB identifier -> (description, frequency).
ENTITIES = {
    "Q2146908": ("American golfer", 342),  # Russ Cochran, the golfer
    "Q7381115": ("publisher", 17),  # Russ Cochran, the publisher
}


In [2]:
def create_kb(model=None, output_dir=None):
    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir.

    model: name or path of the spaCy model to load; it is passed straight to
        `spacy.load` and must provide pretrained word vectors.
    output_dir: optional directory that receives the serialized KB ('kb') and
        the vocab ('vocab'). It is created, including missing parent
        directories, when it does not exist yet.

    Raises ValueError if the loaded model has no pretrained word vectors.
    """

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
    # For simplicity, we'll just use the original vector dimension here instead.
    vectors_dim = nlp.vocab.vectors.shape[1]
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)

    # set up the data: three parallel lists, one slot per entity in ENTITIES
    entity_ids = []
    descr_embeddings = []
    freqs = []
    for entity_id, (desc, freq) in ENTITIES.items():
        entity_ids.append(entity_id)
        # the entity vector is the document vector of its textual description
        descr_embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # show what ended up in the freshly built KB
    print()
    _print_kb(kb)

    # save the KB and its vocab to the output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # `parents=True` lets nested output paths work; `exist_ok=True` replaces
        # the separate exists() check and tolerates a directory created meanwhile.
        output_dir.mkdir(parents=True, exist_ok=True)
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved KB by reading it back from disk
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        print()
        _print_kb(kb2)

def _print_kb(kb):
    print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings())
    print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings())

# Build the example KB from the 'en_core_web_md' model and store it under 'kb.out'.
create_kb(model="en_core_web_md", output_dir="kb.out")

Loaded model 'en_core_web_md'

2 kb entities: ['Q2146908', 'Q7381115']
1 kb aliases: ['Russ Cochran']

Saved KB to kb.out/kb
Saved vocab to kb.out/vocab

Loading vocab from kb.out/vocab
Loading KB from kb.out/kb

2 kb entities: ['Q2146908', 'Q7381115']
1 kb aliases: ['Russ Cochran']


In [3]:
import random
from pathlib import Path

from spacy.vocab import Vocab

import spacy
from spacy.kb import KnowledgeBase
from spacy.pipeline import EntityRuler
from spacy.util import minibatch, compounding


def sample_train_data():
    """Build the toy training set for disambiguating the mention "Russ Cochran".

    Returns a list of `(text, {"links": {(start, end): {kb_id: probability}}})`
    tuples. Every text starts with the mention, so the character span is always
    (0, 12); the correct entity gets probability 1.0 and the other one 0.0.
    """
    # Q2146908 (Russ Cochran): American golfer
    # Q7381115 (Russ Cochran): publisher
    publisher, golfer = "Q7381115", "Q2146908"
    mention_span = (0, 12)

    # (sentence, KB id of the entity the mention refers to)
    labelled_sentences = [
        ("Russ Cochran his reprints include EC Comics.", publisher),
        ("Russ Cochran has been publishing comic art.", publisher),
        ("Russ Cochran captured his first major title with his son as caddie.", golfer),
        ("Russ Cochran was a member of University of Kentucky's golf team.", golfer),
    ]

    # A new set of dicts is built for every example so none of them share state.
    return [
        (
            sentence,
            {
                "links": {
                    mention_span: {
                        publisher: float(gold_id == publisher),
                        golfer: float(gold_id == golfer),
                    }
                }
            },
        )
        for sentence, gold_id in labelled_sentences
    ]


# Training data: the (text, annotation) examples built by `sample_train_data`,
# read by both `train_nel` (for training) and `_apply_model` (for the demo run).
TRAIN_DATA = sample_train_data()

In [7]:
def train_nel(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and train the entity linker.
    The `vocab` should be the one used during creation of the KB.

    kb_path: location of the serialized knowledge base (as written by `kb.dump`).
    vocab_path: directory holding the vocab that was saved together with the KB.
        Required despite the `None` default.
    output_dir: optional directory the trained pipeline is saved to; it is
        created, including missing parent directories, when needed.
    n_iter: number of passes over the training examples.

    Raises ValueError if no `vocab_path` is given.
    """
    if vocab_path is None:
        raise ValueError(
            "A `vocab_path` is required: the KB has to be loaded with the vocab it was created with."
        )

    # Create the blank English model first and load the saved vocab *into* it,
    # rather than building the model on top of a bare `Vocab()`.
    # NOTE(review): a bare `Vocab()` carries no English lexical attribute getters,
    # so tokens missing from the saved vocab presumably get no LOWER value, the
    # EntityRuler patterns below then do not match and training fails with E188.
    # Confirm against the installed spaCy version.
    nlp = spacy.blank("en")
    nlp.vocab.from_disk(vocab_path)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
    # Note that in a realistic application, an actual NER algorithm should be used instead.
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"incl_prior": False}
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        # the KB must share the pipeline's vocab instance
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)

    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
    kb_ids = set(nlp.get_pipe("entity_linker").kb.get_entity_strings())
    TRAIN_DOCS = []
    for text, annotation in TRAIN_DATA:
        with nlp.disable_pipes("entity_linker"):
            doc = nlp(text)
        # Build fresh dicts instead of editing `annotation` in place: the
        # annotation objects belong to the module-level TRAIN_DATA, which must
        # stay unchanged so that re-running this function starts from the same data.
        links_clean = {}
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print(
                        "Removed", kb_id, "from training because it is not in the KB."
                    )
            links_clean[offset] = new_dict
        annotation_clean = dict(annotation)
        annotation_clean["links"] = links_clean
        TRAIN_DOCS.append((doc, annotation_clean))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DOCS)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                docs, annotations = zip(*batch)
                nlp.update(
                    docs,  # batch of pre-processed docs (doc.ents already set)
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # `parents=True` lets nested output paths work; `exist_ok=True` replaces
        # the separate exists() check and tolerates a directory created meanwhile.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)


def _apply_model(nlp):
    """Run `nlp` on every TRAIN_DATA text and print its entities and tokens with their KB ids."""
    for text, _ in TRAIN_DATA:
        # the entity linker in the pipeline now predicts a KB id for each 'Russ Cochran' mention
        doc = nlp(text)
        print()

        entity_rows = [(span.text, span.label_, span.kb_id_) for span in doc.ents]
        print("Entities", entity_rows)

        token_rows = [(token.text, token.ent_type_, token.ent_kb_id_) for token in doc]
        print("Tokens", token_rows)

# Train the entity linker on the KB and vocab saved under './kb.out' and store the pipeline in './nel.out'.
train_nel(kb_path="./kb.out/kb", vocab_path="./kb.out/vocab", output_dir="./nel.out")

Created blank 'en' model with vocab from './kb.out/vocab'
Loaded Knowledge Base from './kb.out/kb'


RuntimeError: [E188] Could not match the gold entity links to entities in the doc - make sure the gold EL data refers to valid results of the named entity recognizer in the `nlp` pipeline.