In [2]:
# Doc object: load the `en_core_web_sm` pipeline and run it on one sentence.
import spacy
nlp = spacy.load("en_core_web_sm")
# Calling the pipeline on a string produces the processed document.
doc = nlp("This is a sentence")
# Printing the document shows its text (see the recorded output).
print(doc)

This is a sentence


#### Pipeline components

#### Tagger

In [3]:
# Tagger attributes. `.tag` and `.pos` are integer IDs, both describing the
# token's part of speech (fine-grained vs. coarse-grained); neither is a
# position. The readable strings live in the underscore variants `.tag_` and
# `.pos_`.
for token in doc:
    print(f'{token.text}: ')
    print(f'[.tag] {token.tag}, fine-grained part-of-speech tag id')
    print(f'[.pos] {token.pos}, coarse-grained part-of-speech tag id')
    print("")


This: 
[.tag] 15267657372422890137, part-of-speech tag Id
[.pos] 95, position in the sentence

is: 
[.tag] 13927759927860985106, part-of-speech tag Id
[.pos] 87, position in the sentence

a: 
[.tag] 15267657372422890137, part-of-speech tag Id
[.pos] 90, position in the sentence

sentence: 
[.tag] 15308085513773655218, part-of-speech tag Id
[.pos] 92, position in the sentence



#### Parser

In [4]:
# Parser attributes. `.dep` is an integer ID (`.dep_` holds the label string).
for token in doc:
    print(f'{token.text}: ')
    # Show None when no head annotation is set. Note that a token can be its
    # own head, as the recorded output for "is" shows.
    print(f'[.head] {token.head if token.has_head() else None}, syntactic parent of the token, if it has one')
    print(f'[.dep] {token.dep}, syntactic dependency id')
    print(f'[.sent] {token.sent}, sentence span the token belongs to')
    print("")

This: 
[.head] is, syntactic parent of the token, if hase one
[.dep] 429, syntactic dependency id
[.sent] This is a sentence, relative sentence span

is: 
[.head] is, syntactic parent of the token, if hase one
[.dep] 8206900633647566924, syntactic dependency id
[.sent] This is a sentence, relative sentence span

a: 
[.head] sentence, syntactic parent of the token, if hase one
[.dep] 415, syntactic dependency id
[.sent] This is a sentence, relative sentence span

sentence: 
[.head] is, syntactic parent of the token, if hase one
[.dep] 404, syntactic dependency id
[.sent] This is a sentence, relative sentence span



#### Named Entity recognizer

In [5]:
# Walk the entity spans attached to the doc and show each one on two lines:
# its `.text` followed by a colon, then the span's own string form.
for ent in doc.ents:
    print(f'{ent.text}: ', f'{ent}', sep="\n")

#### Text classifier

In [6]:
# Iterate over doc.cats and print each item. Nothing was printed for this doc
# in the recorded run.
for c in doc.cats:
    print(f'{c}')

Because text categories are always very specific, the text classifier is not included in any of the trained pipelines by default. But you can use it to train your own system.

All pipeline packages you can load into spaCy include several files and a config.cfg.

The config defines things like the language and pipeline. This tells spaCy which components to instantiate and how they should be configured.

The built-in components that make predictions also need binary data. The data is included in the pipeline package and loaded into the component when you load the pipeline.

In [7]:
# Show the names of the components in the loaded pipeline.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [8]:
# Show the same pipeline as (name, component object) pairs.
print(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x14fe95c70>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x14fe95670>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x14f8373e0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x14ff1f810>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x14ff29810>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x14f8374c0>)]


#### Custom pipeline components

In [9]:
from spacy.language import Language

After the text is tokenized and a Doc object has been created, pipeline components are applied in order. spaCy supports a range of built-in components, but also lets you define your own.

```python
from spacy.language import Language

@Language.component("custom_component")
def custom_component_function(d):
    # Do something to the doc here
    return d

nlp.add_pipe("custom_component", last=True) # Default
```

In [10]:
# Register a pass-through component under the name "custom_component"
@Language.component("custom_component")
def custom_component_function(d):
    """Report the length of the incoming doc and hand the doc back unchanged."""
    doc_length = len(d)
    print("Doc length:", doc_length)
    return d

# Load a pipeline instance to add the custom component to.
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline
nlp.add_pipe("custom_component", first=True)

# Print the pipeline component names
print("Pipeline:", nlp.pipe_names)

# Process a text. The result is discarded: only the component's printed
# "Doc length: ..." line is of interest here.
_ = nlp("Hello world!")

Pipeline: ['custom_component', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Doc length: 3


In [11]:
import spacy
from spacy.language import Language
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
# PhraseMatcher matches on the token text by default, so the patterns only
# need to be tokenized. Streaming them through the tokenizer alone avoids
# running the tagger, parser, NER, ... on every pattern string.
animal_patterns = list(nlp.tokenizer.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", animal_patterns)

# Custom component: turn every phrase-matcher hit into an ANIMAL entity
@Language.component("animal_component")
def animal_component_function(d):
    """Replace the doc's entities with the spans found by `matcher`.

    Each match becomes a Span labelled "ANIMAL"; the resulting list is
    assigned to d.ents (overwriting what was there) and the doc is returned.
    """
    animal_spans = []
    for _match_id, begin, stop in matcher(d):
        animal_spans.append(Span(d, begin, stop, label='ANIMAL'))
    d.ents = animal_spans
    return d


# Add the component to the pipeline after the "ner" component
nlp.add_pipe("animal_component", after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents.
# animal_component assigns d.ents itself, so the entities printed here are
# the matcher's ANIMAL spans.
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'animal_component']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


In [12]:
from spacy.tokens import Token

# Getter backing the `is_color` token extension
def get_is_color(token):
    """Return True when the token's text is exactly one of the known color words."""
    return token.text in ("red", "yellow", "blue")

# Register get_is_color as the getter of a custom Token attribute. force=True
# is passed so an existing registration of the same name is replaced (per
# spaCy's set_extension API), which keeps the cell re-runnable.
Token.set_extension("is_color", getter=get_is_color, force=True)

doc = nlp("The lake is blue")

# Custom attributes are read through the `._` namespace; token 3 is "blue".
print(doc[3]._.is_color)

True


In [13]:
from spacy.tokens import Doc

# Method-style extension: receives the doc plus one extra argument
def has_token(doc, token_text):
    """Tell whether any token in `doc` has exactly the text `token_text`."""
    for token in doc:
        if token.text == token_text:
            return True
    return False

# Set extension on the Doc with method.
# force=True keeps the cell re-runnable: without it, registering "has_token"
# a second time in the same kernel raises a ValueError. The other
# set_extension calls in this notebook already pass it.
Doc.set_extension("has_token", method=has_token, force=True)

doc = nlp("The sky is blue.")
print(doc._.has_token("blue"), "- blue")
print(doc._.has_token("cloud"), "- cloud")

True - blue
False - cloud


In [14]:
from spacy.tokens import Span

# Reload the pipeline. NOTE(review): presumably done to get an instance
# without the "animal_component" added earlier, which overwrites doc.ents;
# confirm.
nlp = spacy.load("en_core_web_sm")


def get_wikipedia_url(span):
    """Build a Wikipedia search URL for an entity span.

    Args:
        span: object exposing `label_` (the entity label) and `text`.

    Returns:
        The search URL as a string when the label is one of the supported
        entity types, otherwise None.
    """
    # Local import keeps the function self-contained within its cell.
    from urllib.parse import quote

    # "LOC" is the location label used by spaCy's trained English pipelines;
    # "LOCATION" is kept so existing callers relying on it still match.
    if span.label_ in ("PERSON", "ORG", "GPE", "LOC", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        # Percent-encode so characters such as "&" or "#" cannot break the
        # query string; plain names like "David_Bowie" are left untouched.
        return f"https://en.wikipedia.org/w/index.php?search={quote(entity_text, safe='')}"
    return None

# Set the Span extension wikipedia_url. It is registered with method=, so it
# is invoked as a call (ent._.wikipedia_url()) rather than read as an attribute.
Span.set_extension("wikipedia_url", method=get_wikipedia_url, force=True)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity; the URL is None for
    # entity labels get_wikipedia_url does not handle (see recorded output).
    print(ent.text, ent._.wikipedia_url())

fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


#### Performance

If you need to process a lot of texts and create a lot of Doc objects in a row, the nlp.pipe method can speed this up significantly. It processes the texts as a stream and yields Doc objects. It is much faster than just calling nlp on each text, because it batches up the texts.

In [15]:
# (text, context) pairs: each context dict carries metadata for its text.
data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

# With as_tuples=True the loop receives (doc, context) pairs, so the metadata
# stays next to the doc created from its text.
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context["page_number"])

This is a text 15
And another text 16
