In [2]:
import spacy
# Load the en_core_web_md model and bind it to the notebook-global `nlp`
nlp = spacy.load('en_core_web_md')

# Print the names of the pipeline components, in pipeline order
print(nlp.pipe_names)

# Print the full pipeline of (name, component) tuples
print(nlp.pipeline)

['tagger', 'parser', 'ner']
[('tagger', <spacy.pipeline.Tagger object at 0x11177f710>), ('parser', <spacy.pipeline.DependencyParser object at 0x111b69a98>), ('ner', <spacy.pipeline.EntityRecognizer object at 0x111b69af0>)]


In [3]:
# Define the custom component
def length_component(doc):
    """Report how many tokens ``doc`` holds, then hand it back unchanged.

    Shaped as a pipeline component: it receives the doc and returns the very
    same object. The only side effect is one line printed to stdout.
    """
    template = "This document is {} tokens long."
    print(template.format(len(doc)))
    return doc

In [4]:
# Load the small English model; this rebinds `nlp`, replacing the
# en_core_web_md pipeline loaded in the first cell
nlp = spacy.load('en_core_web_sm')
  
# Add the component first in the pipeline and print the pipe names.
# The printed list should show 'length_component' ahead of the model's own components.
nlp.add_pipe(length_component,first=True)
print(nlp.pipe_names)

['length_component', 'tagger', 'parser', 'ner']


In [5]:
# Process a text. length_component was added to the pipeline above, so
# calling nlp(...) also prints the token count as a side effect.
doc = nlp("This is a sentence.")

This document is 5 tokens long.


# Complex components

In [2]:
import spacy
# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span
# Load the small English model (fresh `nlp` for this section)
nlp = spacy.load('en_core_web_sm')

# Build a PhraseMatcher on the model's vocabulary
matcher = PhraseMatcher(nlp.vocab)
# Phrases to recognise as animals
ANIMALS=['Golden Retriever', 'cat', 'turtle', 'Rattus norvegicus']
# Run every phrase through the pipeline; the results are the match patterns
patterns = list(nlp.pipe(ANIMALS))
# Register all patterns under the key 'ANIMAL'.
# NOTE(review): the second argument (None) is presumably the on-match
# callback slot of the spaCy v2 signature -- confirm against the installed version.
matcher.add('ANIMAL', None, *patterns)

# Define the custom component
def animal_component(doc):
    """Replace ``doc.ents`` with one 'ANIMAL'-labelled span per phrase match.

    Runs the notebook-level ``matcher`` over ``doc`` and turns every
    (match_id, start, end) triple into a ``Span``. Whatever was stored in
    ``doc.ents`` beforehand is overwritten. Returns the same ``doc``.
    """
    animal_spans = []
    for _match_id, start, end in matcher(doc):
        animal_label = nlp.vocab.strings['ANIMAL']
        animal_spans.append(Span(doc, start, end, label=animal_label))
    doc.ents = animal_spans
    return doc

  
# Add the component to the pipeline after the 'ner' component.
# animal_component assigns doc.ents outright, so placed here it replaces
# whatever entities were set earlier in the pipeline.
nlp.add_pipe(animal_component, after='ner')  

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


#  Setting extension attributes 

In [6]:
from spacy.tokens import Doc, Span,Token
# Register the Token extension attribute 'is_country' with the default value False
Token.set_extension('is_country', default=False)

# Process the text and set the is_country attribute to True for the token "Spain".
# NOTE(review): `nlp` is not loaded in this cell; it comes from whichever cell ran before.
doc = nlp("I live in Spain.")
# Index 3 is "Spain" for this sentence (tokens: I, live, in, Spain, .)
doc[3]._.is_country = True

# Print the token text and the is_country attribute for all tokens
print([(token.text, token._.is_country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [7]:
# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    """Getter for a Token extension: the token's text read back to front."""
    text = token.text
    backwards = slice(None, None, -1)
    return text[backwards]
  
# Register the Token property extension 'reversed' with the getter get_reversed.
# The value is not stored: reading token._.reversed goes through get_reversed.
Token.set_extension('reversed', getter=get_reversed)

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print('reversed:', token._.reversed)

reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


In [8]:
# Define the getter function
def get_has_number(doc):
    """Getter for a Doc extension: does at least one token look like a number?

    Walks the tokens in order and answers True at the first one whose
    ``like_num`` is truthy; answers False when none is (including an empty doc).
    """
    for token in doc:
        if token.like_num:
            return True
    return False

# Register the Doc property extension 'has_number' with the getter get_has_number
Doc.set_extension('has_number' , getter =get_has_number)

# Process the text and check the custom has_number attribute.
# Reading doc._.has_number goes through get_has_number for this doc.
doc = nlp("The museum closed for five years in 2012.")
print('has_number:', doc._.has_number)

has_number: True


In [9]:
# Define the method
def to_html(span, tag):
    """Wrap the span's text in an opening and closing ``tag`` element.

    ``tag`` is the bare element name (for example 'strong'); the result is
    the string ``<tag>text</tag>``. The text is inserted as-is.
    """
    template = '<{tag}>{text}</{tag}>'
    fields = {'tag': tag, 'text': span.text}
    return template.format(**fields)

# Register the Span method extension 'to_html' with the method to_html
# (registered via `method=`, so it is called, not read like a property)
Span.set_extension('to_html', method =to_html)

# Process the text and call the to_html method on the span with the tag name 'strong'
doc = nlp("Hello world, this is a sentence.")
# First two tokens of the doc
span = doc[0:2]
# The span itself is supplied as to_html's first argument; only the tag is passed here
print(span._.to_html('strong'))

<strong>Hello world</strong>


In [3]:
def get_wikipedia_url(span):
    """Getter for a Span extension: a Wikipedia search URL for an entity span.

    Args:
        span: Object exposing ``label_`` (the entity label as a string) and
            ``text`` (the entity text).

    Returns:
        The search URL as a string when the label is PERSON, ORG, GPE, LOC or
        LOCATION; None for every other label.
    """
    # Local import so this definition works regardless of which cells ran before it.
    from urllib.parse import quote

    # 'LOC' is the location label used by spaCy's English models; the original
    # 'LOCATION' spelling is kept so spans labelled that way still get a URL.
    if span.label_ not in ('PERSON', 'ORG', 'GPE', 'LOC', 'LOCATION'):
        return None

    entity_text = span.text.replace(' ', '_')
    # Percent-encode the term so characters such as '&', '#' or '+' stay part
    # of the search value instead of being parsed as URL syntax.
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text, safe='')
# Load the small English model
nlp = spacy.load('en_core_web_sm')
# Set the Span extension 'wikipedia_url' using the getter get_wikipedia_url
Span.set_extension('wikipedia_url', getter =get_wikipedia_url)

doc = nlp("In over fifty years from his very first recordings right through to his last album, David Bowie was at the vanguard of contemporary culture.")
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    # (the URL is None when get_wikipedia_url does not handle the entity's label)
    print(ent.text, ent._.wikipedia_url)

over fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


In [1]:
import spacy
from spacy.tokens import Doc, Span,Token
from spacy.matcher import PhraseMatcher
# Load the small English model and build a PhraseMatcher on its vocabulary
nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)

# Country names to recognise
COUNTRIES=['Czech Republic','Slovakia','USA']
# Run every name through the pipeline; the results are the match patterns
patterns = list(nlp.pipe(COUNTRIES))
# Register all patterns under the key 'COUNTRY'.
# NOTE(review): the second argument (None) is presumably the on-match
# callback slot of the spaCy v2 signature -- confirm against the installed version.
matcher.add('COUNTRY', None, *patterns)

def countries_component(doc):
    """Replace ``doc.ents`` with one 'GPE'-labelled span per country match.

    Runs the notebook-level ``matcher`` over ``doc`` and builds a ``Span``
    for every (match_id, start, end) triple. Whatever was stored in
    ``doc.ents`` beforehand is overwritten. Returns the same ``doc``.
    """
    country_spans = []
    for _match_id, start, end in matcher(doc):
        gpe_label = nlp.vocab.strings['GPE']
        country_spans.append(Span(doc, start, end, label=gpe_label))
    doc.ents = country_spans
    return doc

# Add the component to the pipeline. No position is given; the printed
# pipe names show where it ended up.
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

['tagger', 'parser', 'ner', 'countries_component']


In [2]:
# Process the text and print the entity text and label.
# No model is loaded here: this reuses the `nlp` pipeline (with
# countries_component) built in the previous cell.

# countries_component assigns doc.ents outright, so only matched countries are listed
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Czech Republic', 'GPE'), ('Slovakia', 'GPE')]
