# 1. What happens when you call nlp?

- Tokenize the text and apply each pipeline component in order. The tokenizer turns a string of text into a `Doc` object. spaCy then applies every component in the pipeline to the document, in order.

# 2. Inspecting the Pipeline

In [None]:
import spacy

# Load the 'en_core_web_sm' package; the returned nlp object holds the pipeline.
nlp = spacy.load('en_core_web_sm')

# Names of the pipeline components.
print(nlp.pipe_names)

# The pipeline attribute itself, for comparison with the names printed above.
print(nlp.pipeline)

# 3. Simple Components


In [None]:
import spacy

def length_component(doc):
    """Print how many tokens ``doc`` contains, then hand it back untouched.

    Args:
        doc: Any sized object; ``len(doc)`` is reported as the token count.

    Returns:
        The same ``doc`` object that was passed in.
    """
    print(f"This document is {len(doc)} tokens long.")
    return doc

nlp = spacy.load("en_core_web_sm")

# first=True requests that length_component be placed at the front of the
# pipeline; printing pipe_names afterwards shows where it ended up.
# NOTE(review): passing the function object directly looks like the spaCy v2
# calling convention -- confirm against the installed spaCy version.
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

# Calling nlp on a text runs the pipeline, so length_component prints here.
doc = nlp("This is a sentence.")

# 4. Complex Components

In [None]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
# Phrases that should be recognised as animals ("turtle" was misspelled as
# "trutule", so it could never match real text).
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
# Run each phrase through nlp.pipe so the matcher receives processed objects
# rather than raw strings.
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns: ", animal_patterns)
# The matcher shares the pipeline's vocab.
matcher = PhraseMatcher(nlp.vocab)
# Register every pattern under the single key "ANIMAL"; the second argument
# (None) is left unset.
# NOTE(review): this positional form looks like the spaCy v2 signature --
# confirm against the installed spaCy version.
matcher.add("ANIMAL", None, *animal_patterns)

def animal_component(doc):
    """Pipeline component that labels matched animal phrases as entities.

    Runs the module-level ``matcher`` over ``doc``, wraps every match in a
    ``Span`` labelled "ANIMAL", and assigns the resulting list to
    ``doc.ents`` (replacing whatever entities were set before).

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` overwritten.
    """
    doc.ents = [
        Span(doc, begin, stop, label="ANIMAL")
        for _, begin, stop in matcher(doc)
    ]
    return doc

# Insert the component after the "ner" component; because animal_component
# assigns doc.ents, the entities printed below are the ones it set.
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)

doc = nlp("I have a cat and a Golden Retriever")
# Show each entity as a (text, label) pair.
print([(ent.text, ent.label_) for ent in doc.ents])

# 5. Setting Extension Attributes

In [None]:
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# Register a custom token attribute, read and written as token._.is_country,
# with False as its default value.
Token.set_extension("is_country", default=False)

doc = nlp("I live in Spain.")
# Override the default on the token at index 3 (presumably "Spain" -- depends
# on how the tokenizer splits the sentence).
doc[3]._.is_country = True

# Show every token's text next to its is_country value.
print([(token.text, token._.is_country) for token in doc])

In [None]:
from spacy.lang.en import English
from spacy.tokens import Token

# English nlp object used to process the example sentence in this cell.
nlp = English()

def get_reversed(token):
    """Getter: return ``token.text`` with its characters in reverse order."""
    return "".join(reversed(token.text))

# Register get_reversed as the getter behind token._.reversed.
Token.set_extension("reversed", getter=get_reversed)

doc = nlp("All generalizations are false, including this one.")

# Print the reversed attribute of every token in the document.
for token in doc:
    print("reversed:", token._.reversed)

In [None]:
from spacy.lang.en import English
from spacy.tokens import Doc

# English nlp object used to process the example sentence in this cell.
nlp = English()

def get_has_number(doc):
    """Getter: report whether any token in ``doc`` has a truthy ``like_num``.

    Returns:
        True as soon as one such token is found, False otherwise (including
        for an empty ``doc``).
    """
    for tok in doc:
        if tok.like_num:
            return True
    return False

# Register get_has_number as the getter behind doc._.has_number.
Doc.set_extension("has_number", getter=get_has_number)

doc = nlp("The museum closed for five years in 2012.")
# Read the extension attribute on the processed document.
print("has_number:", doc._.has_number)

In [None]:
from spacy.lang.en import English
from spacy.tokens import Span

# English nlp object used to process the example sentence in this cell.
nlp = English()

def to_html(span, tag):
    """Wrap the span's text in an opening and closing ``tag`` element.

    Args:
        span: Object whose ``text`` attribute supplies the element content.
        tag: Tag name placed in both the opening and the closing tag.

    Returns:
        The string ``<tag>text</tag>``; the text is inserted as-is.
    """
    content = span.text
    return f"<{tag}>{content}</{tag}>"

# Register to_html as a method extension, callable as span._.to_html(...).
Span.set_extension("to_html", method=to_html)

doc = nlp("Hello world, this is a sentence.")
# Slice covering the tokens at positions 0 and 1.
span = doc[0:2]
# Only the tag is passed here; to_html's first parameter is the span itself.
print("to_html", span._.to_html('strong'))

# 6. Entities and Extensions

In [None]:
import spacy
from spacy.tokens import Span

# Load the 'en_core_web_sm' package; doc.ents is read from its output below.
nlp = spacy.load("en_core_web_sm")

def get_wikipedia_url(span):
    """Getter: build a Wikipedia search URL for a person, organisation or place.

    Args:
        span: Entity span; ``span.label_`` selects whether a URL is built and
            ``span.text`` supplies the search term.

    Returns:
        The search URL as a string when the label is one of PERSON, ORG, GPE,
        LOC or LOCATION; otherwise None.
    """
    # Function-scope import keeps this notebook cell self-contained.
    from urllib.parse import quote

    # "LOC" is the label spaCy's English models use for non-GPE locations;
    # "LOCATION" is kept so existing callers see no change.
    if span.label_ not in ("PERSON", "ORG", "GPE", "LOC", "LOCATION"):
        return None

    entity_text = span.text.replace(" ", "_")
    # Percent-encode the term so characters such as "&" or "#" cannot break
    # out of the query parameter.
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text, safe="")
    
    
# Register get_wikipedia_url as the getter behind span._.wikipedia_url.
Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)

# Print each entity's text with its URL; the getter yields None for entities
# whose label is not in its accepted set.
for ent in doc.ents:
    print(ent.text, ent._.wikipedia_url)