In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# May 2024
# License: MIT

In [2]:
import spacy

spaCy Core Components

In [3]:
# Load spaCy's small English model; `nlp` is the pipeline object used in the cells below
MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(MODEL_NAME)

In [4]:
# Run the loaded pipeline on a sample sentence; the result is inspected in the next cells
sample_text = "Apple is looking at buying a U.K. startup for $1 billion"
doc = nlp(sample_text)

In [5]:
# Tokenization: collect the text of every token in the processed document
token_texts = [tok.text for tok in doc]
print(token_texts)

['Apple', 'is', 'looking', 'at', 'buying', 'a', 'U.K.', 'startup', 'for', '$', '1', 'billion']


In [6]:
# For each token, show its text, its lemma, and its part-of-speech tag
for tok in doc:
    print(f"{tok.text} {tok.lemma_} {tok.pos_}")

Apple Apple PROPN
is be AUX
looking look VERB
at at ADP
buying buy VERB
a a DET
U.K. U.K. PROPN
startup startup NOUN
for for ADP
$ $ SYM
1 1 NUM
billion billion NUM


In [7]:
# Named entities: show each entity span's text together with its label
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Apple ORG
U.K. GPE
$1 billion MONEY


spaCy Pipeline

In [8]:
# List the names of the components in the loaded pipeline, in order
component_names = nlp.pipe_names
print(component_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [9]:
from spacy.language import Language

# Register a plain function as a pipeline component named 'custom_component'
@Language.component('custom_component')
def custom_component(doc):
    """Print the length of the incoming doc and hand it back.

    Args:
        doc: The document object passed in by the pipeline.

    Returns:
        The same `doc` object that was received.
    """
    n_tokens = len(doc)
    print("Processing document length:", n_tokens)
    return doc

# Add the custom component to the end of the pipeline using the registered name.
# The membership check keeps this cell safe to re-run on a live kernel:
# add_pipe raises a ValueError when a component with the same name is
# already present in the pipeline.
if 'custom_component' not in nlp.pipe_names:
    nlp.add_pipe('custom_component', last=True)

# Process text to see the effect of the custom component
doc = nlp("Apple is looking at buying a U.K. startup for $1 billion")

Processing document length: 12
