In [1]:
!pip install spacy



In [2]:
import spacy

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
nlp

<spacy.lang.en.English at 0x7ccca13437f0>

In [5]:
introduction_doc = nlp(
    "This tutorial is about Natural Language Processing in spaCy.")

In [6]:
type(introduction_doc)

spacy.tokens.doc.Doc

In [7]:
[token.text for token in introduction_doc]

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'spaCy',
 '.']

In [None]:
#Reading in text from a file

In [8]:
import pathlib

In [9]:
file_name = "/content/introduction.txt"

In [10]:
introduction_doc = nlp(pathlib.Path(file_name).read_text(encoding="UTF_8"))

In [11]:
print ([token.text for token in introduction_doc])

['This', 'tutorial', 'is', 'about', 'Natural', 'Lanuage', 'Processing', '.']


In [12]:
about_text = about_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London-based Fintech"
...     " company. He is interested in learning"
...     " Natural Language Processing."
... )

In [13]:
about_doc = nlp(about_text)

In [14]:
sentences = list(about_doc.sents)

In [15]:
len(sentences)

2

In [16]:
for sentence in sentences:
  print(f"{sentence[:5]}...")

Gus Proto is a Python...
He is interested in learning...


In [17]:
ellipsis_text = (
...     "Gus, can you, ... never mind, I forgot"
...     " what I was saying. So, do you think"
...     " we should ..."
... )

In [18]:
from spacy.language import Language

A decorator in Python is a design pattern that allows you to modify or enhance the behavior of functions or methods. It does this without permanently modifying the function itself. Decorators are often used for logging, enforcing access control, instrumentation, or modifying the behavior of existing code in a flexible, reusable way.

A decorator is typically defined as a higher-order function, which is a function that takes a function as input and returns a new function. The syntax for using a decorator involves prefixing a function definition with the decorator name, preceded by the @ symbol.

The @Language.component Decorator:
In the context of spaCy, the @Language.component decorator is used to register a custom pipeline component. A pipeline component is a function that processes a Doc object (a container for accessing linguistic annotations in spaCy) and returns it, potentially modifying it along the way.

Component Registration: The decorator @Language.component("set_custom_boundaries") is used to register the function set_custom_boundaries as a pipeline component. The string "set_custom_boundaries" is the name that will be used to refer to this component within the spaCy pipeline.

In [19]:
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
  """Add support to use `...` as a delimiter for sentence detection"""
  for token in doc[:-1]:
    if token.text == "...":
      doc[token.i + 1].is_sent_start = True
  return doc

In [20]:
custom_nlp = spacy.load("en_core_web_sm")

In [21]:
custom_nlp.add_pipe("set_custom_boundaries", before="parser")
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
  print(sentence)

Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...


In [22]:
nlp = spacy.load("en_core_web_sm")

In [23]:
about_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London-based Fintech"
...     " company. He is interested in learning"
...     " Natural Language Processing."
... )

In [24]:
about_doc = nlp(about_text)

In [25]:
for token in about_doc:
  print(token,token.idx)

Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [26]:
print(
    f"{'Text with Whitespace':22}"
    f"{'Is Alphanumeric?':15}"
    f"{'Is Punctuation?':18}"
    f"{'Is Stop Word?'}"
)


Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?


In [27]:
for token in about_doc:
  print(
       f"{str(token.text_with_ws):22}"
       f"{str(token.is_alpha):15}"
       f"{str(token.is_punct):18}"
       f"{str(token.is_stop)}"
    )

Gus                   True           False             False
Proto                 True           False             False
is                    True           False             True
a                     True           False             True
Python                True           False             False
developer             True           False             False
currently             True           False             False
working               True           False             False
for                   True           False             True
a                     True           False             True
London                True           False             False
-                     False          True              False
based                 True           False             False
Fintech               True           False             False
company               True           False             False
.                     False          True              False
He                    True  

In [28]:
>>> custom_about_text = (
  "Gus Proto is a Python developer currently"
  " working for a London@based Fintech"
  " company. He is interested in learning"
  " Natural Language Processing."
 )

print([token.text for token in nlp(custom_about_text)[8:15]])

['for', 'a', 'London@based', 'Fintech', 'company', '.', 'He']


Building a custom tokenizer

In [29]:
import re
from spacy.tokenizer import Tokenizer

custom_nlp = spacy.load("en_core_web_sm")

prefix_re = spacy.util.compile_prefix_regex(
   custom_nlp.Defaults.prefixes
 )

In [30]:
suffix_re = spacy.util.compile_suffix_regex(
...     custom_nlp.Defaults.suffixes
... )

In [31]:
custom_infixes = [r"@"]

In [32]:
infix_re = spacy.util.compile_infix_regex(
...     list(custom_nlp.Defaults.infixes) + custom_infixes
... )

In [33]:
custom_nlp.tokenizer = Tokenizer(
...     nlp.vocab,
...     prefix_search=prefix_re.search,
...     suffix_search=suffix_re.search,
...     infix_finditer=infix_re.finditer,
...     token_match=None,
... )

In [34]:
custom_tokenizer_about_doc = custom_nlp(custom_about_text)

In [35]:
print([token.text for token in custom_tokenizer_about_doc[8:15]])

['for', 'a', 'London', '@', 'based', 'Fintech', 'company']


To build a new Tokenizer, you generally provide it with:

Vocab: A storage container for special cases, which is used to handle cases like contractions and emoticons.
prefix_search: A function that handles preceding punctuation, such as opening parentheses.
suffix_search: A function that handles succeeding punctuation, such as closing parentheses.
infix_finditer: A function that handles non-whitespace separators, such as hyphens.
token_match: An optional Boolean function that matches strings that should never be split. It overrides the previous rules and is useful for entities like URLs or numbers.

Stop Words

In [36]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [37]:
len(spacy_stopwords)

326

In [38]:
for stop_word in list(spacy_stopwords)[:10]:
  print(stop_word)

one
rather
last
are
over
because
alone
which
perhaps
indeed


In [39]:
>>> custom_about_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London-based Fintech"
...     " company. He is interested in learning"
...     " Natural Language Processing."
... )
>>> nlp = spacy.load("en_core_web_sm")
>>> about_doc = nlp(custom_about_text)
>>> print([token for token in about_doc if not token.is_stop])


[Gus, Proto, Python, developer, currently, working, London, -, based, Fintech, company, ., interested, learning, Natural, Language, Processing, .]


In [40]:
nlp = spacy.load("en_core_web_sm")

In [41]:
conference_help_text = (
...     "Gus is helping organize a developer"
...     " conference on Applications of Natural Language"
...     " Processing. He keeps organizing local Python meetups"
...     " and several internal talks at his workplace."
... )

In [42]:
conference_help_doc = nlp(conference_help_text)

Lemmatization

In [43]:
for token in conference_help_doc:
  if str(token) != str(token.lemma_):
    print(f"{str(token):>20} : {str(token.lemma_)}")

                  is : be
                  He : he
               keeps : keep
          organizing : organize
             meetups : meetup
               talks : talk


If you don’t lemmatize the text, then organize and organizing will be counted as different tokens, even though they both refer to the same concept. Lemmatization helps you avoid duplicate words that may overlap conceptually.

In [44]:
from collections import Counter

In [45]:
nlp = spacy.load("en_core_web_sm")

In [46]:
complete_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London-based Fintech company. He is"
...     " interested in learning Natural Language Processing."
...     " There is a developer conference happening on 21 July"
...     ' 2019 in London. It is titled "Applications of Natural'
...     ' Language Processing". There is a helpline number'
...     " available at +44-1234567891. Gus is helping organize it."
...     " He keeps organizing local Python meetups and several"
...     " internal talks at his workplace. Gus is also presenting"
...     ' a talk. The talk will introduce the reader about "Use'
...     ' cases of Natural Language Processing in Fintech".'
...     " Apart from his work, he is very passionate about music."
...     " Gus is learning to play the Piano. He has enrolled"
...     " himself in the weekend batch of Great Piano Academy."
...     " Great Piano Academy is situated in Mayfair or the City"
...     " of London and has world-class piano instructors."
... )

In [47]:
complete_doc = nlp(complete_text)

In [48]:
words = [
...     token.text
...     for token in complete_doc
...     if not token.is_stop and not token.is_punct
... ]

In [49]:
>>> print(Counter(words).most_common(5))

[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


POS tagging

In [50]:
nlp = spacy.load("en_core_web_sm")

In [51]:
about_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London-based Fintech"
...     " company. He is interested in learning"
...     " Natural Language Processing."
... )

In [52]:
about_doc = nlp(about_text)

In [53]:
for token in about_doc:
...     print(
...         f"""
... TOKEN: {str(token)}
... =====
... TAG: {str(token.tag_):10} POS: {token.pos_}
... EXPLANATION: {spacy.explain(token.tag_)}"""
...     )


TOKEN: Gus
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: Proto
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: is
=====
TAG: VBZ        POS: AUX
EXPLANATION: verb, 3rd person singular present

TOKEN: a
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: Python
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: developer
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: currently
=====
TAG: RB         POS: ADV
EXPLANATION: adverb

TOKEN: working
=====
TAG: VBG        POS: VERB
EXPLANATION: verb, gerund or present participle

TOKEN: for
=====
TAG: IN         POS: ADP
EXPLANATION: conjunction, subordinating or preposition

TOKEN: a
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: London
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: -
=====
TAG: HYPH       POS: PUNCT
EXPLANATION: punctuation mark, hyphen

TOKEN: based
=====
TAG

In [55]:
nouns = []
adjectives = []
for token in about_doc:
   if token.pos_ == "NOUN":
    nouns.append(token)
   if token.pos_ == "ADJ":
            adjectives.append(token)

In [56]:
nouns

[developer, company]

In [57]:
adjectives

[interested]

Visualization : using displacy

In [58]:
from spacy import displacy

In [59]:
nlp = spacy.load("en_core_web_sm")

In [60]:
about_interest_text = (
...     "He is interested in learning Natural Language Processing."
... )

In [61]:
about_interest_doc = nlp(about_interest_text)

In [62]:
displacy.serve(about_interest_doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [63]:
displacy.render(about_interest_doc, style="dep", jupyter=True)

In [64]:
complete_text = (
...     "Gus Proto is a Python developer currently"
...     " working for a London-based Fintech company. He is"
...     " interested in learning Natural Language Processing."
...     " There is a developer conference happening on 21 July"
...     ' 2019 in London. It is titled "Applications of Natural'
...     ' Language Processing". There is a helpline number'
...     " available at +44-1234567891. Gus is helping organize it."
...     " He keeps organizing local Python meetups and several"
...     " internal talks at his workplace. Gus is also presenting"
...     ' a talk. The talk will introduce the reader about "Use'
...     ' cases of Natural Language Processing in Fintech".'
...     " Apart from his work, he is very passionate about music."
...     " Gus is learning to play the Piano. He has enrolled"
...     " himself in the weekend batch of Great Piano Academy."
...     " Great Piano Academy is situated in Mayfair or the City"
...     " of London and has world-class piano instructors."
... )

In [65]:
complete_doc = nlp(complete_text)

In [67]:
def is_token_allowed(token):
  return bool(
         token
         and str(token).strip()
         and not token.is_stop
         and not token.is_punct
     )

In [68]:
def preprocess_token(token):
...     return token.lemma_.strip().lower()

In [69]:
complete_filtered_tokens = [
...     preprocess_token(token)
...     for token in complete_doc
...     if is_token_allowed(token)
... ]

In [70]:
complete_filtered_tokens

['gus',
 'proto',
 'python',
 'developer',
 'currently',
 'work',
 'london',
 'base',
 'fintech',
 'company',
 'interested',
 'learn',
 'natural',
 'language',
 'processing',
 'developer',
 'conference',
 'happen',
 '21',
 'july',
 '2019',
 'london',
 'title',
 'application',
 'natural',
 'language',
 'processing',
 'helpline',
 'number',
 'available',
 '+44',
 '1234567891',
 'gus',
 'helping',
 'organize',
 'keep',
 'organize',
 'local',
 'python',
 'meetup',
 'internal',
 'talk',
 'workplace',
 'gus',
 'present',
 'talk',
 'talk',
 'introduce',
 'reader',
 'use',
 'case',
 'natural',
 'language',
 'processing',
 'fintech',
 'apart',
 'work',
 'passionate',
 'music',
 'gus',
 'learn',
 'play',
 'piano',
 'enrol',
 'weekend',
 'batch',
 'great',
 'piano',
 'academy',
 'great',
 'piano',
 'academy',
 'situate',
 'mayfair',
 'city',
 'london',
 'world',
 'class',
 'piano',
 'instructor']

In [71]:
nlp = spacy.load("en_core_web_sm")

In [86]:
import spacy
from spacy.matcher import Matcher

# Load a spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Function to extract phone number
def extract_phone_number(nlp_doc):
    pattern = [
        {"ORTH": "("},
        {"SHAPE": "ddd"},
        {"ORTH": ")"},
        {"SHAPE": "ddd"},
        {"ORTH": "-", "OP": "?"},
        {"SHAPE": "dddd"},
    ]
    matcher.add("PHONE_NUMBER", [pattern])
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        return span.text

# Text to process
conference_org_text = ("There is a developer conference"
    " happening on 21 July 2019 in London. It is titled"
    ' "Applications of Natural Language Processing".'
    " There is a helpline number available"
    " at (123) 456-7891")

# Process the text
conference_org_doc = nlp(conference_org_text)

# Extract phone number
phone_number = extract_phone_number(conference_org_doc)
print(phone_number)


(123) 456-7891


Dependency parsing using SpaCy

In [87]:
piano_text = "Gus is learning piano"

In [88]:
piano_doc = nlp(piano_text)

In [91]:
for token in piano_doc:
  print(
      f"""
TOKEN: {token.text}
  )
=====
{token.tag_ = }
{token.head.text = }
{token.dep_ = }"""
  )


TOKEN: Gus
  )
=====
token.tag_ = 'NNP'
token.head.text = 'learning'
token.dep_ = 'nsubj'

TOKEN: is
  )
=====
token.tag_ = 'VBZ'
token.head.text = 'learning'
token.dep_ = 'aux'

TOKEN: learning
  )
=====
token.tag_ = 'VBG'
token.head.text = 'learning'
token.dep_ = 'ROOT'

TOKEN: piano
  )
=====
token.tag_ = 'NN'
token.head.text = 'learning'
token.dep_ = 'dobj'


In [93]:
one_line_about_text = (
...     "Gus Proto is a Python developer"
...     " currently working for a London-based Fintech company"
... )

In [94]:
one_line_about_doc = nlp(one_line_about_text)

In [95]:
# Extract children of `developer`
>>> print([token.text for token in one_line_about_doc[5].children])

['a', 'Python', 'working']


In [96]:
# Extract previous neighboring node of `developer`
>>> print (one_line_about_doc[5].nbor(-1))

Python


In [97]:
# Extract next neighboring node of `developer`
>>> print (one_line_about_doc[5].nbor())

currently


In [99]:
# Extract all tokens on the left of `developer`
print([token.text for token in one_line_about_doc[5].lefts])

['a', 'Python']


In [100]:
# Extract tokens on the right of `developer`
>>> print([token.text for token in one_line_about_doc[5].rights])

['working']


In [101]:
# Print subtree of `developer`
>>> print (list(one_line_about_doc[5].subtree))

[a, Python, developer, currently, working, for, a, London, -, based, Fintech, company]


In [102]:
nlp = spacy.load("en_core_web_sm")

Noun Phrase detection

In [103]:
conference_text = (
...     "There is a developer conference happening on 21 July 2019 in London."
... )

In [104]:
conference_doc = nlp(conference_text)

In [105]:
for chunk in conference_doc.noun_chunks:
  print(chunk)

a developer conference
21 July
London


Verb Phrase Detection

In [107]:
!pip install textacy

Collecting textacy
  Downloading textacy-0.13.0-py3-none-any.whl (210 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/210.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m204.8/210.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting cytoolz>=0.10.1 (from textacy)
  Downloading cytoolz-0.12.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting floret~=0.10.0 (from textacy)
  Downloading floret-0.10.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.4/320.4 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jellyfis

In [108]:
import textacy

In [109]:
about_talk_text = (
...     "The talk will introduce reader about use"
...     " cases of Natural Language Processing in"
...     " Fintech, making use of"
...     " interesting examples along the way."
... )

In [110]:
patterns = [{"POS": "AUX"}, {"POS": "VERB"}]

In [111]:
about_talk_doc = textacy.make_spacy_doc(
...     about_talk_text, lang="en_core_web_sm"
... )

In [112]:
verb_phrases = textacy.extract.token_matches(
...     about_talk_doc, patterns=patterns
... )

In [113]:
# Print all verb phrases
>>> for chunk in verb_phrases:
...     print(chunk.text)

will introduce


In [114]:
# Extract noun phrase to explain what nouns are involved
>>> for chunk in about_talk_doc.noun_chunks:
...     print (chunk)

The talk
reader
use cases
Natural Language Processing
Fintech
use
interesting examples
the way


Named Entity Recognition

In [115]:
piano_class_text = (
...     "Great Piano Academy is situated"
...     " in Mayfair or the City of London and has"
...     " world-class piano instructors."
... )

In [116]:
piano_class_doc = nlp(piano_class_text)

In [117]:
for ent in piano_class_doc.ents:
...     print(
...         f"""
... {ent.text = }
... {ent.start_char = }
... {ent.end_char = }
... {ent.label_ = }
... spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}"""
... )


ent.text = 'Great Piano Academy'
ent.start_char = 0
ent.end_char = 19
ent.label_ = 'ORG'
spacy.explain('ORG') = Companies, agencies, institutions, etc.

ent.text = 'Mayfair'
ent.start_char = 35
ent.end_char = 42
ent.label_ = 'GPE'
spacy.explain('GPE') = Countries, cities, states

ent.text = 'the City of London'
ent.start_char = 46
ent.end_char = 64
ent.label_ = 'GPE'
spacy.explain('GPE') = Countries, cities, states


In [118]:
survey_text = (
...     "Out of 5 people surveyed, James Robert,"
...     " Julie Fuller and Benjamin Brooks like"
...     " apples. Kelly Cox and Matthew Evans"
...     " like oranges."
... )

In [119]:
def replace_person_names(token):
...     if token.ent_iob != 0 and token.ent_type_ == "PERSON":
           return "[REDACTED] "
...     return token.text_with_ws

In [120]:
def redact_names(nlp_doc):
...     with nlp_doc.retokenize() as retokenizer:
...         for ent in nlp_doc.ents:
...             retokenizer.merge(ent)
...     tokens = map(replace_person_names, nlp_doc)
...     return "".join(tokens)

In [121]:
survey_doc = nlp(survey_text)

In [122]:
print(redact_names(survey_doc))

Out of 5 people surveyed, [REDACTED] , [REDACTED] and [REDACTED] like apples. [REDACTED] and [REDACTED] like oranges.
