In [10]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [11]:
introduction_doc = nlp(
    "This tutorial is about Natural Language Processing in spaCy."
)
type(introduction_doc)

spacy.tokens.doc.Doc

In [12]:
about_text = (
    "Sentence detection is the process of locating where sentences start and end in a given text." 
    "This allows you to you divide a text into linguistically meaningful units."
    "spaCy is correctly able to identify the input’s sentences."
)
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
len(sentences)

for sentence in sentences:
    print(f"{sentence[:5]}...")


Sentence detection is the process...
This allows you to you...


In [13]:
ellipsis_text = (
    "Sentence detection is the process of locating where sentences start and end in a given text." 
    "This allows you to you divide a text into linguistically meaningful units."
    "spaCy is correctly able to identify the input’s sentences."
)

from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Add support to use `...` as a delimiter for sentence detection"""
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i + 1].is_sent_start = True
    return doc


custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before="parser")
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
    print(sentence)

Sentence detection is the process of locating where sentences start and end in a given text.
This allows you to you divide a text into linguistically meaningful units.spaCy is correctly able to identify the input’s sentences.


In [14]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
    "Sentence detection is the process of locating where sentences start and end in a given text." 
    "This allows you to you divide a text into linguistically meaningful units."
    "spaCy is correctly able to identify the input’s sentences."
)
about_doc = nlp(about_text)

for token in about_doc:
    print (token, token.idx)

Sentence 0
detection 9
is 19
the 22
process 26
of 34
locating 37
where 46
sentences 52
start 62
and 68
end 72
in 76
a 79
given 81
text 87
. 91
This 92
allows 97
you 104
to 108
you 111
divide 115
a 122
text 124
into 129
linguistically 134
meaningful 149
units.spaCy 160
is 172
correctly 175
able 185
to 190
identify 193
the 202
input 206
’s 211
sentences 214
. 223


In [15]:
print(
    f"{"Text with Whitespace":22}"
    f"{"Is Alphanumeric?":15}"
    f"{"Is Punctuation?":18}"
    f"{"Is Stop Word?"}"
)
for token in about_doc:
    print(
        f"{str(token.text_with_ws):22}"
        f"{str(token.is_alpha):15}"
        f"{str(token.is_punct):18}"
        f"{str(token.is_stop)}"
    )

Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?
Sentence              True           False             False
detection             True           False             False
is                    True           False             True
the                   True           False             True
process               True           False             False
of                    True           False             True
locating              True           False             False
where                 True           False             True
sentences             True           False             False
start                 True           False             False
and                   True           False             True
end                   True           False             False
in                    True           False             True
a                     True           False             True
given                 True           False             False
text                  

In [16]:
custom_about_text = (
    "Sentence detection is the process of locating where sentences start and end in a given text." 
    "This allows you to you divide a text into linguistically meaningful units."
    "spaCy is correctly able to identify the input’s sentences."
)

print([token.text for token in nlp(custom_about_text)[8:15]])

['sentences', 'start', 'and', 'end', 'in', 'a', 'given']


In [17]:
import re
from spacy.tokenizer import Tokenizer

custom_nlp = spacy.load("en_core_web_sm")
prefix_re = spacy.util.compile_prefix_regex(
    custom_nlp.Defaults.prefixes
)
suffix_re = spacy.util.compile_suffix_regex(
    custom_nlp.Defaults.suffixes
)

custom_infixes = [r"@"]

infix_re = spacy.util.compile_infix_regex(
    list(custom_nlp.Defaults.infixes) + custom_infixes
)

custom_nlp.tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None,
)

custom_tokenizer_about_doc = custom_nlp(custom_about_text)

print([token.text for token in custom_tokenizer_about_doc[8:15]])

['sentences', 'start', 'and', 'end', 'in', 'a', 'given']


In [18]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

for stop_word in list(spacy_stopwords)[:10]:
    print(stop_word)

yourself
hereupon
very
perhaps
're
its
thereafter
give
mostly
never


In [19]:
custom_about_text = (
    "Sentence detection is the process of locating where sentences start and end in a given text." 
    "This allows you to you divide a text into linguistically meaningful units."
    "spaCy is correctly able to identify the input’s sentences."
)
nlp = spacy.load("en_core_web_sm")
about_doc = nlp(custom_about_text)
print([token for token in about_doc if not token.is_stop])

[Sentence, detection, process, locating, sentences, start, end, given, text, ., allows, divide, text, linguistically, meaningful, units.spaCy, correctly, able, identify, input, sentences, .]


In [20]:
import spacy
nlp = spacy.load("en_core_web_sm")
conference_help_text = (
"Sentence detection is the process of locating where sentences start and end in a given text." 
    "This allows you to you divide a text into linguistically meaningful units."
    "spaCy is correctly able to identify the input’s sentences.")
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
    if str(token) != str(token.lemma_):
        print(f"{str(token):>20} : {str(token.lemma_)}")

            Sentence : sentence
                  is : be
            locating : locate
           sentences : sentence
               given : give
                This : this
              allows : allow
         units.spaCy : units.spacy
                  is : be
           sentences : sentence


In [21]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
"Sentence detection is the process of locating where sentences start and end in a given text." 
    "This allows you to you divide a text into linguistically meaningful units."
    "spaCy is correctly able to identify the input’s sentences."
)
about_doc = nlp(about_text)
for token in about_doc:
    print(
        f"""
TOKEN: {str(token)}
=====
TAG: {str(token.tag_):10} POS: {token.pos_}
EXPLANATION: {spacy.explain(token.tag_)}"""
    )


TOKEN: Sentence
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: detection
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: is
=====
TAG: VBZ        POS: AUX
EXPLANATION: verb, 3rd person singular present

TOKEN: the
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: process
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: of
=====
TAG: IN         POS: ADP
EXPLANATION: conjunction, subordinating or preposition

TOKEN: locating
=====
TAG: VBG        POS: VERB
EXPLANATION: verb, gerund or present participle

TOKEN: where
=====
TAG: WRB        POS: SCONJ
EXPLANATION: wh-adverb

TOKEN: sentences
=====
TAG: NNS        POS: NOUN
EXPLANATION: noun, plural

TOKEN: start
=====
TAG: VBP        POS: VERB
EXPLANATION: verb, non-3rd person singular present

TOKEN: and
=====
TAG: CC         POS: CCONJ
EXPLANATION: conjunction, coordinating

TOKEN: end
=====
TAG: VB         POS: VERB
EXPLANATION: verb, base 