In [1]:
import spacy

In [2]:
# Load the small English pipeline that every later cell reuses.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Parse a three-sentence text so the default sentence segmentation can be inspected.
# (No u'' prefix: Python 3 string literals are already unicode.)
doc = nlp('This is the first sentence. This is another sentence. This is the last sentence.')

In [4]:
# Print each sentence found in the parsed text.
# A separate loop variable is used on purpose: looping with `doc` itself would
# rebind `doc` to the last sentence, so the parsed text would be lost afterwards
# and re-running this cell would no longer iterate over the original text.
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [5]:
# Parse a quotation containing a semicolon; it is used below to show how the
# pipeline splits the text into sentences.
# (No u'' prefix: Python 3 string literals are already unicode.)
doc = nlp('"Management is doing things right; leadership is doing the right things." -Peter Drucker')

In [6]:
doc.text

# What if we want to change this rule so that a new sentence starts after every ';'?

'"Management is doing things right; leadership is doing the right things." -Peter Drucker'

In [7]:
# Show the current segmentation; the extra print adds blank lines between sentences.
for sentence in doc.sents:
    print(sentence)
    print('\n')

"Management is doing things right; leadership is doing the right things."


-Peter Drucker




In [8]:
# List every token next to its index so the position of ';' can be read off.
for tok in doc:
    print(tok.i, tok.text)

0 "
1 Management
2 is
3 doing
4 things
5 right
6 ;
7 leadership
8 is
9 doing
10 the
11 right
12 things
13 .
14 "
15 -Peter
16 Drucker


In [9]:
# Add a custom sentence-boundary rule to the pipeline.

from spacy.language import Language

@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token following each ';' as the start of a new sentence.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same document, with ``is_sent_start`` set to True on every token
        that directly follows a ';' token.
    """
    # Skip the final token: it has no following token to mark.
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i + 1].is_sent_start = True
    return doc

# Only insert the component if it is not in the pipeline yet, so this cell can
# be re-run safely: adding a component under a name that is already present
# raises an error.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("set_custom_boundaries", before="parser")

<function __main__.set_custom_boundaries(doc)>

In [10]:
# List the pipeline component names to check where the custom rule was inserted.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'set_custom_boundaries', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [11]:
# Preview the slice the custom component loops over: every token except the last.
doc[:-1]

"Management is doing things right; leadership is doing the right things." -Peter

In [12]:
# Parse the quotation again, now that the custom boundary component is in the pipeline.
# (No u'' prefix: Python 3 string literals are already unicode.)
doc4 = nlp('"Management is doing things right; leadership is doing the right things." -Peter Drucker')

In [13]:
# Print the sentences of the re-parsed quotation, one per line.
for sentence in doc4.sents:
    print(sentence)

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker
