In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
text = "This is the first sentence. This is the second sentence. This is the third sentence."

In [4]:
doc = nlp(text)

In [6]:
for sentence in doc.sents:
    print(sentence)

this be the first sentence.
this be the second sentence.
this be the third sentence.


In [7]:
list(doc.sents)[0]

This is the first sentence.

In [8]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [9]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [13]:
# Two difference sentences are stored. We may want to change it based on our requirement.
for sentence in doc.sents:
    print(sentence)
    print()

"Management is doing the right things; leadership is doing the right things."

-Peter Drucker



In [36]:
"""
Add a SEGMENTATION rule.
"""

'\nAdd a SEGMENTATION rule.\n'

In [20]:
from spacy import Language

In [23]:
@Language.component("boundaries")
def set_custom_boundaries(doc):
    for token in doc[: -1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

In [26]:
# Current pipeline
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [27]:
nlp.add_pipe('boundaries', before='parser')

<function __main__.set_custom_boundaries(doc)>

In [30]:
# updated pipeline
nlp.pipe_names

['tok2vec',
 'tagger',
 'boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [33]:
doc2 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [34]:
for sent in doc2.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
-Peter Drucker


In [35]:
"""
Change the SEGMENTATION rule. Changing the default rules
"""

'\nChange the SEGMENTATION rule. Changing the default rules\n'

In [75]:
nlp = spacy.load('en_core_web_sm')

In [76]:
text = u"This is a sentence. This is another. \n\nThis is a \nthird sentence."

In [77]:
print(text)

This is a sentence. This is another. 

This is a 
third sentence.


In [78]:
doc = nlp(text)

In [79]:
for sentence in doc.sents:
    print("-->", doc)

--> This is a sentence. This is another. 

This is a 
third sentence.
--> This is a sentence. This is another. 

This is a 
third sentence.
--> This is a sentence. This is another. 

This is a 
third sentence.
--> This is a sentence. This is another. 

This is a 
third sentence.


In [80]:
# CHANGING THE RULES
from spacy.pipeline import Sentencizer

punct_marks = ["\n", "\n\n"]
config = {"punct_chars": punct_marks}
nlp.add_pipe("sentencizer", config=config, before='parser')

<spacy.pipeline.sentencizer.Sentencizer at 0x7f06411e0040>

In [81]:
doc = nlp(text)

In [82]:
for sentence in doc.sents:
    print("--->", sentence)

---> This is a sentence. This is another. 


---> This is a 

---> third sentence.
