In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
# Sample text: three plain sentences; the last one has no closing period.
doc = nlp(u"This is the first sentence. This is the another sentence. This is the last sentence")

In [5]:
# Show every sentence span found in `doc`, one per line.
for sentence in doc.sents:
    print(sentence)

This is the first sentence.
This is the another sentence.
This is the last sentence


In [121]:
# Sample text: a quotation with a semicolon in the middle and an attribution
# after the closing quote.
doc = nlp(u'"Management is doinh right thing; leadership is doing right thing." -Peter Drucker')

In [128]:
# Drop the custom boundary component if it is registered, wherever it sits in
# the pipeline. Testing membership (rather than looking only at position 1)
# avoids an IndexError on a pipeline with fewer than two components and still
# finds the component if it was added at a different position.
print(nlp.pipe_names)
if 'set_custom_boundries' in nlp.pipe_names:
    nlp.remove_pipe('set_custom_boundries')
print(nlp.pipe_names)

# `doc` was created in an earlier cell; this only prints the sentence spans
# it already carries.
for sent in doc.sents:
    print(sent)

['tagger', 'set_custom_boundries', 'parser', 'ner']
['tagger', 'parser', 'ner']
"Management is doinh right thing;
leadership is doing right thing."
-Peter Drucker


In [127]:
# One line per token: its index in the doc, its text, and its is_sent_start flag.
for token in doc:
    print(f"{token.i}, {token}, {token.is_sent_start}")

0, ", None
1, Management, None
2, is, None
3, doinh, None
4, right, None
5, thing, None
6, ;, None
7, leadership, True
8, is, None
9, doing, None
10, right, None
11, thing, None
12, ., None
13, ", None
14, -Peter, True
15, Drucker, None


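## Add a segmentation rule

A custom pipeline component, inserted before the parser, marks the token that follows every semicolon as the start of a new sentence.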

In [92]:
def set_custom_boundries(doc):
    """Pipeline component: start a new sentence after every semicolon.

    For each token whose text is exactly ';', the token that follows it gets
    ``is_sent_start = True``. The final token is left out of the scan so the
    look-ahead index ``i + 1`` always exists.

    Args:
        doc: the document being processed; modified in place.

    Returns:
        The same ``doc`` object.
    """
    semicolons = (tok for tok in doc[:-1] if tok.text == ';')
    for tok in semicolons:
        following = doc[tok.i + 1]
        following.is_sent_start = True
    return doc

In [124]:
# Register the component ahead of the parser. Any earlier registration is
# removed first: add_pipe rejects a name that is already in the pipeline, so
# without this the cell fails when it is run a second time, and re-adding
# ensures the pipeline uses the current definition of the function.
if 'set_custom_boundries' in nlp.pipe_names:
    nlp.remove_pipe('set_custom_boundries')
nlp.add_pipe(set_custom_boundries, before='parser')
nlp.pipe_names


['tagger', 'set_custom_boundries', 'parser', 'ner']

In [125]:
# Run the pipeline on the quotation containing a semicolon.
doc = nlp(u'"Management is doinh right thing; leadership is doing right thing." -Peter Drucker')

In [126]:
# Print the resulting sentence spans, one per line.
for sentence in doc.sents:
    print(sentence)

"Management is doinh right thing;
leadership is doing right thing."
-Peter Drucker


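## Change segmentation rules

Instead of adding extra boundaries before the parser runs, this section replaces the sentence-splitting strategy itself: sentences are split on newlines only.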

In [146]:
from spacy.pipeline import SentenceSegmenter
# Load the model again so this section works on a newly created `nlp` object.
nlp = spacy.load('en_core_web_sm')


In [147]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, cutting after newline tokens.

    A token whose text starts with a newline character closes the current
    slice: the slice is emitted when the *next* token is reached, so the
    newline token stays at the end of the slice it terminates. The token
    that opens a new slice is not itself tested for a newline. Whatever
    remains after the loop is always yielded as the final slice.

    Args:
        doc: a sequence of tokens exposing ``.text`` and ``.i`` and
            supporting slicing.

    Yields:
        ``doc[start:end]`` slices covering the whole document in order.
    """
    span_start = 0
    pending_break = False

    for token in doc:
        if not pending_break:
            # Remember the newline; the cut happens on the following token.
            pending_break = token.text.startswith('\n')
            continue
        yield doc[span_start:token.i]
        span_start = token.i
        pending_break = False

    yield doc[span_start:]

In [148]:
# Sample text: a blank line ("\n\n") follows the second sentence and a single
# "\n" sits in the middle of the third.
mystring = "this is a sentence. This is another sentence.\n\nThis is a \nthird sentence"

In [149]:
# Run the pipeline on the sample text.
doc = nlp(mystring)

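Default behaviour: with no custom segmenter in the pipeline, the sentence boundaries come from the dependency parser.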

In [150]:
# Print each sentence span of `doc` on its own line.
for sent in doc.sents:
    print(sent)

this is a sentence.
This is another sentence.


This is a 
third sentence


In [151]:
# Build a SentenceSegmenter that uses split_on_newlines as its strategy.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [162]:
# Add the newline-based segmenter at the end of the pipeline. A previously
# added 'sbd' component is removed first so the cell can be re-run without
# manually toggling a remove_pipe call.
if 'sbd' in nlp.pipe_names:
    nlp.remove_pipe('sbd')
nlp.add_pipe(sbd)
nlp.pipe_names

['tagger', 'parser', 'ner', 'sbd']

In [163]:
# Run the pipeline again on the sample text and print the sentence spans it
# now produces.
doc = nlp(mystring)
for sent in doc.sents:
    print(sent)

this is a sentence. This is another sentence.


This is a 

third sentence
