## Sentence Segmentation

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


## Can't grab sentences separately

In [4]:
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [5]:
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [6]:
list(doc.sents)[0]

This is the first sentence.

In [7]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

## Adding our own Segmentation Rules

In [9]:
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

In [10]:
doc3.text

'"Management is doing things right; leadership is doing the right things." -Peter Drucker'

In [16]:
for sent in doc3.sents:
    print(sent)
    print('\n')

"


Management is doing things right; leadership is doing the right things.


" -Peter Drucker




In [22]:
doc3[:-1] # Shows everything except the last token

"Management is doing things right; leadership is doing the right things." -Peter

In [25]:
# Add a Segmentation Rule
def set_custom_boundaries(doc):
    """Flag the token that follows every semicolon as a sentence start.

    The doc is modified in place and the same object is returned, so the
    function can be used as a pipeline component.
    """
    # The final token is excluded from the scan: nothing follows it,
    # so looking one position ahead would run past the end.
    for tok in doc[:-1]:
        if tok.text != ';':
            continue
        following = doc[tok.i + 1]
        following.is_sent_start = True
    return doc

In [27]:
# Register the segmentation rule in the pipeline, positioned ahead of the parser.
# NOTE(review): re-running this cell on the same `nlp` object presumably fails
# because the component is already registered — confirm, or reload `nlp` first.
nlp.add_pipe(set_custom_boundaries,before='parser')

# Display the component names to check where the rule landed
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [28]:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

In [32]:
for sent in doc4.sents:
    print(sent) 

"
Management is doing things right;
leadership is doing the right things.
" -Peter Drucker


## Changing Segmentation Rules

In [31]:
nlp = spacy.load('en_core_web_sm')

In [33]:
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [34]:
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [35]:
doc = nlp(mystring)

In [38]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


This is a 
third sentence.


In [40]:
from spacy.pipeline import SentenceSegmenter

In [41]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, split after each newline token.

    A token whose text starts with a newline character closes the current
    slice; that newline token stays at the tail of the slice it closes, and
    the token after it opens the next slice. The remaining tokens are
    yielded as a final slice. An empty ``doc`` yields nothing.
    """
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            # The previous token was a newline: close the slice just before
            # this token and start a new one here.
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): # startswith also matches runs such as '\n\n'
            seen_newline = True
    # Emit the last group of tokens, but never an empty slice (empty doc).
    if start < len(doc):
        yield doc[start:]

# Build a SentenceSegmenter that uses split_on_newlines as its strategy
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
# Attach the segmenter to the pipeline
nlp.add_pipe(sbd)

In [42]:
doc = nlp(mystring)

In [44]:
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another.


This is a 

third sentence.
