## Basics

In [3]:
import spacy
# Load the pipeline once; every Doc in this notebook is produced by calling `nlp`.
nlp = spacy.load('en_core_web_sm')


In [4]:
# Run the pipeline over a sample sentence; doc.text gives back the input string.
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [5]:
# One row per token: text, coarse POS, fine-grained tag, and a description of the tag.
# The nested {10}/{8}/{6} fields are column widths, so the output lines up as a table.
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

The        DET      DT     determiner
quick      ADJ      JJ     adjective
brown      ADJ      JJ     adjective
fox        NOUN     NN     noun, singular or mass
jumped     VERB     VBD    verb, past tense
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
lazy       ADJ      JJ     adjective
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [6]:
# Frequency of each coarse POS in doc. The dict is keyed by integer attribute
# ids rather than label strings, which is why the displayed keys are numbers.
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{96: 1, 83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [8]:
# Walk the POS ids in ascending order and decode each one to its label via the vocab.
for k in sorted(POS_counts):
    v = POS_counts[k]
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')

83. ADJ  : 3
84. ADP  : 1
89. DET  : 2
91. NOUN : 3
93. PART : 1
96. PUNCT: 1
99. VERB : 1


## Sentence segmentation with spaCy

In [9]:
# Three short sentences used to demonstrate sentence segmentation.
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [10]:
# Iterate over the sentences detected in doc4, printing one per line.
for sent in doc4.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [12]:
# Collect the sentences into a list so they can be indexed by position.
doc_sents = list(doc4.sents)
doc_sents

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [13]:
# start/end are token offsets into doc4: the second sentence spans tokens 6
# up to (but not including) 11.
print(doc_sents[1].start, doc_sents[1].end)

6 11


In [14]:
# Rebind doc4 to a new text that contains a semicolon and a trailing
# attribution, then list the sentences found by the unmodified pipeline:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [15]:
# ADD A NEW RULE TO THE PIPELINE
def set_custom_boundaries(doc):
    """Start a new sentence on the token that follows each semicolon.

    Written as a pipeline component: it receives a Doc, sets
    ``is_sent_start`` to True on the token after every ';', and returns
    the same Doc object.
    """
    # The final token is left out of the scan because nothing follows it.
    for tok in doc[:-1]:
        if tok.text != ';':
            continue
        following = doc[tok.i + 1]
        following.is_sent_start = True
    return doc

# Register the rule ahead of the parser. The membership check keeps this cell
# re-runnable: add_pipe raises ValueError when a component with the same name
# is already in the pipeline.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before='parser')

nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [16]:
# doc4 was created before set_custom_boundaries was added to the pipeline, so
# it still shows the earlier segmentation; an existing Doc is not re-processed.
for sent in doc4.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter Drucker


In [17]:
# Re-run the Doc object creation so the text passes through the updated
# pipeline, which now includes set_custom_boundaries:
doc5 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc5.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things."
-Peter Drucker
