# Part of Speech Tagging

In [107]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [43]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [3]:
print(doc[4])

jumped


In [4]:
print(doc[4].pos_)
print(doc[4].tag_)
# Refer to the tag table for what the tag means (tag_ is the fine-grained tag)
print(doc[4].pos) #without the "_" it will return the numerical id

VERB
VBD
99


In [5]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

The        DET        DT    determiner
quick      ADJ        JJ    adjective
brown      ADJ        JJ    adjective
fox        NOUN       NN    noun, singular or mass
jumped     VERB       VBD   verb, past tense
over       ADP        IN    conjunction, subordinating or preposition
the        DET        DT    determiner
lazy       ADJ        JJ    adjective
dog        NOUN       NN    noun, singular or mass
's         PART       POS   possessive ending
back       NOUN       NN    noun, singular or mass
.          PUNCT      .     punctuation mark, sentence closer


In [6]:
doc = nlp(u"I read books on NLP.")
word = doc[1]

In [7]:
word.text

'read'

In [9]:
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

read       VERB       VBP   verb, non-3rd person singular present


In [11]:
doc = nlp(u"I read a book on NLP.")
word2 = doc[1]
token = word2
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{5}} {spacy.explain(token.tag_)}")
# Notice that spaCy can tell the difference between past and present tense!

read       VERB       VBD   verb, past tense


In [12]:
doc = nlp(u"I read book on NLP.")
word2 = doc[1]
token = word2
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

read       VERB       VBP   verb, non-3rd person singular present


In [13]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
# parts of speech counts
POS_counts = doc.count_by(spacy.attrs.POS)
# this will return a dictionary

In [15]:
POS_counts
# these numbers are the parts of speech code (numerical identifier)

{96: 1, 83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [17]:
doc.vocab[83].text
# This shows the part of speech

'ADJ'

In [18]:
doc[2].pos_

'ADJ'

In [19]:
doc[2].pos

83

In [20]:
# So in POS_counts we see that there are 3 adjectives

In [21]:
for k,v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

83. ADJ   3
84. ADP   1
89. DET   2
91. NOUN  3
93. PART  1
96. PUNCT 1
99. VERB  1


In [26]:
TAG_counts = doc.count_by(spacy.attrs.TAG)
for k,v in sorted(TAG_counts.items()):
    print(f"{k:{10}}. {doc.vocab[k].text:{5}} {v}")

        74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
12646065887601541794. .     1
15267657372422890137. DT    2
15308085513773655218. NN    3
17109001835818727656. VBD   1


In [27]:
len(doc.vocab)

57863

In [30]:
#Syntactic dependencies 
DEP_counts = doc.count_by(spacy.attrs.DEP)
for k,v in sorted(DEP_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

399. amod  3
412. det   2
426. nsubj 1
436. pobj  1
437. poss  1
440. prep  1
442. punct 1
8110129090154140942. case  1
8206900633647566924. ROOT  1


# Visualizing Parts of Speech

In [31]:
doc = nlp(u"The quick brown fox jumped over the lazy dog.")

In [32]:
from spacy import displacy

In [34]:
displacy.render(doc,style='dep',jupyter=True)

In [36]:
# Settings for the displaCy dependency visualizer.
# 'compact' is a real boolean: the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have enabled it as well).
options = {
    'distance': 110,
    'compact': True,
    'color': 'yellow',
    'bg': '#09a3d5',
    'font': 'Times',
}

In [37]:
displacy.render(doc,style='dep',jupyter=True,options=options)

In [38]:
doc2 = nlp(u"This is a sentence. This is another sentence, possibly longer than the other.")

In [39]:
spans = list(doc2.sents)
# grabs a list of sentences automatically
# creating a list of spans

In [40]:
displacy.serve(spans,style='dep',options={'distance':110})


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [21/Jan/2020 11:48:49] "GET / HTTP/1.1" 200 9577
127.0.0.1 - - [21/Jan/2020 11:48:50] "GET /favicon.ico HTTP/1.1" 200 9577



    Shutting down server on port 5000.



# Named Entity Recognition

### Part 1

In [53]:
'''We can add in our own entities!
Refer to the provided notes for more information in the jupyter notebook
02-NER-Named-Entity-Recognition.ipynb'''

'We can add in our own entities!\nRefer to the provided notes for more information in the jupyter notebook\n02-NER-Named-Entity-Recognition.ipynb'

In [51]:
def show_ents(doc):
    """Print every named entity of ``doc`` as ``text - label - explanation``.

    Prints 'No entities found' instead when ``doc.ents`` is empty.
    """
    if not doc.ents:
        print('No entities found')
        return
    for entity in doc.ents:
        # spacy.explain may return None for an unknown label, hence str().
        description = str(spacy.explain(entity.label_))
        print(entity.text + ' - ' + entity.label_ + ' - ' + description)

In [47]:
doc = nlp(u'Hi how are you?')
show_ents(doc)

No entities found


In [52]:
doc = nlp(u"May I go to Washington, DC next May to see the Washington Monument?")
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [55]:
doc = nlp(u"Can I please have 500 dollars of Microsoft stock?")
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [63]:
doc = nlp(u"Telsa to build a U.K. factory for $6 million")
show_ents(doc)
print(doc[0])

Telsa - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit
Telsa


In [59]:
from spacy.tokens import Span

In [62]:
ORG = doc.vocab.strings[u"ORG"]
print(ORG)

381


In [65]:
new_ent = Span(doc,0,1,label=ORG)
doc.ents = list(doc.ents) + [new_ent]

In [66]:
show_ents(doc)

Telsa - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


### Part 2

In [68]:
# Adjacent string literals are concatenated verbatim, so the first literal
# ends with a space to keep the two sentences separated in the text.
doc = nlp(u"Our company created a brand new vacuum cleaner. "
         u"This new vacuum-cleaner is the best in show.")
show_ents(doc)

No entities found


In [69]:
# We want to add "vacuum cleaner" and "vacuum-cleaner" as named entities
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [70]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [71]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [72]:
matcher.add('newproduct', None, *phrase_patterns)

In [75]:
found_matches = matcher(doc)
print(found_matches)

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]


In [76]:
from spacy.tokens import Span

In [77]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [78]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [80]:
new_ents = [Span(doc,match[1],match[2],label=PROD) for match in found_matches]

In [81]:
doc.ents = list(doc.ents) + new_ents

In [82]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [83]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by 10 dollars.")

In [88]:
# How many MONEY entities are mentioned in the doc?
# Collect them once with a list comprehension, then show the entities and their count.
money_ents = [ent for ent in doc.ents if ent.label_ == "MONEY"]
print(money_ents)
print(len(money_ents))

[29.95, 10 dollars]
2


# Visualizing NER

In [89]:
from spacy import displacy

In [92]:
# Adjacent string literals are concatenated verbatim, so the first literal
# ends with a space; otherwise the text would read "...$6 Million.By contrast...".
doc = nlp(u"Over the last quarter Apple sold nearly 20 thousands iPods for a profit of $6 Million. "
         u"By contrast, Sony only sold 8 thousand Walkman music players.")

In [93]:
displacy.render(doc,style='ent',jupyter=True)

In [95]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [97]:
options = {'ents':['PRODUCT']}
displacy.render(doc,style='ent',jupyter=True,options = options)

In [98]:
options = {'ents':['PRODUCT','ORG']}
displacy.render(doc,style='ent',jupyter=True,options = options)

In [101]:
colors = {'ORG':'red','PRODUCT':'#aa9cfc'}
options = {'ents':['PRODUCT','ORG'],'colors':colors}
displacy.render(doc,style='ent',jupyter=True,options = options)

In [105]:

colors = {'ORG':'linear-gradient(90deg, #aa9cfc, #fc9ce7)','PRODUCT':'radial-gradient(yellow, green)'}
options = {'ents':['PRODUCT','ORG'],'colors':colors}
displacy.render(doc,style='ent',jupyter=True,options = options)

In [106]:
displacy.serve(doc,style='ent',options=options)


[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer



127.0.0.1 - - [21/Jan/2020 13:51:10] "GET / HTTP/1.1" 200 2210
127.0.0.1 - - [21/Jan/2020 13:51:10] "GET /favicon.ico HTTP/1.1" 200 2210



    Shutting down server on port 5000.



# Sentence Segmentation

In [108]:
doc = nlp(u"This is the first sentence. This is another sentence. This is the last sentence.")

In [109]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [110]:
# doc.sents is a generator and cannot be indexed.
# The TypeError is caught and printed so the demonstration stays visible
# without stopping a top-to-bottom run of the notebook.
try:
    doc.sents[0]
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: 'generator' object is not subscriptable

In [111]:
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [114]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [115]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [116]:
doc.text

'"Management is doing the right things; leadership is doing the right things." -Peter Drucker'

In [118]:
for sent in doc.sents:
    print(sent)
    print("\n")

"Management is doing the right things; leadership is doing the right things."


-Peter Drucker




In [134]:
# Add a segmentation rule
def set_custom_boundaries(doc):
    """Add a segmentation rule: the token after each ';' starts a new sentence.

    The last token is never inspected because nothing follows it.
    Returns the same ``doc`` object that was passed in.
    """
    for position in range(len(doc) - 1):
        if doc[position].text == ';':
            doc[position + 1].is_sent_start = True
    return doc

In [135]:
# Register the component only once: adding it again under the same name
# raises ValueError [E007] ("already exists in pipeline"), which is what
# happens when this cell is simply re-run.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before = 'parser')

In [136]:
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [137]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." -Peter Drucker')

In [138]:
for sent in doc4.sents:
    print(sent)
    print("\n")

"Management is doing the right things;


leadership is doing the right things."


-Peter Drucker




In [None]:
# change the segmentation rules

In [139]:
nlp = spacy.load('en_core_web_sm')

In [140]:
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."
print(mystring)

This is a sentence. This is another.

This is a 
third sentence.


In [141]:
doc = nlp(mystring)

In [143]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.


This is a 
third sentence.


In [144]:
from spacy.pipeline import SentenceSegmenter

In [145]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, breaking after newline tokens.

    A token whose text starts with a newline character closes the current
    slice; the slice is emitted when the following token is reached, and that
    following token opens the next slice. The remainder of ``doc`` is always
    yielded at the end.
    """
    span_start = 0
    pending_break = False

    for token in doc:
        if not pending_break:
            # Only tokens seen outside a pending break are checked for a newline.
            pending_break = token.text.startswith('\n')
            continue
        yield doc[span_start:token.i]
        span_start = token.i
        pending_break = False
    yield doc[span_start:]

In [146]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)

In [147]:
nlp.add_pipe(sbd)

In [148]:
doc = nlp(mystring)

In [150]:
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another.


This is a 

third sentence.


# POS (Part of Speech) Assessment

In [151]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy