## POS Tagging

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u"The quick lazy brown fox jumped over the dog's back")

In [4]:
print(doc.text)

The quick lazy brown fox jumped over the dog's back


In [6]:
print(doc[4].pos_)

NOUN


In [7]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective
lazy       ADJ        JJ         adjective
brown      ADJ        JJ         adjective
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass


In [8]:
doc = nlp(u"I read books on NLP")

In [9]:
word = doc[1]

In [10]:
word.text

'read'

In [11]:
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBP        verb, non-3rd person singular present


In [12]:
doc = nlp(u"I read a book on NLP")

In [13]:
word = doc[1]

token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

read       VERB       VBD        verb, past tense


In [14]:
doc = nlp(u"The quick lazy brown fox jumped over the dog's back")

In [15]:
POS_COUNTS = doc.count_by(spacy.attrs.POS)

In [16]:
POS_COUNTS

{83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [17]:
doc.vocab[83].text

'ADJ'

In [18]:
for k,v in sorted(POS_COUNTS.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

83. ADJ   3
84. ADP   1
89. DET   2
91. NOUN  3
93. PART  1
99. VERB  1


In [19]:
TAG_counts = doc.count_by(spacy.attrs.TAG)

for k,v in sorted(TAG_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
15267657372422890137. DT    2
15308085513773655218. NN    3
17109001835818727656. VBD   1


In [20]:
# Syntactic dependency

DEP_counts = doc.count_by(spacy.attrs.DEP)

for k,v in sorted(DEP_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

399. amod  3
412. det   2
426. nsubj 1
436. pobj  1
437. poss  1
440. prep  1
8110129090154140942. case  1
8206900633647566924. ROOT  1


In [21]:
doc = nlp(u"The quick brown fox jumped over the lazy dog.")

In [22]:
from spacy import displacy

In [23]:
displacy.render(doc, style='dep',jupyter=True)

In [24]:
# Appearance options handed to displaCy for the dependency view.
# `compact` is a real boolean: the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have enabled compact mode too).
options = {'distance':110,'compact':True,'color':'yellow','bg':'#09a3d5','font':'Times New Roman'}

In [25]:
displacy.render(doc, style='dep',jupyter=True, options = options)

In [26]:
# Two-sentence sample used for the per-sentence visualisation below.
# "seb=ntence" was a typo that put a stray '=' into the text being parsed.
doc2 = nlp(u"This is a sentence. This is another sentence, probably longer than first one")

In [27]:
spans = list(doc2.sents)

In [None]:
displacy.serve(spans, style='dep', options={'distance':110})


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [22/Feb/2021 10:37:13] "GET / HTTP/1.1" 200 11065
127.0.0.1 - - [22/Feb/2021 10:37:13] "GET /favicon.ico HTTP/1.1" 200 11065


## Named Entity Recognition

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [11]:
def show_ents(doc):
    """Print every named entity of ``doc`` on its own line.

    Each line has the form ``<entity text> - <label> - <label explanation>``,
    where the explanation is ``spacy.explain`` of the label converted to a
    string. When ``doc.ents`` is empty, a single "No entities found" line is
    printed instead.
    """
    if not doc.ents:
        print("No entities found")
        return

    for entity in doc.ents:
        label = entity.label_
        description = str(spacy.explain(label))
        print(f"{entity.text} - {label} - {description}")

In [12]:
doc = nlp(u"Hi how are you")

In [13]:
show_ents(doc)

No entities found


In [14]:
doc = nlp(u"May I go to Washinyon DC , next May to see the Washinton Monument?")

In [15]:
show_ents(doc)

Washinyon DC - PERSON - People, including fictional
next May - DATE - Absolute or relative dates or periods
the Washinton Monument - WORK_OF_ART - Titles of books, songs, etc.


In [16]:
doc = nlp(u"Can i please 500 dollar of Microsoft stocks")

In [17]:
show_ents(doc)

500 dollar - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [18]:
doc = nlp(u"Tesla to build a U.K. factory for $ 6 million")

In [19]:
show_ents(doc)

U.K. - GPE - Countries, cities, states
$ 6 million - MONEY - Monetary values, including unit


In [20]:
from spacy.tokens import Span

In [22]:
ORG = doc.vocab.strings[u"ORG"]

In [23]:
ORG

381

In [24]:
new_ent = Span(doc,0,1,label=ORG)

In [25]:
doc.ents = list(doc.ents) + [new_ent]

In [26]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$ 6 million - MONEY - Monetary values, including unit


### Adding all Entities to NER

In [61]:
# Adjacent string literals are joined verbatim, so the first one must end with a
# space; without it the text read "...vaccum cleaner.This new..." with the two
# sentences glued together.
doc = nlp(u"Our company created a brand new vaccum cleaner. "
         u"This new vaccum-cleaner is the best in show.")

In [62]:
show_ents(doc)

No entities found


In [63]:
from spacy.matcher import PhraseMatcher

In [64]:
matcher = PhraseMatcher(nlp.vocab)

In [65]:
phrase_list = ['vaccum cleaner','vaccum-cleaner']

In [66]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [67]:
phrase_patterns

[vaccum cleaner, vaccum-cleaner]

In [68]:
matcher.add('newproduct',None,*phrase_patterns)

In [69]:
found_matches = matcher(doc)

In [70]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [71]:
from spacy.tokens import Span

In [72]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [76]:
PROD

384

In [77]:
new_ents = [Span(doc, match[1],match[2],label=PROD) for match in found_matches]

In [78]:
new_ents

[vaccum cleaner, vaccum-cleaner]

In [82]:
doc.ents = list(doc.ents) + new_ents

In [83]:
show_ents(doc)

vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vaccum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [84]:
doc =nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by $10.0")

In [86]:
len([ent for ent in doc.ents if ent.label_ == 'MONEY'])

2

### Visualize NER

In [87]:
from spacy import displacy

In [97]:
doc = nlp(u"Over the last quater Apple sold nearly 20 thousands iPods for a profit of $6 million. "
         u"By contrast Sony only sold 8 thousands Walkman player.")

In [98]:
displacy.render(doc, style='ent',jupyter=True)

In [99]:
# Render the entities one sentence at a time. The loop variable has to be the
# thing that is rendered: passing `doc` re-drew the whole document once per
# sentence instead of showing each sentence on its own.
for sent in doc.sents:
    displacy.render(sent, style='ent',jupyter=True)

In [110]:
# Entity labels and colours handed to displaCy through `options`.
# Previously tried ORG colour: 'radial-gradient(yellow,red)'
colors = dict(ORG='linear-gradient(90deg, #aa9cfc,#fc9ce7)')
options = dict(ents=['PRODUCT', 'ORG'], colors=colors)

In [111]:
displacy.render(doc, style='ent',jupyter=True, options=options)

In [117]:
displacy.serve(doc,style='ent',options=options)


[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer


    Shutting down server on port 5000.



### Sentence Segmentation

In [115]:
doc = nlp(u'This is the first sentence. This is the second sentence.This is the last sentence ')

In [116]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is the second sentence.
This is the last sentence


In [118]:
list(doc.sents)[0]

This is the first sentence.

In [122]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things."-Peter Drucker')

In [123]:
doc.text

'"Management is doing the right things; leadership is doing the right things."-Peter Drucker'

In [124]:
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."-


Peter Drucker




In [125]:
# Add the segmentation rule

In [138]:
def set_custom_boundaries(doc):
    """Pipeline component that starts a new sentence after every semicolon.

    For each token whose text is ";", the token immediately following it is
    flagged with ``is_sent_start = True``. The last token is left out of the
    scan because nothing follows it: looking up ``doc[token.i+1]`` for a
    trailing ";" would raise IndexError.

    Returns the same ``doc`` object that was passed in.
    """
    for token in doc[:-1]:
        if token.text == ";":
            doc[token.i+1].is_sent_start = True
    return doc

In [139]:
# Set custom boundary in the pipeline, ahead of the parser.
# The membership check keeps this cell safe to re-run: the component is only
# registered when its name is not already present in the pipeline.
if 'set_custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries,before='parser')

nlp.pipe_names

['tagger', 'set_custome_boundaries', 'set_custom_boundaries', 'parser', 'ner']

In [140]:
doc[:-1]

"Management is doing the right things; leadership is doing the right things."-Peter

In [141]:
doc4= nlp(u'"Management is doing the right things; leadership is doing the right things."-Peter Drucker')

In [142]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."-
Peter Drucker


In [None]:
# Change segmentation rules

In [144]:
nlp = spacy.load('en_core_web_sm')

In [150]:
mystring = u"This is a sentence. This is another.\n\n This is a\n third sentence"

In [151]:
print(mystring)

This is a sentence. This is another.

 This is a
 third sentence


In [152]:
doc = nlp(mystring)

In [153]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another.

 
This is a
 third sentence


In [154]:
from spacy.pipeline import SentenceSegmenter

In [155]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, cutting after each newline token.

    A token whose text starts with a newline character ends the current slice.
    The slice is emitted when the following token is reached, so the newline
    token stays at the end of the slice it terminates. Whatever remains after
    the loop is emitted as the last slice; nothing is yielded for an empty
    ``doc``.
    """
    start = 0  # index of the first token of the slice being built
    seen_newline = False

    for word in doc:
        if seen_newline:
            # The previous token was a newline: close the slice just before `word`.
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'):
            seen_newline = True

    # Emit the remainder, but never an empty trailing slice.
    if start < len(doc):
        yield doc[start:]

In [156]:
sbd = SentenceSegmenter(nlp.vocab,strategy=split_on_newlines)

In [157]:
nlp.add_pipe(sbd)

In [159]:
doc = nlp(mystring)

In [160]:
for sentence in doc.sents:
    print(sentence)

This is a sentence. This is another.

 
This is a
 
third sentence
