# Parts of Speech Tagging and Named Entity Recognition

In [141]:
# Lesson 1 POS basics
import spacy 
nlp = spacy.load('en_core_web_sm')

In [2]:
doc=nlp(u"The quick brown fox jumped over the lazy dog's back")

In [3]:
doc.text

"The quick brown fox jumped over the lazy dog's back"

In [7]:
doc[3].tag_

'NN'

In [15]:
# Print one aligned row per token: text, coarse POS, fine-grained tag, tag description.
for token in doc:
    # spacy.explain() returns None for a tag it has no description for, and None
    # does not accept a width format spec, so coerce the description to str first.
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {str(spacy.explain(token.tag_)):{10}}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective 
brown      ADJ        JJ         adjective 
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective 
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass


In [16]:
doc=nlp(u"I read books on NLP.")

In [17]:
word=doc[1]

In [18]:
word.text

'read'

In [19]:
token=word
# Coerce the description to str: spacy.explain() returns None for an unknown tag,
# and None does not accept a width format spec.
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {str(spacy.explain(token.tag_)):{10}}")

read       VERB       VBP        verb, non-3rd person singular present


In [24]:
doc=nlp("I read a book on NLP.")

In [25]:
token=doc[1]

In [31]:
# Coerce the description to str: spacy.explain() returns None for an unknown tag,
# and None does not accept a width format spec.
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {str(spacy.explain(token.tag_)):{10}}")

read       VERB       VBD        verb, past tense


spaCy uses the surrounding context to tell the present-tense "read" (VBP) apart from the past-tense "read" (VBD)!

In [27]:
doc=nlp(u"The quick brown fox jumped over the lazy dog's back")

In [28]:
POS_counts=doc.count_by(spacy.attrs.POS)

In [29]:
POS_counts

{83: 3, 99: 1, 84: 1, 89: 2, 91: 3, 93: 1}

In [33]:
doc.vocab[83].text

'ADJ'

# Lesson 3 : Visualizing POS

In [1]:
import spacy

In [2]:
nlp=spacy.load('en_core_web_sm')

In [3]:
doc=nlp(u'The quick brown fox jumped over the lazy dog')

In [4]:
from spacy import displacy

In [5]:
# Inside the network/notebook
displacy.render(doc,style='dep',jupyter=True)

In [6]:
# displaCy rendering options. 'compact' is a boolean flag, so pass a real bool:
# any non-empty string (including 'False') would be truthy and enable it.
options={'distance':110,'compact':True,'color':'yellow','bg':'#09a3d5','font':'Times'}

In [8]:
displacy.render(doc,style='dep',options=options,jupyter=True)

In [11]:
doc2=nlp(u'This is a sentence. This is another sentence,possibly longer than the other')

In [12]:
spans=list(doc2.sents)

In [15]:
# Outside the network/notebook
displacy.serve(spans,style='dep',options={'distance':110})
# View on 127.0.0.1:5000


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [03/Jan/2023 18:09:16] "GET / HTTP/1.1" 200 9576
127.0.0.1 - - [03/Jan/2023 18:09:16] "GET /favicon.ico HTTP/1.1" 200 9576



    Shutting down server on port 5000.



# Lesson 4 : Named Entity Recognition
Person Names <br>
Organizations <br>
Locations <br>
Medical codes <br>
Time expressions <br>
Quantities <br>
Monetary values <br> 
Percentages <br>

In [20]:
# NER with spaCy's built-in tags, plus adding our own custom entities

In [21]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [26]:
def show_ent(doc):
    """Print the named entities of `doc`, one per line.

    Each line has the form ``text - label - explanation``, where the
    explanation is whatever ``spacy.explain`` returns for the label,
    converted to ``str``. If `doc` has no entities, a single
    'No entities found!' line is printed instead.
    """
    if not doc.ents:
        print('No entities found!')
        return
    for entity in doc.ents:
        description = str(spacy.explain(entity.label_))
        print(' - '.join((entity.text, entity.label_, description)))

In [27]:
doc=nlp(u"Hi How are you?")

In [28]:
show_ent(doc)

No entities found!


In [31]:
doc2=nlp(u"May I go to Washington, DC next May to see the Washington Monument?")

In [33]:
show_ent(doc2)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [34]:
doc3=nlp(u"Can I please have $500 of Microsoft stock?")

In [35]:
show_ent(doc3)

500 - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


## NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [37]:
# Add custom Named entity

In [63]:
doc4=nlp(u'Tesla to build a UK factory for $6 million')

In [64]:
show_ent(doc4)

UK - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [65]:
from spacy.tokens import Span

In [66]:
ORG =doc4.vocab.strings[u"ORG"]

In [67]:
ORG

381

In [68]:
new_ent=Span(doc4,0,1,label=ORG)

In [69]:
doc4.ents=list(doc4.ents)+[new_ent]

In [70]:
show_ent(doc4)

Tesla - ORG - Companies, agencies, institutions, etc.
UK - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


# Lesson 5 : Adding NER Phrases 

In [72]:
# Add multiple phrases as named entities
# e.g. tag both "vacuum cleaner" and "vacuum-cleaner" as PRODUCT entities
import spacy
nlp=spacy.load('en_core_web_sm')

In [73]:
doc=nlp(u"Our company created a brand new vacuum cleaner."
       u"This new vacuum-cleaner ")

In [74]:
show_ent(doc)

No entities found!


In [75]:
from spacy.matcher import PhraseMatcher

In [77]:
matcher=PhraseMatcher(nlp.vocab)

In [79]:
phrase_list=['vacuum cleaner','vacuum-cleaner']

In [80]:
phrase_patterns=[nlp(text) for text in phrase_list]

In [82]:
matcher.add('newproduct',None,*phrase_patterns)

In [83]:
found_matches=matcher(doc)
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [84]:
from spacy.tokens import Span

In [85]:
PROD=doc.vocab.strings[u"PRODUCT"]

In [86]:
PROD

384

In [88]:
# Each match is a (match_id, start, end) triple; wrap every hit in a Span labelled PROD.
new_ents=[Span(doc,start,end,label=PROD) for _match_id,start,end in found_matches]

In [90]:
doc.ents=list(doc.ents)+new_ents

In [91]:
show_ent(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [100]:
# Finding counts of Entities in a nlp doc

docx=nlp(u"Originally I paid $29.95 for this car toy,but now it is marked down by 10 dollars")
# how many times money was there?

In [98]:
[ent for ent in docx.ents if ent.label_=="MONEY"]

[29.95, 10 dollars]

In [99]:
len([ent for ent in docx.ents if ent.label_=="MONEY"])

2

# Lesson 6 : Visualizing Named Entity Recognition

In [102]:
import spacy
nlp=spacy.load('en_core_web_sm')

from spacy import displacy

In [111]:
doc=nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of 6 million dollars."
       u"By contrast Sony only sold 8000 Walkman music players")

In [112]:
displacy.render(doc,style='ent',jupyter=True)

In [113]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [119]:
#Filter out which entities you want
options={'ents':['PRODUCT','ORG']}

In [118]:
displacy.render(doc,style='ent',jupyter=True,options=options)

In [123]:
# Change the color used for each entity type
# (hex color codes work as well)

colors={'ORG':'red','PRODUCT':'blue'}

options={'ents':['PRODUCT','ORG'],'colors':colors}
displacy.render(doc,style='ent',jupyter=True,options=options)

In [134]:
# Linear and radial gradients
colors={'ORG':'radial-gradient(yellow,green)','PRODUCT':'blue'}
options={'ents':['PRODUCT','ORG'],'colors':colors}
displacy.render(doc,style='ent',jupyter=True,options=options)

In [139]:
colors={'ORG':'linear-gradient(45deg,yellow,green)','PRODUCT':'blue'}
options={'ents':['PRODUCT','ORG'],'colors':colors}
displacy.render(doc,style='ent',jupyter=True,options=options)

In [140]:
displacy.serve(doc,style='ent',options=options)
# 127.0.0.1:5000


[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer



127.0.0.1 - - [03/Jan/2023 18:48:56] "GET / HTTP/1.1" 200 2146
127.0.0.1 - - [03/Jan/2023 18:48:56] "GET /favicon.ico HTTP/1.1" 200 2146



    Shutting down server on port 5000.



# Lesson 7 : Sentence Segmentation

In [143]:
import spacy
nlp=spacy.load('en_core_web_sm')


In [146]:
doc=nlp(u'This is a sentence. This is another sentence. This is the last sentence.')

In [147]:
for sent in doc.sents:
    print(sent)

This is a sentence.
This is another sentence.
This is the last sentence.


In [148]:
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [150]:
doc[0]

This

In [154]:
list(doc.sents)[0]

This is a sentence.

In [153]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [156]:
doc=nlp(u'"Management is doing the right things; leadership is doing the right things."-Peter Drucker')

In [157]:
doc.text

'"Management is doing the right things; leadership is doing the right things."-Peter Drucker'

In [158]:
for sent in doc.sents:
    print(sent)
    print('\n')

"Management is doing the right things; leadership is doing the right things."-


Peter Drucker




In [163]:
# Add a custom rule to the pipeline that runs while the text is processed:
# start a new sentence after every semicolon
def set_custom_boundaries(doc):
    """Mark the token following each ';' as the start of a sentence.

    The last token is skipped because nothing follows it. `doc` is
    modified in place and also returned.
    """
    semicolons = (tok for tok in doc[:-1] if tok.text == ";")
    for tok in semicolons:
        following = doc[tok.i + 1]
        following.is_sent_start = True
    return doc

In [164]:
nlp.add_pipe(set_custom_boundaries,before="parser")
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [165]:
doc4=nlp(u'"Management is doing the right things; leadership is doing the right things."-Peter Drucker')

In [167]:
for sent in doc4.sents:
    print(sent)

"Management is doing the right things;
leadership is doing the right things."-
Peter Drucker


In [168]:
# change segmentation rule
nlp=spacy.load('en_core_web_sm')

In [176]:
mstring=u"This is a sentence.This is another. \n\nThis is a \nthird sentence"

In [174]:
print(mstring)

This is a sentence.This is another. 

This is a 
third sentence


In [177]:
doc=nlp(mstring)

In [179]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.
This is another. 


This is a 
third sentence


In [None]:
# We only want a newline ("\n") to indicate the start of a new segment

In [180]:
from spacy.pipeline import SentenceSegmenter

In [181]:
def split_on_new_lines(doc):
    """Yield consecutive slices of `doc`, split after newline tokens.

    A token whose text starts with '\\n' ends the current segment; the
    segment is emitted when the next non-whitespace token is reached, so
    the newline token(s) stay attached to the segment they terminate.

    Yields:
        Slices ``doc[start:end]`` that together cover `doc` in order.
        Nothing is yielded for an empty `doc`.
    """
    start=0
    seen_newline=False

    for word in doc:
        # Only open a new segment on a real (non-whitespace) token. Splitting on a
        # whitespace token would create whitespace-led or whitespace-only segments
        # and lose track of a newline that immediately follows another one.
        if seen_newline and not word.is_space:
            yield doc[start:word.i]
            start=word.i
            seen_newline=False
        elif word.text.startswith('\n'):
            seen_newline=True
    # Emit the tail only if there is one, so an empty doc yields no empty slice.
    if start<len(doc):
        yield doc[start:]

In [182]:
sbd=SentenceSegmenter(nlp.vocab,strategy=split_on_new_lines)

In [183]:
nlp.add_pipe(sbd)

In [185]:
nlp.pipe_names

['tagger', 'parser', 'ner', 'sbd']

In [184]:
doc=nlp(mstring)

In [187]:
for sentence in doc.sents:
    print(sentence)

This is a sentence.This is another. 


This is a 

third sentence


In [192]:
sentence3=list(doc.sents)[2]