# Named Entity Recognition (NER)


In [1]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Write a function to display basic entity info:
def show_ents(doc):
    """Print one line per named entity in `doc`: text, label and label explanation.

    Each line has the form ``<text> - <label> - <explanation>``, where the
    explanation is whatever `spacy.explain` returns for the label.
    If `doc.ents` is empty, a single 'No named entities found.' line is
    printed instead.
    """
    # Guard clause: nothing to list, say so and stop.
    if not doc.ents:
        print('No named entities found.')
        return

    for entity in doc.ents:
        explanation = spacy.explain(entity.label_)
        print(f"{entity.text} - {entity.label_} - {explanation}")

In [3]:
doc = nlp(u'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')

show_ents(doc)

European - NORP - Nationalities or religious or political groups
Google - ORG - Companies, agencies, institutions, etc.
$5.1 billion - MONEY - Monetary values, including unit
Wednesday - DATE - Absolute or relative dates or periods


## Entity annotations
`Doc.ents` are token spans with their own set of annotations.
<table>
<tr><td>`ent.text`</td><td>The original entity text</td></tr>
<tr><td>`ent.label`</td><td>The entity type's hash value</td></tr>
<tr><td>`ent.label_`</td><td>The entity type's string label (e.g. `ORG`)</td></tr>
<tr><td>`ent.start`</td><td>The token span's *start* token index in the Doc</td></tr>
<tr><td>`ent.end`</td><td>The token span's *stop* token index in the Doc (exclusive)</td></tr>
<tr><td>`ent.start_char`</td><td>The entity text's *start* character offset in the Doc</td></tr>
<tr><td>`ent.end_char`</td><td>The entity text's *stop* character offset in the Doc (exclusive)</td></tr>
</table>



In [4]:
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')

# One line per entity: text, token start/end, character start/end, label.
for ent in doc.ents:
    fields = (ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)
    print(*fields)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


## NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [5]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block forever on a stalled connection.

    Returns:
        The page text with <script>, <style> and <aside> elements removed and
        every run of newlines/tabs replaced by a single space.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove these elements so their contents do not end up in the extracted text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [6]:
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

158

In [7]:
show_ents(article)

Peter Strzok - PERSON - People, including fictional
Texts - GPE - Countries, cities, states
The New York Times - ORG - Companies, agencies, institutions, etc.
Peter Strzok - PERSON - People, including fictional
Texts - GPE - Countries, cities, states
byContinue - GPE - Countries, cities, states
Peter Strzok - PERSON - People, including fictional
Texts - GPE - Countries, cities, states
FiredPeter Strzok - PERSON - People, including fictional
F.B.I. - ORG - Companies, agencies, institutions, etc.
Trump - PERSON - People, including fictional
T.J. Kirkpatrick - PERSON - People, including fictional
The New York - ORG - Companies, agencies, institutions, etc.
Adam Goldman - PERSON - People, including fictional
Michael S. SchmidtAug - PERSON - People, including fictional
13 - CARDINAL - Numerals that do not fall under another type
2018WASHINGTON - CARDINAL - Numerals that do not fall under another type
Peter Strzok - PERSON - People, including fictional
F.B.I. - ORG - Companies, agencies, ins

___
## Adding a Named Entity to a Span


In [8]:
doc = nlp(u'TATA MOTORS to build a electric cars factory in INDIA $6 million')

show_ents(doc)

INDIA - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [9]:
from spacy.tokens import Span

# Get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']  

In [10]:
ORG

383

In [11]:
# Create a Span for the new entity
new_ent = Span(doc, 0, 2, label=ORG)

In [12]:
new_ent

TATA MOTORS

In [13]:
# Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]

In [14]:
show_ents(doc)

TATA MOTORS - ORG - Companies, agencies, institutions, etc.
INDIA - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


___
## Adding Named Entities to All Matching Spans


In [15]:
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum-cleaner will be our first product.')

show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [16]:
# Import PhraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [17]:
# Create the desired phrase patterns:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [18]:
# Apply the patterns to our matcher object:
matcher.add('newproduct', None, *phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

# See what matches occur:
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 17)]

In [19]:
# Here we create Spans from each match, and create named entities from them:
from spacy.tokens import Span

# Look up the vocabulary's value for the PRODUCT label string.
PROD = doc.vocab.strings[u'PRODUCT']

# Each match is a (match_id, start, end) triple; turn every matched
# token range into a Span labelled PRODUCT.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]

# Re-assign the full entity list: the existing entities plus the new spans.
doc.ents = [*doc.ents, *new_ents]

In [20]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


___
## Counting Entities


In [21]:
# Parse a short sample sentence.
# NOTE(review): `doc` is not used by the call below or by the following
# count cell, which both operate on `article` — confirm whether this
# assignment is still needed or whether `show_ents(doc)` was intended.
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

# Lists the entities of `article` (the parsed web page), not of `doc`.
show_ents(article)

Peter Strzok - PERSON - People, including fictional
Texts - GPE - Countries, cities, states
The New York Times - ORG - Companies, agencies, institutions, etc.
Peter Strzok - PERSON - People, including fictional
Texts - GPE - Countries, cities, states
byContinue - GPE - Countries, cities, states
Peter Strzok - PERSON - People, including fictional
Texts - GPE - Countries, cities, states
FiredPeter Strzok - PERSON - People, including fictional
F.B.I. - ORG - Companies, agencies, institutions, etc.
Trump - PERSON - People, including fictional
T.J. Kirkpatrick - PERSON - People, including fictional
The New York - ORG - Companies, agencies, institutions, etc.
Adam Goldman - PERSON - People, including fictional
Michael S. SchmidtAug - PERSON - People, including fictional
13 - CARDINAL - Numerals that do not fall under another type
2018WASHINGTON - CARDINAL - Numerals that do not fall under another type
Peter Strzok - PERSON - People, including fictional
F.B.I. - ORG - Companies, agencies, ins

In [22]:
# Count the entities in the article whose label is DATE.
sum(1 for entity in article.ents if entity.label_ == 'DATE')

19

# Visualizing Named Entities

In [23]:
# Import the displaCy library
from spacy import displacy

In [24]:
displacy.render(article, style='ent', jupyter=True)

In [25]:
options = {'ents': ['ORG', 'PRODUCT']}

displacy.render(article, style='ent', jupyter=True, options=options)

# Using NLTK

In [28]:
# NER with NLTK: download the resources the tokenize -> tag -> chunk cell needs.

import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
# This line previously had a stray leading space, which raises
# "IndentationError: unexpected indent" when the cell is run.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Shubham\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Shubham\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shubham\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [30]:


#my_sent = "WASHINGTON -- In the wake of a string of abuses by New York police officers in the 1990s, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement."
my_sent= "In 1999, Vajpayee laid the foundation for the GoldenQuadrilateralHighway project, which would link four major cities: Delhi, Mumbai, Chennai and Kolkata."
#my_sent="“Indians is a football country now,” FIFA president Giani Infantino declared after arriving here to chair the FIFA Council meeting on Friday and attend the U-17 World Cup final."

# For each sentence: tokenize, POS-tag, then run the named-entity chunker.
# Only chunks that expose a `label` attribute are printed, as "<LABEL> <words>".
for sentence in nltk.sent_tokenize(my_sent):
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    for node in nltk.ne_chunk(tagged_tokens):
        if not hasattr(node, 'label'):
            continue
        entity_words = [leaf[0] for leaf in node]
        print(node.label(), ' '.join(entity_words))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shubham\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


PERSON Vajpayee
ORGANIZATION GoldenQuadrilateralHighway
PERSON Delhi
GPE Mumbai
PERSON Chennai
PERSON Kolkata
