## NER with NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
sentence = '''European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'''

In [3]:
# tokenize with pos
sentence_pos = nltk.pos_tag(nltk.word_tokenize(sentence))

In [4]:
sentence_pos

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [5]:
from nltk import ne_chunk

# nltk.download('words')
# nltk.download('maxent_ne_chunker')
 
ner = ne_chunk(sentence_pos)

In [6]:
print(ner.__repr__())

Tree('S', [Tree('GPE', [('European', 'JJ')]), ('authorities', 'NNS'), ('fined', 'VBD'), Tree('PERSON', [('Google', 'NNP')]), ('a', 'DT'), ('record', 'NN'), ('$', '$'), ('5.1', 'CD'), ('billion', 'CD'), ('on', 'IN'), ('Wednesday', 'NNP'), ('for', 'IN'), ('abusing', 'VBG'), ('its', 'PRP$'), ('power', 'NN'), ('in', 'IN'), ('the', 'DT'), ('mobile', 'JJ'), ('phone', 'NN'), ('market', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), ('the', 'DT'), ('company', 'NN'), ('to', 'TO'), ('alter', 'VB'), ('its', 'PRP$'), ('practices', 'NNS')])


In [7]:
# import os

# path_to_gs = ";C:\Program Files\gs\gs9.51\bin;"
# path_to_gs_32 = ";C:\Program Files (x86)\gs\gs9.51\bin;"
# os.environ['PATH'] += os.pathsep + path_to_gs + path_to_gs_32
# ner2 = ne_chunk(sentence_pos)
# ner2

## NER with SpaCy

In [8]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [9]:
from pprint import pprint
doc = nlp(sentence)
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [10]:
''' "B" means the token begins an entity,
    "I" means it is inside an entity,
    "O" means it is outside an entity,
    and "" means no entity tag is set.'''

pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [11]:
# task
# Scrape Data from a news article (BBC news, thelocal.fr, or even wikipedia)
# fetch the entities being discussed
# Hint: use "requests" to fetch data from a URL
# Install and use bs4.BeautifulSoup to scrape html

from bs4 import BeautifulSoup

In [25]:
import urllib.request
with urllib.request.urlopen('http://thelocal.fr/') as response:
    html = response.read()

In [26]:
soup = BeautifulSoup(html, 'html.parser')

In [27]:
print(soup.prettify())

<html>
 <body>
  <div style="position:absolute;top:100px;text-align:center;margin-left:auto;margin-right:auto;left:0;right:0;width:500px;font-family:Roboto,sans-serif;">
   <img alt="Project Shield Logo" height="50px" src="https://storage.googleapis.com/ddos-shield.appspot.com/shield-logo-mono-darktext.svg" width="250px"/>
   <p style="font-size:18px;">
    You will be connected to
    <b>
     thelocal.fr
    </b>
    in just a moment...
   </p>
   <p>
    <a href="https://g.co/shield">
     Learn about Project Shield
    </a>
   </p>
  </div>
  <script src="https://storage.googleapis.com/ddos-shield.appspot.com/aes.js" type="text/javascript">
  </script>
  <script>
   function toNumbers(d){var e=[];d.replace(/(..)/g,function(d){e.push(parseInt(d,16))});return e;}function toHex(){for(var d=[],d=1==arguments.length&&arguments[0].constructor==Array?arguments[0]:arguments,e="",f=0;f<d.length;f++)e+=(16>d[f]?"0":"")+d[f].toString(16);return e.toLowerCase()}var a=toNumbers("aa5a872fb72170d

In [28]:
soup.title

In [29]:
for link in soup.find_all('a'):
    print(link.get('href'))

https://g.co/shield


In [30]:
print(soup.get_text())

You will be connected to thelocal.fr in just a moment...Learn about Project Shieldfunction toNumbers(d){var e=[];d.replace(/(..)/g,function(d){e.push(parseInt(d,16))});return e;}function toHex(){for(var d=[],d=1==arguments.length&&arguments[0].constructor==Array?arguments[0]:arguments,e="",f=0;f<d.length;f++)e+=(16>d[f]?"0":"")+d[f].toString(16);return e.toLowerCase()}var a=toNumbers("aa5a872fb72170d33276857db27ea0d7"),b=toNumbers("612febbe4bc3f33fa619d5a6c5a5fda8"),c=toNumbers("bda7581f131981ec65503d0bce7102fc");document.cookie="STC="+toHex(slowAES.decrypt(c,2,a,b))+"; expires=Thu, 31-Dec-37 23:55:55 GMT; domain=.thelocal.fr; path=/";location.href="http://thelocal.fr/?sckattempt=1".replace(new RegExp("sckattempt=[0-9]\&"), "").replace(new RegExp("[?&]sckattempt=[0-9]"), "");


In [31]:
text_soup = soup.get_text()

In [32]:
text_soup_pos = nltk.pos_tag(nltk.word_tokenize(text_soup))

In [33]:
text_soup_pos

[('You', 'PRP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('connected', 'VBN'),
 ('to', 'TO'),
 ('thelocal.fr', 'VB'),
 ('in', 'IN'),
 ('just', 'RB'),
 ('a', 'DT'),
 ('moment', 'NN'),
 ('...', ':'),
 ('Learn', 'NNP'),
 ('about', 'IN'),
 ('Project', 'NNP'),
 ('Shieldfunction', 'NNP'),
 ('toNumbers', 'NNS'),
 ('(', '('),
 ('d', 'NN'),
 (')', ')'),
 ('{', '('),
 ('var', 'JJ'),
 ('e=', 'NN'),
 ('[', 'NNP'),
 (']', 'NNP'),
 (';', ':'),
 ('d.replace', 'NN'),
 ('(', '('),
 ('/', 'NNP'),
 ('(', '('),
 ('..', 'NNP'),
 (')', ')'),
 ('/g', 'NN'),
 (',', ','),
 ('function', 'NN'),
 ('(', '('),
 ('d', 'NN'),
 (')', ')'),
 ('{', '('),
 ('e.push', 'NN'),
 ('(', '('),
 ('parseInt', 'NN'),
 ('(', '('),
 ('d,16', 'NN'),
 (')', ')'),
 (')', ')'),
 ('}', ')'),
 (')', ')'),
 (';', ':'),
 ('return', 'VB'),
 ('e', 'NN'),
 (';', ':'),
 ('}', ')'),
 ('function', 'NN'),
 ('toHex', 'NNS'),
 ('(', '('),
 (')', ')'),
 ('{', '('),
 ('for', 'IN'),
 ('(', '('),
 ('var', 'JJ'),
 ('d=', 'NN'),
 ('[', 'NNP'),
 (']', 'NNP'),
 ('

In [34]:
ner = ne_chunk(text_soup_pos)

In [35]:
print(ner.__repr__())

Tree('S', [('You', 'PRP'), ('will', 'MD'), ('be', 'VB'), ('connected', 'VBN'), ('to', 'TO'), ('thelocal.fr', 'VB'), ('in', 'IN'), ('just', 'RB'), ('a', 'DT'), ('moment', 'NN'), ('...', ':'), ('Learn', 'NNP'), ('about', 'IN'), Tree('PERSON', [('Project', 'NNP'), ('Shieldfunction', 'NNP')]), ('toNumbers', 'NNS'), ('(', '('), ('d', 'NN'), (')', ')'), ('{', '('), ('var', 'JJ'), ('e=', 'NN'), ('[', 'NNP'), (']', 'NNP'), (';', ':'), ('d.replace', 'NN'), ('(', '('), ('/', 'NNP'), ('(', '('), ('..', 'NNP'), (')', ')'), ('/g', 'NN'), (',', ','), ('function', 'NN'), ('(', '('), ('d', 'NN'), (')', ')'), ('{', '('), ('e.push', 'NN'), ('(', '('), Tree('ORGANIZATION', [('parseInt', 'NN')]), ('(', '('), ('d,16', 'NN'), (')', ')'), (')', ')'), ('}', ')'), (')', ')'), (';', ':'), ('return', 'VB'), ('e', 'NN'), (';', ':'), ('}', ')'), ('function', 'NN'), ('toHex', 'NNS'), ('(', '('), (')', ')'), ('{', '('), ('for', 'IN'), ('(', '('), ('var', 'JJ'), ('d=', 'NN'), ('[', 'NNP'), (']', 'NNP'), (',', ','), (

In [36]:
doc = nlp(text_soup)
pprint([(X.text, X.label_) for X in doc.ents])

[('e="",f=0;f', 'CARDINAL'),
 ('toNumbers("612febbe4bc3f33fa619d5a6c5a5fda8"),c', 'CARDINAL'),
 ('31-Dec-37 23:55:55 GMT', 'FAC'),
 ('path=/";location.href="http://thelocal.fr/?sckattempt=1".replace(new '
  'RegExp("sckattempt=[0-9]\\&"',
  'ORG')]


In [37]:
''' "B" means the token begins an entity,
    "I" means it is inside an entity,
    "O" means it is outside an entity,
    and "" means no entity tag is set.'''

pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(You, 'O', ''),
 (will, 'O', ''),
 (be, 'O', ''),
 (connected, 'O', ''),
 (to, 'O', ''),
 (thelocal.fr, 'O', ''),
 (in, 'O', ''),
 (just, 'O', ''),
 (a, 'O', ''),
 (moment, 'O', ''),
 (..., 'O', ''),
 (Learn, 'O', ''),
 (about, 'O', ''),
 (Project, 'O', ''),
 (Shieldfunction, 'O', ''),
 (toNumbers(d){var, 'O', ''),
 (e=[];d.replace(/(, 'O', ''),
 (.., 'O', ''),
 ()/g, 'O', ''),
 (,, 'O', ''),
 (function(d){e.push(parseInt(d,16))});return, 'O', ''),
 (e;}function, 'O', ''),
 (toHex(){for(var, 'O', ''),
 (d=[],d=1==arguments.length&&arguments[0].constructor==Array?arguments[0]:arguments,
  'O',
  ''),
 (,, 'O', ''),
 (e="",f=0;f, 'B', 'CARDINAL'),
 (<, 'O', ''),
 (d.length;f++)e+=(16, 'O', ''),
 (>, 'O', ''),
 (d[f]?"0":"")+d[f].toString(16);return, 'O', ''),
 (e.toLowerCase()}var, 'O', ''),
 (a, 'O', ''),
 (=, 'O', ''),
 (toNumbers("aa5a872fb72170d33276857db27ea0d7"),b, 'O', ''),
 (=, 'O', ''),
 (toNumbers("612febbe4bc3f33fa619d5a6c5a5fda8"),c, 'B', 'CARDINAL'),
 (=, 'O', ''),
 (toNumb