### Import dependencies to use NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
# Sample news excerpt used as the input text for the tokenizing, tagging and
# NER cells in this notebook.
# Note: the literal begins and ends with a newline character.
demo1 = """
Samsung told owners of its Galaxy Note 4 phone to install a new version of Google’s Android operating system intended for the more recent Galaxy Note 7, but which users claimed rendered the old model sluggish.

Likewise, Apple told iPhone 6 owners to install an operating system designed for the iPhone 7, leading to problems for owners of the older model.

Both firms were issued the maximum fine of €5m each and ordered to display a notice on their Italian websites informing customers of the watchdog’s decision.

Apple was fined an additional €5m for failing to give customers clear information about “essential” characteristics of lithium batteries, including their average life expectancy, how to maintain them or eventually replace them in the firm’s iPhones.
"""

### Tokenize the sentences into words and perform part-of-speech tagging

In [3]:
def preprocess_v1(text):
    """Tokenize ``text`` with NLTK and part-of-speech tag the tokens.

    Args:
        text: The raw string to process.

    Returns:
        A ``(tokens, tagged)`` pair, where ``tokens`` is the result of
        ``nltk.word_tokenize(text)`` and ``tagged`` is the result of
        ``nltk.pos_tag`` applied to those tokens.
    """
    tokens = nltk.word_tokenize(text)
    return tokens, nltk.pos_tag(tokens)

In [4]:
# l1: token list, t1: (token, POS tag) pairs for the sample text.
l1, t1 = preprocess_v1(demo1)

In [5]:
l1

['Samsung',
 'told',
 'owners',
 'of',
 'its',
 'Galaxy',
 'Note',
 '4',
 'phone',
 'to',
 'install',
 'a',
 'new',
 'version',
 'of',
 'Google',
 '’',
 's',
 'Android',
 'operating',
 'system',
 'intended',
 'for',
 'the',
 'more',
 'recent',
 'Galaxy',
 'Note',
 '7',
 ',',
 'but',
 'which',
 'users',
 'claimed',
 'rendered',
 'the',
 'old',
 'model',
 'sluggish',
 '.',
 'Likewise',
 ',',
 'Apple',
 'told',
 'iPhone',
 '6',
 'owners',
 'to',
 'install',
 'an',
 'operating',
 'system',
 'designed',
 'for',
 'the',
 'iPhone',
 '7',
 ',',
 'leading',
 'to',
 'problems',
 'for',
 'owners',
 'of',
 'the',
 'older',
 'model',
 '.',
 'Both',
 'firms',
 'were',
 'issued',
 'the',
 'maximum',
 'fine',
 'of',
 '€5m',
 'each',
 'and',
 'ordered',
 'to',
 'display',
 'a',
 'notice',
 'on',
 'their',
 'Italian',
 'websites',
 'informing',
 'customers',
 'of',
 'the',
 'watchdog',
 '’',
 's',
 'decision',
 '.',
 'Apple',
 'was',
 'fined',
 'an',
 'additional',
 '€5m',
 'for',
 'failing',
 'to',
 'g

In [6]:
t1

[('Samsung', 'NNP'),
 ('told', 'VBD'),
 ('owners', 'NNS'),
 ('of', 'IN'),
 ('its', 'PRP$'),
 ('Galaxy', 'NNP'),
 ('Note', 'NNP'),
 ('4', 'CD'),
 ('phone', 'NN'),
 ('to', 'TO'),
 ('install', 'VB'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('version', 'NN'),
 ('of', 'IN'),
 ('Google', 'NNP'),
 ('’', 'NNP'),
 ('s', 'VBD'),
 ('Android', 'NNP'),
 ('operating', 'VBG'),
 ('system', 'NN'),
 ('intended', 'VBN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('more', 'RBR'),
 ('recent', 'JJ'),
 ('Galaxy', 'NNP'),
 ('Note', 'NNP'),
 ('7', 'CD'),
 (',', ','),
 ('but', 'CC'),
 ('which', 'WDT'),
 ('users', 'NNS'),
 ('claimed', 'VBD'),
 ('rendered', 'VBD'),
 ('the', 'DT'),
 ('old', 'JJ'),
 ('model', 'NN'),
 ('sluggish', 'JJ'),
 ('.', '.'),
 ('Likewise', 'NNP'),
 (',', ','),
 ('Apple', 'NNP'),
 ('told', 'VBD'),
 ('iPhone', 'NN'),
 ('6', 'CD'),
 ('owners', 'NNS'),
 ('to', 'TO'),
 ('install', 'VB'),
 ('an', 'DT'),
 ('operating', 'NN'),
 ('system', 'NN'),
 ('designed', 'VBN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('iPhone', 'NN'),

In [7]:
# Chunk grammar: an NP (noun-phrase) chunk is an optional determiner (DT),
# followed by any number of adjectives (JJ), followed by a noun tagged NN.
# Note: this is noun-phrase chunking, not named-entity recognition.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [8]:
# Build a regexp chunk parser from the grammar and apply it to the POS-tagged
# tokens; the printed tree groups matching tokens under NP nodes.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(t1)
print(cs)

(S
  Samsung/NNP
  told/VBD
  owners/NNS
  of/IN
  its/PRP$
  Galaxy/NNP
  Note/NNP
  4/CD
  (NP phone/NN)
  to/TO
  install/VB
  (NP a/DT new/JJ version/NN)
  of/IN
  Google/NNP
  ’/NNP
  s/VBD
  Android/NNP
  operating/VBG
  (NP system/NN)
  intended/VBN
  for/IN
  the/DT
  more/RBR
  recent/JJ
  Galaxy/NNP
  Note/NNP
  7/CD
  ,/,
  but/CC
  which/WDT
  users/NNS
  claimed/VBD
  rendered/VBD
  (NP the/DT old/JJ model/NN)
  sluggish/JJ
  ./.
  Likewise/NNP
  ,/,
  Apple/NNP
  told/VBD
  (NP iPhone/NN)
  6/CD
  owners/NNS
  to/TO
  install/VB
  (NP an/DT operating/NN)
  (NP system/NN)
  designed/VBN
  for/IN
  (NP the/DT iPhone/NN)
  7/CD
  ,/,
  leading/VBG
  to/TO
  problems/NNS
  for/IN
  owners/NNS
  of/IN
  the/DT
  older/JJR
  (NP model/NN)
  ./.
  Both/DT
  firms/NNS
  were/VBD
  issued/VBN
  (NP the/DT maximum/JJ fine/NN)
  of/IN
  €5m/NNP
  each/DT
  and/CC
  ordered/VBD
  to/TO
  display/VB
  (NP a/DT notice/NN)
  on/IN
  their/PRP$
  Italian/JJ
  websites/NNS
  informing/VBG

In [9]:
# NOTE(review): conlltags2tree is imported but not used in the cells shown.
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples:
# B-NP starts a chunk, I-NP continues it, O is outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Samsung', 'NNP', 'O'),
 ('told', 'VBD', 'O'),
 ('owners', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('its', 'PRP$', 'O'),
 ('Galaxy', 'NNP', 'O'),
 ('Note', 'NNP', 'O'),
 ('4', 'CD', 'O'),
 ('phone', 'NN', 'B-NP'),
 ('to', 'TO', 'O'),
 ('install', 'VB', 'O'),
 ('a', 'DT', 'B-NP'),
 ('new', 'JJ', 'I-NP'),
 ('version', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('Google', 'NNP', 'O'),
 ('’', 'NNP', 'O'),
 ('s', 'VBD', 'O'),
 ('Android', 'NNP', 'O'),
 ('operating', 'VBG', 'O'),
 ('system', 'NN', 'B-NP'),
 ('intended', 'VBN', 'O'),
 ('for', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('more', 'RBR', 'O'),
 ('recent', 'JJ', 'O'),
 ('Galaxy', 'NNP', 'O'),
 ('Note', 'NNP', 'O'),
 ('7', 'CD', 'O'),
 (',', ',', 'O'),
 ('but', 'CC', 'O'),
 ('which', 'WDT', 'O'),
 ('users', 'NNS', 'O'),
 ('claimed', 'VBD', 'O'),
 ('rendered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('old', 'JJ', 'I-NP'),
 ('model', 'NN', 'I-NP'),
 ('sluggish', 'JJ', 'O'),
 ('.', '.', 'O'),
 ('Likewise', 'NNP', 'O'),
 (',', ',', 'O'),
 ('Apple', 'NNP', '

In [10]:
# Run NLTK's named-entity chunker over the POS-tagged tokens; entities appear
# as labelled subtrees (e.g. PERSON, ORGANIZATION) in the printed tree.
ne_tree = nltk.ne_chunk(t1)
print(ne_tree)

(S
  (PERSON Samsung/NNP)
  told/VBD
  owners/NNS
  of/IN
  its/PRP$
  (PERSON Galaxy/NNP Note/NNP)
  4/CD
  phone/NN
  to/TO
  install/VB
  a/DT
  new/JJ
  version/NN
  of/IN
  (PERSON Google/NNP)
  ’/NNP
  s/VBD
  (PERSON Android/NNP)
  operating/VBG
  system/NN
  intended/VBN
  for/IN
  the/DT
  more/RBR
  recent/JJ
  (PERSON Galaxy/NNP Note/NNP)
  7/CD
  ,/,
  but/CC
  which/WDT
  users/NNS
  claimed/VBD
  rendered/VBD
  the/DT
  old/JJ
  model/NN
  sluggish/JJ
  ./.
  (PERSON Likewise/NNP)
  ,/,
  (PERSON Apple/NNP)
  told/VBD
  (ORGANIZATION iPhone/NN)
  6/CD
  owners/NNS
  to/TO
  install/VB
  an/DT
  operating/NN
  system/NN
  designed/VBN
  for/IN
  the/DT
  (ORGANIZATION iPhone/NN)
  7/CD
  ,/,
  leading/VBG
  to/TO
  problems/NNS
  for/IN
  owners/NNS
  of/IN
  the/DT
  older/JJR
  model/NN
  ./.
  Both/DT
  firms/NNS
  were/VBD
  issued/VBN
  the/DT
  maximum/JJ
  fine/NN
  of/IN
  €5m/NNP
  each/DT
  and/CC
  ordered/VBD
  to/TO
  display/VB
  a/DT
  notice/NN
  on/IN
  th

In [11]:
def preprocess_v2(text):
    r"""Split ``text`` into word-character runs and part-of-speech tag them.

    Tokens are produced by an NLTK ``RegexpTokenizer`` built from the pattern
    ``\w+``, so punctuation and other non-word symbols do not appear in the
    token list.

    Args:
        text: The raw string to process.

    Returns:
        A ``(tokens, tagged)`` pair: the token list and the result of
        ``pos_tag`` applied to it.
    """
    from nltk.tokenize import RegexpTokenizer

    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return tokens, pos_tag(tokens)

In [12]:
# l2: word-only token list (no punctuation), t2: (token, POS tag) pairs.
l2, t2 = preprocess_v2(demo1)

In [13]:
l2

['Samsung',
 'told',
 'owners',
 'of',
 'its',
 'Galaxy',
 'Note',
 '4',
 'phone',
 'to',
 'install',
 'a',
 'new',
 'version',
 'of',
 'Google',
 's',
 'Android',
 'operating',
 'system',
 'intended',
 'for',
 'the',
 'more',
 'recent',
 'Galaxy',
 'Note',
 '7',
 'but',
 'which',
 'users',
 'claimed',
 'rendered',
 'the',
 'old',
 'model',
 'sluggish',
 'Likewise',
 'Apple',
 'told',
 'iPhone',
 '6',
 'owners',
 'to',
 'install',
 'an',
 'operating',
 'system',
 'designed',
 'for',
 'the',
 'iPhone',
 '7',
 'leading',
 'to',
 'problems',
 'for',
 'owners',
 'of',
 'the',
 'older',
 'model',
 'Both',
 'firms',
 'were',
 'issued',
 'the',
 'maximum',
 'fine',
 'of',
 '5m',
 'each',
 'and',
 'ordered',
 'to',
 'display',
 'a',
 'notice',
 'on',
 'their',
 'Italian',
 'websites',
 'informing',
 'customers',
 'of',
 'the',
 'watchdog',
 's',
 'decision',
 'Apple',
 'was',
 'fined',
 'an',
 'additional',
 '5m',
 'for',
 'failing',
 'to',
 'give',
 'customers',
 'clear',
 'information',
 'ab

In [14]:
t2

[('Samsung', 'NNP'),
 ('told', 'VBD'),
 ('owners', 'NNS'),
 ('of', 'IN'),
 ('its', 'PRP$'),
 ('Galaxy', 'NNP'),
 ('Note', 'NNP'),
 ('4', 'CD'),
 ('phone', 'NN'),
 ('to', 'TO'),
 ('install', 'VB'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('version', 'NN'),
 ('of', 'IN'),
 ('Google', 'NNP'),
 ('s', 'FW'),
 ('Android', 'NNP'),
 ('operating', 'VBG'),
 ('system', 'NN'),
 ('intended', 'VBN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('more', 'RBR'),
 ('recent', 'JJ'),
 ('Galaxy', 'NNP'),
 ('Note', 'NNP'),
 ('7', 'CD'),
 ('but', 'CC'),
 ('which', 'WDT'),
 ('users', 'NNS'),
 ('claimed', 'VBD'),
 ('rendered', 'VBD'),
 ('the', 'DT'),
 ('old', 'JJ'),
 ('model', 'NN'),
 ('sluggish', 'JJ'),
 ('Likewise', 'NNP'),
 ('Apple', 'NNP'),
 ('told', 'VBD'),
 ('iPhone', 'NN'),
 ('6', 'CD'),
 ('owners', 'NNS'),
 ('to', 'TO'),
 ('install', 'VB'),
 ('an', 'DT'),
 ('operating', 'NN'),
 ('system', 'NN'),
 ('designed', 'VBN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('iPhone', 'NN'),
 ('7', 'CD'),
 ('leading', 'VBG'),
 ('to', 'TO'),
 ('p

# Named Entity Recognition with spaCy

In [15]:
import spacy
from spacy import displacy
# NOTE(review): Counter is imported but not used in the cells shown.
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline; `nlp` turns a string into a Doc.
nlp = en_core_web_sm.load()

In [16]:
# Run the spaCy pipeline over the raw sample text.
# NOTE(review): demo1 starts and ends with "\n", and the output below reports
# those newlines as 'GPE' entities; consider passing demo1.strip() instead.
doc = nlp(demo1)

In [17]:
# List every detected entity span as (text, label).
pprint([(X.text, X.label_) for X in doc.ents])

[('\n', 'GPE'),
 ('Samsung', 'ORG'),
 ('Galaxy Note', 'ORG'),
 ('4', 'CARDINAL'),
 ('Google’s', 'PRODUCT'),
 ('Android', 'FAC'),
 ('Galaxy Note', 'PRODUCT'),
 ('Apple', 'ORG'),
 ('iPhone', 'ORG'),
 ('6', 'CARDINAL'),
 ('the iPhone 7', 'ORG'),
 ('€5m', 'MONEY'),
 ('Italian', 'NORP'),
 ('Apple', 'ORG'),
 ('an additional €5m', 'MONEY'),
 ('iPhones', 'GPE'),
 ('\n', 'GPE')]


In [18]:
# Per-token view: (token, IOB flag, entity type). The flag is 'B' at the start
# of an entity, 'I' inside one and 'O' outside; the type is '' when outside.
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(
, 'B', 'GPE'),
 (Samsung, 'B', 'ORG'),
 (told, 'O', ''),
 (owners, 'O', ''),
 (of, 'O', ''),
 (its, 'O', ''),
 (Galaxy, 'B', 'ORG'),
 (Note, 'I', 'ORG'),
 (4, 'B', 'CARDINAL'),
 (phone, 'O', ''),
 (to, 'O', ''),
 (install, 'O', ''),
 (a, 'O', ''),
 (new, 'O', ''),
 (version, 'O', ''),
 (of, 'O', ''),
 (Google, 'B', 'PRODUCT'),
 (’s, 'I', 'PRODUCT'),
 (Android, 'B', 'FAC'),
 (operating, 'O', ''),
 (system, 'O', ''),
 (intended, 'O', ''),
 (for, 'O', ''),
 (the, 'O', ''),
 (more, 'O', ''),
 (recent, 'O', ''),
 (Galaxy, 'B', 'PRODUCT'),
 (Note, 'I', 'PRODUCT'),
 (7, 'O', ''),
 (,, 'O', ''),
 (but, 'O', ''),
 (which, 'O', ''),
 (users, 'O', ''),
 (claimed, 'O', ''),
 (rendered, 'O', ''),
 (the, 'O', ''),
 (old, 'O', ''),
 (model, 'O', ''),
 (sluggish, 'O', ''),
 (., 'O', ''),
 (

, 'O', ''),
 (Likewise, 'O', ''),
 (,, 'O', ''),
 (Apple, 'B', 'ORG'),
 (told, 'O', ''),
 (iPhone, 'B', 'ORG'),
 (6, 'B', 'CARDINAL'),
 (owners, 'O', ''),
 (to, 'O', ''),
 (install, 'O', ''),
 (an, 'O', ''),
 (

In [19]:
# Visualize the entities found in the original text. demo1 is already a str
# and was parsed into `doc` in an earlier cell, so reuse that Doc instead of
# wrapping the text in str() and running the pipeline a second time.
displacy.render(doc, jupyter=True, style='ent')

In [21]:
# Visualize the entities found in the punctuation-free word list.
# The tokens are joined with spaces before parsing: str(l2) would hand spaCy
# the Python list repr ("['Samsung', 'told', ...]"), so brackets, quotes and
# commas would be tokenized as part of the text.
displacy.render(nlp(' '.join(l2)), jupyter=True, style='ent')