## Non-Stanford version of ents_from_trees

Need to:
- 1) tokenise sentence
- 2) apply BIO or IO tags
- 3) create tree
- 4) get ents from the tree

N.B. Use `nltk.ne_chunk()`. It doesn't use the Stanford recognizer, but it does chunk entities (it's a wrapper around an IOB named-entity tagger).

In [1]:
import nltk

In [88]:
# Opens NLTK's interactive downloader; in the session shown below the "words" package was fetched.
# NOTE(review): this waits for keyboard input, so a Run All stalls here - presumably
# nltk.download("words") would fetch the same package non-interactively; confirm before changing.
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> words
    Downloading package words to /home/chronos/user/nltk_data...
      Unzipping corpora/words.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [4]:
# Two-sentence sample text that the rest of the notebook tokenises, tags and chunks.
sample = "Mr David Gauke is the Member of Parliament for South West Hertfordshire. David Gauke is a member of the Conservative Party."
sample

'Mr David Gauke is the Member of Parliament for South West Hertfordshire. David Gauke is a member of the Conservative Party.'

In [8]:
# Step 1a: split the sample text into a list of sentence strings.
sentences = nltk.sent_tokenize(sample)
sentences

['Mr David Gauke is the Member of Parliament for South West Hertfordshire.',
 'David Gauke is a member of the Conservative Party.']

In [9]:
# Step 1b: word-tokenise every sentence; `sentences` is rebound to a list of token lists.
sentences = list(map(nltk.word_tokenize, sentences))
sentences

[['Mr',
  'David',
  'Gauke',
  'is',
  'the',
  'Member',
  'of',
  'Parliament',
  'for',
  'South',
  'West',
  'Hertfordshire',
  '.'],
 ['David',
  'Gauke',
  'is',
  'a',
  'member',
  'of',
  'the',
  'Conservative',
  'Party',
  '.']]

In [12]:
# Step 2: POS-tag every token list; `sentences` is rebound to lists of (token, tag) pairs.
sentences = list(map(nltk.pos_tag, sentences))
sentences

[[('Mr', 'NNP'),
  ('David', 'NNP'),
  ('Gauke', 'NNP'),
  ('is', 'VBZ'),
  ('the', 'DT'),
  ('Member', 'NNP'),
  ('of', 'IN'),
  ('Parliament', 'NNP'),
  ('for', 'IN'),
  ('South', 'NNP'),
  ('West', 'NNP'),
  ('Hertfordshire', 'NNP'),
  ('.', '.')],
 [('David', 'NNP'),
  ('Gauke', 'NNP'),
  ('is', 'VBZ'),
  ('a', 'DT'),
  ('member', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('Conservative', 'NNP'),
  ('Party', 'NNP'),
  ('.', '.')]]

In [15]:
# First attempt at a chunk grammar: an optional determiner, any number of
# adjectives, then a common noun tagged NN.
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)

# Parse and print each tagged sentence. The rule has no NNP pattern, so in the
# output below only "a member" ends up inside an NP chunk.
for sent in sentences:
    result = cp.parse(sent)
    print(result)


(S
  Mr/NNP
  David/NNP
  Gauke/NNP
  is/VBZ
  the/DT
  Member/NNP
  of/IN
  Parliament/NNP
  for/IN
  South/NNP
  West/NNP
  Hertfordshire/NNP
  ./.)
(S
  David/NNP
  Gauke/NNP
  is/VBZ
  (NP a/DT member/NN)
  of/IN
  the/DT
  Conservative/NNP
  Party/NNP
  ./.)


In [61]:
# Chunk grammar with two NP rules. nltk.pos_tag emits Penn Treebank tags, where a
# possessive pronoun is PRP$; the Brown-style PP$ is kept as an alternative so
# input tagged with either tagset is chunked.
grammar = r"""
  NP: {<DT|PP\$|PRP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
      {<NNP>+}                      # chunk sequences of proper nouns
"""
cp = nltk.RegexpParser(grammar)

# One chunk tree per sentence, in sentence order.
parsed_trees=[]

# NOTE: `sent` and `result` remain bound to the last sentence after this loop.
for sent in sentences:
    result = cp.parse(sent)
    print("\nSentence:\n")
    print(result)
    parsed_trees.append(result)

print("\nFinal Tree:\n")
print(parsed_trees)


Sentence:

(S
  (NP Mr/NNP David/NNP Gauke/NNP)
  is/VBZ
  the/DT
  (NP Member/NNP)
  of/IN
  (NP Parliament/NNP)
  for/IN
  (NP South/NNP West/NNP Hertfordshire/NNP)
  ./.)

Sentence:

(S
  (NP David/NNP Gauke/NNP)
  is/VBZ
  (NP a/DT member/NN)
  of/IN
  the/DT
  (NP Conservative/NNP Party/NNP)
  ./.)

Final Tree:

[Tree('S', [Tree('NP', [('Mr', 'NNP'), ('David', 'NNP'), ('Gauke', 'NNP')]), ('is', 'VBZ'), ('the', 'DT'), Tree('NP', [('Member', 'NNP')]), ('of', 'IN'), Tree('NP', [('Parliament', 'NNP')]), ('for', 'IN'), Tree('NP', [('South', 'NNP'), ('West', 'NNP'), ('Hertfordshire', 'NNP')]), ('.', '.')]), Tree('S', [Tree('NP', [('David', 'NNP'), ('Gauke', 'NNP')]), ('is', 'VBZ'), Tree('NP', [('a', 'DT'), ('member', 'NN')]), ('of', 'IN'), ('the', 'DT'), Tree('NP', [('Conservative', 'NNP'), ('Party', 'NNP')]), ('.', '.')])]


In [63]:
# Do for 1 sentence only: parse the last sentence explicitly rather than relying
# on a `sent` variable left over from an earlier loop.

result = cp.parse(sentences[-1])
print(result)


(S
  (NP David/NNP Gauke/NNP)
  is/VBZ
  (NP a/DT member/NN)
  of/IN
  the/DT
  (NP Conservative/NNP Party/NNP)
  ./.)


In [66]:
# Check what kind of object a single parsed sentence is.
type(result)

nltk.tree.Tree

In [51]:
# parsed_trees is the container that the per-sentence trees were appended to.
type(parsed_trees)

list

In [52]:
# Number of direct children of the sentence tree: NP chunks plus unchunked tokens.
len(result)

7

In [54]:
# Label of the root node of the sentence tree (the whole sentence, not an individual chunk).
ne_label = result.label()
ne_label

'S'

In [48]:
# leaves() yields the (token, POS tag) pairs of the sentence in order, without the chunk structure.
for i in result.leaves():
    print(i)

('David', 'NNP')
('Gauke', 'NNP')
('is', 'VBZ')
('a', 'DT')
('member', 'NN')
('of', 'IN')
('the', 'DT')
('Conservative', 'NNP')
('Party', 'NNP')
('.', '.')


In [67]:
# Do for 1 sentence
# `result` is the whole-sentence tree, so this check passes for the sentence itself
# (not for an individual chunk): every token is joined into one string and paired
# with the root label.

ne_in_sent = []

if isinstance(result, nltk.tree.Tree):
    ne_string = " ".join([token for token, pos in result.leaves()])
    # Read the label from `result` directly instead of a variable set in another cell.
    ne_in_sent.append((ne_string, result.label()))

ne_in_sent

[('David Gauke is a member of the Conservative Party .', 'S')]

In [69]:
def get_ents(ne_tree):
    """Collect the chunks that are direct children of a parsed sentence tree.

    Args:
        ne_tree: Iterable of tree children, typically an ``nltk.tree.Tree`` for
            one sentence, whose chunked spans are sub-``Tree`` nodes and whose
            unchunked tokens are plain ``(token, pos)`` tuples.

    Returns:
        A list of ``(chunk_text, chunk_label)`` tuples in sentence order, where
        ``chunk_text`` is the chunk's tokens joined by single spaces. The list
        is empty when there are no chunks.
    """
    ne_in_sent = []
    for subtree in ne_tree:
        # isinstance (rather than an exact type comparison) also accepts Tree
        # subclasses; bare (token, pos) tuples are unchunked tokens and are skipped.
        if isinstance(subtree, nltk.tree.Tree):
            ne_string = " ".join(token for token, pos in subtree.leaves())
            ne_in_sent.append((ne_string, subtree.label()))
    # Return list of entity tuples
    return ne_in_sent

# Extract the (text, label) pairs for the chunks of the single parsed sentence.
get_ents(result)

[('David Gauke', 'NP'), ('a member', 'NP'), ('Conservative Party', 'NP')]

In [71]:
# Apply get_ents to each parsed sentence: one list of (text, label) pairs per tree.
ents = list(map(get_ents, parsed_trees))
ents

[[('Mr David Gauke', 'NP'),
  ('Member', 'NP'),
  ('Parliament', 'NP'),
  ('South West Hertfordshire', 'NP')],
 [('David Gauke', 'NP'), ('a member', 'NP'), ('Conservative Party', 'NP')]]

In [72]:
# ents is a plain list with one inner list of entity tuples per sentence.
type(ents)

list

In [74]:
import pandas

# One row per sentence and one column per chunk position; each cell holds a
# (text, label) tuple, and rows for sentences with fewer chunks are left empty
# in the trailing columns (see the output below).
ents_df = pandas.DataFrame(ents)

ents_df

Unnamed: 0,0,1,2,3
0,"(Mr David Gauke, NP)","(Member, NP)","(Parliament, NP)","(South West Hertfordshire, NP)"
1,"(David Gauke, NP)","(a member, NP)","(Conservative Party, NP)",
