## Using Stanford NER from Dan's stan.py work, on Hansard four entity process

In [65]:
# Stanford imports
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer
# Other imports
import os
import nltk
from collections import defaultdict
# NLTK imports
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

import tkinter

import pandas as pd
import numpy as np

from collections import Counter
import operator

In [2]:
### Define paths ###

#Set core path for Stanford NLP packages
main_path = os.path.join("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\StanfordNLP\\",
                         "stanford-corenlp-full-2016-10-31\\")
# Set paths where the Standford NLP .jar files are located
pathlist = [os.path.join(main_path,"stanford-corenlp-3.7.0"),
            os.path.join(main_path,"ner\\stanford-ner.jar"),
            os.path.join(main_path,"postagger\\stanford-postagger.jar")]
###            os.path.join(main_path,"parser\\stanford-parser.jar"),
###            os.path.join(main_path,"parser\\stanford-parser-3.6.0-models.jar"),
# Set path to Stanford models
mpath = [os.path.join(main_path,"postagger\\models"),
         os.path.join(main_path,"ner\\classifiers")]
# Set path to java.exe
javapath = "C:\\Program Files\\Java\\jre1.8.0_121\\bin\\java.exe"

# Add paths to the CLASSPATH environmental variable (as instructed by NLTK)
os.environ['CLASSPATH'] = os.pathsep.join(pathlist)
os.environ['STANFORD_MODELS'] = os.pathsep.join(mpath)
os.environ['JAVAHOME'] = javapath

In [12]:
### Define functions ###

# Define function to take in a sentence and return it NER tagged
def get_tagged_sent(sample):
    # Tokenize sentence with stanford NLP
    tkn_sent = StanfordTokenizer().tokenize(sample)
    print("\nTokenized sentence:\n", tkn_sent)
    # Named entity tagging with stanford NLP
    tag_sent = StanfordNERTagger(
                'english.muc.7class.distsim.crf.ser.gz').tag(tkn_sent)
    print("\nNamed Entity tagged sentence:\n", tag_sent)
    # Return NER tagged sentence
    return tag_sent

# Define function to tag NER sentence with BIO tags
def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
    print("\nBIO chunked setence:\n", bio_tagged_sent)
    # Return BIO tagged sentence
    return bio_tagged_sent

# Define function to create tree from BIO tagged sentence
def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne
                     in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    print("\nPOS tagged tree:\n", ne_tree)
    # Return sentence in NLTK tree format
    return ne_tree

# Define function to get entities from NLTK tree
def get_ents(ne_tree):
    ne_in_sent = []
    for subtree in ne_tree:
        if type(subtree) == Tree: # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    # Return list of entity tuples
    return ne_in_sent

# Define function to turn text into NLTK trees (one tree per sentence)
def text_to_trees(sample):
    # Tokenize sentence with NLTK
    sentences = nltk.sent_tokenize(sample)
    tagged_sentences = [get_tagged_sent(sent) for sent in sentences]
    trees = [stanfordNE2tree(sent) for sent in tagged_sentences]
    # Return list of NLTK trees
    return trees

# Define function to extract entities from NLTK trees
def ents_from_trees(trees):
    ents = [get_ents(tree) for tree in trees]
    ent_list = [item for sublist in ents for item in sublist] # Flatten list
    dedupe_ent_list = set(ent_list)
    # Return list of entities
    return dedupe_ent_list

# Re-creation of nltk tree2semi_rel to look inside lists of lists
def tree2rel_list(trees):
    semi_rels = []
    semi_rel = [[], None]

    for item in trees:
        for tree in item:
            if not isinstance(tree, Tree):
                semi_rel[0].append(tree)
            else:
                semi_rel[1] = tree
                semi_rels.append(semi_rel)
                semi_rel = [[], None]
    return semi_rels

# Re-creation of nltk semi_rel2reldic to remove NER tags
def rel2reldict(pairs, window=5, trace=False):
    result = []

    while len(pairs) > 2:
        reldict = defaultdict(str)
        reldict['lcon'] = " ".join(token for token, pos in pairs[0][0][-window:])
        reldict['subjclass'] = pairs[0][1].label()
        reldict['subjtext'] = " ".join(token for token, pos in pairs[0][1].leaves())
        reldict['filler'] = " ".join(token for token, pos in pairs[1][0])
        reldict['objclass'] = pairs[1][1].label()
        reldict['objtext'] = " ".join(token for token, pos in pairs[1][1].leaves())
        reldict['rcon'] = " ".join(token for token, pos in pairs[2][0][:window])
        if trace:
            print("(%s(%s, %s)" % (reldict['untagged_filler'], reldict['subjclass'], reldict['objclass']))
        result.append(reldict)
        pairs = pairs[1:]
    return result

## Hansard Four Entity sample data

In [10]:
# Set up input parameters

file_in = 'c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Motor Neurone Disease-Gordon Aikman 2017-02-20.txt'
file_out_suffix = 'motor_neurone'
date = '20/02/2017'

# Read in file
sample = open(file_in, 'r', encoding='utf8').read()


In [33]:
sample_tagged = get_tagged_sent(sample)


Tokenized sentence:
 ['Motor', 'Neurone', 'Disease/Gordon', 'Aikman', 'Motion', 'made', ',', 'and', 'Question', 'proposed', ',', 'That', 'this', 'House', 'do', 'now', 'adjourn', '.', '--', '-LRB-', 'Chris', 'Heaton-Harris', '.', '-RRB-', '19:18:00', 'Ian', 'Murray', '-LRB-', 'Edinburgh', 'South', '-RRB-', '-LRB-', 'Lab', '-RRB-', 'I', 'am', 'very', 'grateful', 'to', 'the', 'good', 'offices', 'of', 'Mr', 'Speaker', 'and', 'you', ',', 'Madam', 'Deputy', 'Speaker', ',', 'for', 'granting', 'time', 'for', 'this', 'Adjournment', 'debate', '.', 'I', 'want', 'to', 'talk', 'about', 'the', 'dreadful', 'disease', 'that', 'is', 'motor', 'neurone', 'disease', 'and', 'to', 'pay', 'tribute', 'to', 'the', 'life', 'of', 'Gordon', 'Aikman', '.', 'I', 'wish', 'we', 'were', 'not', 'having', 'this', 'debate', ',', 'because', 'that', 'would', 'mean', 'Gordon', 'Aikman', 'was', 'still', 'with', 'us', '.', 'He', 'sadly', 'passed', 'away', 'on', '2', 'February', ',', 'aged', 'just', '31', '.', 'I', 'want', 't

In [14]:
sample_bio = stanfordNE2BIO(sample_tagged) 


BIO chunked setence:
 [('Motor', 'O'), ('Neurone', 'B-ORGANIZATION'), ('DiseaseGordon', 'I-ORGANIZATION'), ('Aikman', 'I-ORGANIZATION'), ('Motion', 'I-ORGANIZATION'), ('made', 'O'), (',', 'O'), ('and', 'O'), ('Question', 'O'), ('proposed', 'O'), (',', 'O'), ('That', 'O'), ('this', 'O'), ('House', 'B-ORGANIZATION'), ('do', 'O'), ('now', 'O'), ('adjourn', 'O'), ('.', 'O'), ('--', 'O'), ('-LRB-', 'O'), ('Chris', 'B-PERSON'), ('Heaton-Harris', 'I-PERSON'), ('.', 'O'), ('-RRB-', 'O'), ('19:18:00', 'O'), ('Ian', 'B-PERSON'), ('Murray', 'I-PERSON'), ('-LRB-', 'O'), ('Edinburgh', 'B-ORGANIZATION'), ('South', 'I-ORGANIZATION'), ('-RRB-', 'O'), ('-LRB-', 'O'), ('Lab', 'O'), ('-RRB-', 'O'), ('I', 'O'), ('am', 'O'), ('very', 'O'), ('grateful', 'O'), ('to', 'O'), ('the', 'O'), ('good', 'O'), ('offices', 'O'), ('of', 'O'), ('Mr', 'O'), ('Speaker', 'O'), ('and', 'O'), ('you', 'O'), (',', 'O'), ('Madam', 'O'), ('Deputy', 'O'), ('Speaker', 'O'), (',', 'O'), ('for', 'O'), ('granting', 'O'), ('time', 'O

In [21]:
sample_tree = stanfordNE2tree(sample_tagged)


BIO chunked setence:
 [('Motor', 'O'), ('Neurone', 'B-ORGANIZATION'), ('DiseaseGordon', 'I-ORGANIZATION'), ('Aikman', 'I-ORGANIZATION'), ('Motion', 'I-ORGANIZATION'), ('made', 'O'), (',', 'O'), ('and', 'O'), ('Question', 'O'), ('proposed', 'O'), (',', 'O'), ('That', 'O'), ('this', 'O'), ('House', 'B-ORGANIZATION'), ('do', 'O'), ('now', 'O'), ('adjourn', 'O'), ('.', 'O'), ('--', 'O'), ('-LRB-', 'O'), ('Chris', 'B-PERSON'), ('Heaton-Harris', 'I-PERSON'), ('.', 'O'), ('-RRB-', 'O'), ('19:18:00', 'O'), ('Ian', 'B-PERSON'), ('Murray', 'I-PERSON'), ('-LRB-', 'O'), ('Edinburgh', 'B-ORGANIZATION'), ('South', 'I-ORGANIZATION'), ('-RRB-', 'O'), ('-LRB-', 'O'), ('Lab', 'O'), ('-RRB-', 'O'), ('I', 'O'), ('am', 'O'), ('very', 'O'), ('grateful', 'O'), ('to', 'O'), ('the', 'O'), ('good', 'O'), ('offices', 'O'), ('of', 'O'), ('Mr', 'O'), ('Speaker', 'O'), ('and', 'O'), ('you', 'O'), (',', 'O'), ('Madam', 'O'), ('Deputy', 'O'), ('Speaker', 'O'), (',', 'O'), ('for', 'O'), ('granting', 'O'), ('time', 'O

In [None]:
# Count occurences of each entity

sample_ents = set(get_ents(sample_tree))
sample_ents

In [34]:
# Export entities version

file_out = open('four_entity_process_stanford_NER_%s.txt' %file_out_suffix,'w+')
for item in sample_ents:
    file_out.write("%s \n" %(item,))
file_out.close()

## To DO

- Wrap above into a function and run for each file
- Collate all found entities together and count up, get most commonly occurring ones
- Find how to add probability - from nltk.tag.stanford documentation?

## Full run

In [1]:
# Imports

# Stanford imports
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize.stanford import StanfordTokenizer
# Other imports
import os
import nltk
from collections import defaultdict
# NLTK imports
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

import tkinter

import pandas as pd
import numpy as np

from collections import Counter
import operator


### Define paths ###

#Set core path for Stanford NLP packages
main_path = os.path.join("C:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\StanfordNLP\\",
                         "stanford-corenlp-full-2016-10-31\\")
# Set paths where the Standford NLP .jar files are located
pathlist = [os.path.join(main_path,"stanford-corenlp-3.7.0"),
            os.path.join(main_path,"ner\\stanford-ner.jar"),
            os.path.join(main_path,"postagger\\stanford-postagger.jar")]
###            os.path.join(main_path,"parser\\stanford-parser.jar"),
###            os.path.join(main_path,"parser\\stanford-parser-3.6.0-models.jar"),
# Set path to Stanford models
mpath = [os.path.join(main_path,"postagger\\models"),
         os.path.join(main_path,"ner\\classifiers")]
# Set path to java.exe
javapath = "C:\\Program Files\\Java\\jre1.8.0_121\\bin\\java.exe"

# Add paths to the CLASSPATH environmental variable (as instructed by NLTK)
os.environ['CLASSPATH'] = os.pathsep.join(pathlist)
os.environ['STANFORD_MODELS'] = os.pathsep.join(mpath)
os.environ['JAVAHOME'] = javapath



# Define function to tag NER sentence with BIO tags
def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
    # Return BIO tagged sentence
    return bio_tagged_sent



### Main function ###

def stanford_ner_full_run(file_in, file_out_suffix):
    
    # Read in file
    sample = open(file_in, 'r', encoding='utf8').read()
    
    
    # Tokenize sentence with stanford NLP
    tkn_sent = StanfordTokenizer().tokenize(sample)

    # Named entity tagging with stanford NLP
    tag_sent = StanfordNERTagger('english.muc.7class.distsim.crf.ser.gz').tag(tkn_sent)
    
    # Apply BIO tags to the tagged sentence
    bio_tagged_sent = stanfordNE2BIO(tag_sent)
    
    # Collate BIO parts of entities together
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne
                     in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)

    
    # Get entities from the trees
    ne_in_sent = []
    for subtree in ne_tree:
        if type(subtree) == Tree: # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    
    # Output entities (with duplicates retained)
    return ne_in_sent

In [2]:
ents_mn = stanford_ner_full_run('c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Motor Neurone Disease-Gordon Aikman 2017-02-20.txt','motor_neurone')
ents_tr = stanford_ner_full_run('c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\President Trump- State Visit 2017-02-20.txt', 'trump')
ents_hi = stanford_ner_full_run('c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\High Speed Rail (London - West Midlands) Bill 2017-02-20.txt', 'highspeedrail')
ents_va = stanford_ner_full_run('c:\\Users\\rothw\\Documents\\Python Scripts\\Python NLP\\Vauxhall-Opel- Proposed Takeover 2017-02-20.txt', 'vauxhallopel')


In [18]:
# Rack all outputs together

ents_all = ents_mn + ents_tr + ents_hi + ents_va

In [19]:
# Count occurences of each entity

counts = Counter(ents_all)
counts_sorted = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
counts_sorted

[(('Trump', 'PERSON'), 85),
 (('MND', 'ORGANIZATION'), 70),
 (('United States', 'LOCATION'), 62),
 (('House', 'ORGANIZATION'), 60),
 (('Donald Trump', 'PERSON'), 58),
 (('Government', 'ORGANIZATION'), 55),
 (('Gordon', 'PERSON'), 33),
 (('Vauxhall', 'ORGANIZATION'), 31),
 (('Greg Clark', 'PERSON'), 31),
 (('America', 'LOCATION'), 30),
 (('Secretary of State', 'ORGANIZATION'), 29),
 (('PSA', 'ORGANIZATION'), 25),
 (('Scotland', 'LOCATION'), 22),
 (('Europe', 'LOCATION'), 22),
 (('NATO', 'ORGANIZATION'), 22),
 (('London', 'LOCATION'), 19),
 (('Paul Flynn', 'PERSON'), 19),
 (('Ellesmere Port', 'LOCATION'), 18),
 (('United Kingdom', 'LOCATION'), 17),
 (('Alex Salmond', 'PERSON'), 17),
 (('Britain', 'LOCATION'), 16),
 (('US', 'LOCATION'), 15),
 (('United States of America', 'LOCATION'), 15),
 (('UK', 'ORGANIZATION'), 13),
 (('Parliament', 'ORGANIZATION'), 13),
 (('GM', 'ORGANIZATION'), 13),
 (('White House', 'ORGANIZATION'), 11),
 (('Obama', 'PERSON'), 11),
 (('Administration', 'ORGANIZATIO

In [20]:
# Export

file_out = open('four_entity_process_stanford.txt','w+')

for row in counts_sorted:
    for item in row:
        file_out.write("%s, " %(item,))
    file_out.write("\n")
file_out.close()


## Conclusion

- 707 entities were found across the 4 debates that took place on 20th Feb 2017
- out of these, only 19 (15% of the total) occurred 3 or more times; only 9 (7%of the total) occurred 4 or more times
- these feel like reasonable volumes to then have an expert review and decide whether or not to add to a "known entity lookup"

## Scrap Code

In [81]:
nltk.tag.stanford?

In [21]:
len(counts_sorted)

707

In [35]:
flatten = lambda l: [item for sublist in counts_sorted for item in sublist]

In [36]:
flatten

<function __main__.<lambda>>

In [37]:
print(flatten)

<function <lambda> at 0x0000018302B95400>


In [39]:
flatten(counts_sorted)

[('MND', 'ORGANIZATION'),
 70,
 ('Gordon', 'PERSON'),
 30,
 ('Scotland', 'LOCATION'),
 12,
 ('Gordon Aikman', 'PERSON'),
 9,
 ('House', 'ORGANIZATION'),
 7,
 ('Ian Murray', 'PERSON'),
 6,
 ('this evening', 'TIME'),
 4,
 ('First Minister', 'ORGANIZATION'),
 4,
 ('# 5 million', 'MONEY'),
 4,
 ('Edinburgh South', 'ORGANIZATION'),
 3,
 ('2014', 'DATE'),
 3,
 ('Edinburgh', 'LOCATION'),
 3,
 ('Saturday', 'DATE'),
 3,
 ('NHS', 'ORGANIZATION'),
 3,
 ('England', 'LOCATION'),
 3,
 ('14 %', 'PERCENT'),
 3,
 ('MRC', 'ORGANIZATION'),
 3,
 ('NHS England', 'ORGANIZATION'),
 3,
 ('AAC', 'ORGANIZATION'),
 3,
 ('Joe', 'PERSON'),
 2,
 ('UK', 'ORGANIZATION'),
 2,
 ('Gordon', 'LOCATION'),
 2,
 ('summer', 'DATE'),
 2,
 ('# 10 million', 'MONEY'),
 2,
 ('MND Association', 'ORGANIZATION'),
 2,
 ('Euan MacDonald', 'PERSON'),
 2,
 ('US', 'LOCATION'),
 2,
 ('Government', 'ORGANIZATION'),
 2,
 ('Neurone DiseaseGordon Aikman Motion', 'ORGANIZATION'),
 1,
 ('Chris Heaton-Harris', 'PERSON'),
 1,
 ('February', 'DATE')

In [16]:
for row in ents_tr:
    if row[0].index('Trump')>0:
        print(row)

ValueError: substring not found

In [17]:
ents_tr

[('Charles Walker', 'PERSON'),
 ('Public Gallery', 'LOCATION'),
 ('House of Commons', 'ORGANIZATION'),
 ('Public Gallery', 'ORGANIZATION'),
 ('Paul Flynn', 'PERSON'),
 ('Newport West', 'LOCATION'),
 ('House', 'ORGANIZATION'),
 ('Donald Trump', 'PERSON'),
 ('Walker', 'PERSON'),
 ('Petitions Committee', 'ORGANIZATION'),
 ('Donald Trump', 'PERSON'),
 ('State Visit', 'ORGANIZATION'),
 ('U.K.', 'LOCATION'),
 ('Donald Trump', 'PERSON'),
 ('US Government', 'ORGANIZATION'),
 ('State Visit', 'ORGANIZATION'),
 ('Trump', 'PERSON'),
 ('United States', 'LOCATION'),
 ('1952', 'DATE'),
 ('Trump', 'PERSON'),
 ('United States', 'LOCATION'),
 ('United States', 'LOCATION'),
 ('Alex Salmond', 'PERSON'),
 ('Trump', 'PERSON'),
 ('Paul Flynn', 'PERSON'),
 ('Europe', 'LOCATION'),
 ('Europe', 'LOCATION'),
 ('Lithuania', 'LOCATION'),
 ('Trump', 'PERSON'),
 ('Pritchard', 'PERSON'),
 ('Trump', 'PERSON'),
 ('United Kingdom', 'LOCATION'),
 ('United States of America', 'LOCATION'),
 ('White House', 'ORGANIZATION'),
