### Effective Entity Recognition

In [1]:
import spacy
import nltk
# Load spaCy's large English pipeline once for the whole notebook; `nlp` is the
# callable that clean_and_get_entities uses to obtain spaCy's PERSON entities.
nlp = spacy.load("en_core_web_lg")

In [2]:
from nltk.tag import StanfordNERTagger
from names_dataset import NameDataset
# Stanford NER tagger, built from the classifier model and jar kept in the
# local ./stanford-ner directory (paths are relative to the notebook).
st = StanfordNERTagger(
    './stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
    './stanford-ner/stanford-ner.jar',
    encoding='utf-8',
)

# Names lookup used to check whether a word is a known first or last name.
m = NameDataset()

In [3]:
# Read the source document; every line of the file is treated as one paragraph.
# The encoding is pinned so decoding does not depend on the platform's default
# codec (the text contains non-ASCII characters, e.g. the en dash in "Dodd–Frank").
# NOTE(review): assumes the file is UTF-8, matching the encoding passed to the
# Stanford tagger — confirm.
with open("./obama_text.txt", encoding="utf-8") as afile:
    lines = afile.readlines()

# One entry per line, each still ending with its own newline character.
paragraphs = list(lines)

# readlines() keeps each line's trailing "\n", so joining with another "\n"
# leaves a blank line between paragraphs. An empty file gives text == "" and
# paragraphs == [] instead of failing on lines[0].
text = "\n".join(lines)

In [4]:
# Show the complete document exactly as it was read from obama_text.txt.
print(text)

Obama was born in Honolulu, Hawaii. After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. In 1988, he enrolled in Harvard Law School, where he was the first black president of the Harvard Law Review. After graduating, he became a civil rights attorney and an academic, teaching constitutional law at the University of Chicago Law School from 1992 to 2004. He represented the 13th district for three terms in the Illinois Senate from 1997 until 2004, when he ran for the U.S. Senate. He received national attention in 2004 with his March primary win, his well-received July Democratic National Convention keynote address, and his landslide November election to the Senate. In 2008, he was nominated for president a year after his campaign began, after a close primary campaign against Hillary Clinton. He was elected over Republican John McCain and was inaugurated on January 20, 2009. Nine months later, he was named the 2009 Nobel Peace Prize laureate.

R

In [7]:
# Tokenise the whole document once, then label that same token list two ways
# so the two results line up index-for-index.
nltk_tokens = nltk.word_tokenize(text)
nltk_pos = nltk.pos_tag(nltk_tokens)   # (token, part-of-speech tag) pairs
stanford_text = st.tag(nltk_tokens)    # (token, Stanford NER label) pairs

In [8]:
# Number of (token, POS tag) pairs produced for the whole document.
len(nltk_pos)

719

In [15]:
def clean_and_get_entities(text):
    """Tokenise ``text`` and tag the person names that survive cross-validation.

    The Stanford NER tagger is treated as the source of candidate entities
    (the working assumption is that it captures every valid one). Each run of
    consecutive PERSON tokens is merged into a single lower-cased name, which
    is then validated to cut down on false positives:

    1. it is accepted if spaCy also reported exactly that string as a PERSON;
    2. otherwise it is accepted only if every word in it is found as a first
       name or a last name in the names dataset.

    Relies on the notebook-level objects ``nlp`` (spaCy pipeline), ``st``
    (Stanford tagger), ``m`` (names dataset) and the ``nltk`` module.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    list of tuple(str, str)
        ``(lower-cased token, tag)`` pairs in document order. Tokens that the
        Stanford tagger does not label PERSON carry their NLTK part-of-speech
        tag. A validated name is one entry whose tag is the POS tag of its
        first word followed by ``" PERSON"``.

    Raises
    ------
    AssertionError
        If the Stanford tagger and the NLTK POS tagger return a different
        number of tokens.

    Notes
    -----
    A PERSON candidate that fails both validation steps is left out of the
    result entirely; it is not emitted as an ordinary token.
    """
    # People spaCy found, lower-cased so they compare against the merged,
    # lower-cased Stanford names. Only membership is needed, hence a set.
    spacy_entities = {
        ent.text.lower() for ent in nlp(text).ents if ent.label_ == "PERSON"
    }

    # Both taggers run over the same token list so index i refers to the same
    # token in each result.
    nltk_tokens = nltk.word_tokenize(text)
    stanford_text = st.tag(nltk_tokens)
    nltk_pos = nltk.pos_tag(nltk_tokens)
    assert len(nltk_pos) == len(stanford_text)

    token_count = len(stanford_text)
    final_tokens = []  # (token, POS) pairs, plus " PERSON" on validated names
    i = 0
    while i < token_count:
        word = stanford_text[i][0].lower()
        classification = stanford_text[i][1]
        part_of_speech = nltk_pos[i][1]

        if classification != 'PERSON':  # not a person candidate: pass through
            final_tokens.append((word, part_of_speech))
            i += 1
            continue

        # Absorb the following PERSON tokens into one name. The bounds check
        # matters when the name is the very last thing in the text: without
        # it the look-ahead indexes past the end of the list.
        names = [word]
        while i + 1 < token_count and stanford_text[i + 1][1] == 'PERSON':
            i += 1
            names.append(stanford_text[i][0].lower())
        full_name = " ".join(names)

        # Second opinion: spaCy agreeing on the full name is enough; failing
        # that, every part must be a known first or last name. The names
        # dataset is only consulted when spaCy did not confirm the name.
        if full_name in spacy_entities or all(
            m.search_first_name(part) or m.search_last_name(part)
            for part in names
        ):
            final_tokens.append((full_name, part_of_speech + " PERSON"))
        i += 1

    return final_tokens

#### demo of using this code

In [16]:
# Run the validator on the second line of the file (paragraphs[1]).
final_tokens = clean_and_get_entities(paragraphs[1])

In [17]:
# Display the (token, tag) pairs; validated person names have " PERSON"
# appended to their tag.
final_tokens

[('regarded', 'VBN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('centrist', 'JJ'),
 ('new', 'NNP'),
 ('democrat', 'NNP'),
 (',', ','),
 ('obama', 'NNP PERSON'),
 ('signed', 'VBD'),
 ('many', 'JJ'),
 ('landmark', 'NN'),
 ('bills', 'NNS'),
 ('into', 'IN'),
 ('law', 'NN'),
 ('during', 'IN'),
 ('his', 'PRP$'),
 ('first', 'JJ'),
 ('two', 'CD'),
 ('years', 'NNS'),
 ('in', 'IN'),
 ('office', 'NN'),
 ('.', '.'),
 ('the', 'DT'),
 ('main', 'JJ'),
 ('reforms', 'NNS'),
 ('that', 'WDT'),
 ('were', 'VBD'),
 ('passed', 'VBN'),
 ('include', 'VBP'),
 ('the', 'DT'),
 ('patient', 'NNP'),
 ('protection', 'NNP'),
 ('and', 'CC'),
 ('affordable', 'NNP'),
 ('care', 'NNP'),
 ('act', 'NNP'),
 ('(', '('),
 ('commonly', 'RB'),
 ('referred', 'VBN'),
 ('to', 'TO'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('``', '``'),
 ('affordable', 'NNP'),
 ('care', 'NNP'),
 ('act', 'NNP'),
 ("''", "''"),
 ('or', 'CC'),
 ('``', '``'),
 ('obamacare', 'NNP'),
 ("''", "''"),
 (')', ')'),
 (',', ','),
 ('the', 'DT'),
 ('dodd–frank', 'NNP'),
 ('wall', '

#### demo of using the names dataset

In [35]:
# Build a names lookup (this re-creates the `m` already constructed in cell 2).
m = NameDataset()
# Only the value of a cell's last expression is displayed, so the result of
# this first lookup is computed and then discarded.
m.search_first_name('mikael')
m.search_last_name('Remy')

True

In [41]:
# different approach for using the names database:
# look every proper noun up both as a first name and as a last name.
# NOTE(review): `nlp_paragraph` is not defined in any cell shown here — presumably
# a spaCy Doc such as nlp(paragraphs[0]); confirm it is created before this cell.
for token in nlp_paragraph:
    # Loop-local names are kept distinct from the notebook-level `text`, which
    # holds the whole document and is read again when earlier cells are re-run.
    token_pos = token.pos_
    token_text = token.text
    if token_pos == "PROPN":
        print(token_text, token_pos, m.search_first_name(token_text), m.search_last_name(token_text))

Obama PROPN True False
Honolulu PROPN False False
Hawaii PROPN False False
Columbia PROPN False False
University PROPN False False
Chicago PROPN False False
Harvard PROPN False False
Law PROPN False False
School PROPN False False
Harvard PROPN False False
Law PROPN False False
Review PROPN False False
University PROPN False False
Chicago PROPN False False
Law PROPN False False
School PROPN False False
Illinois PROPN False False
Senate PROPN False False
U.S. PROPN False False
Senate PROPN False False
March PROPN False False
July PROPN False False
Democratic PROPN False False
National PROPN False False
Convention PROPN False False
November PROPN False False
Senate PROPN False False
Hillary PROPN True True
Clinton PROPN True True
Republican PROPN False False
John PROPN True True
McCain PROPN True True
January PROPN False False
Nobel PROPN True True
Peace PROPN False False
Prize PROPN False False
