## NER - Named Entity Recognition

In [1]:
#import nltk
#nltk.download('popular', halt_on_error=False)
#nltk.download('all', halt_on_error=False)

In [2]:
import nltk as nltk
import nltk.corpus  
from nltk.text import Text
import pandas as pd
import re
import sys

In [3]:
print(sys.version)

3.5.4 |Anaconda custom (64-bit)| (default, Aug 14 2017, 13:41:13) [MSC v.1900 64 bit (AMD64)]


## NLTK-based for NER

In [4]:
text = "Surging Chinese demand and an improving U.S. economy have lifted sales of Caterpillar's signature yellow mining and construction machines. Now, with the pace of growth quickening in Latin America and Europe, the company is projecting higher earnings for 2018 than analysts estimated.  The outlook from Caterpillar, considered an economic bellwether, comes as industries from manufacturing to services report increased sales and orders that have fueled record equity prices and buoyed investor expectations for this year. This week, the International Monetary Fund raised its estimate for 2018 global growth to the fastest in seven years.  Caterpillar's results showed strength across the board in nearly every industry for the first time, which indicated coordinated and synchronized macroeconomic growth, Larry De Maria, an analyst at William Blair & Co., said in an interview. It's a good harbinger for overall economic activity."

### Basic NER: tagging words (tokens) as "NE"

In [5]:
# NLTK chunked_sentences is a tree structure, or list of lists.  We have to traverse it to get the values

entities = []
labels = []
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)), binary = True):
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
        labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

#Binary=True means just tag entities as NE 
#Binary=False give us PERSON, ORGANIZATION, and GPE (Geo-political Entity) 

In [6]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,U.S.,NE
1,Larry De Maria,NE
2,Europe,NE
3,William Blair,NE
4,International Monetary Fund,NE
5,Chinese,NE
6,Latin America,NE
7,Caterpillar,NE


### Basic NER: tagging words (tokens) as PERSON, ORGANIZATION, and GPE

In [7]:
entities = []
labels = []
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)), binary = False):
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
        labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

In [8]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,Caterpillar,PERSON
1,Europe,GPE
2,International Monetary Fund,ORGANIZATION
3,William Blair,PERSON
4,Larry De Maria,PERSON
5,U.S.,GPE
6,Caterpillar,GPE
7,Latin America,GPE
8,Chinese,GPE


### Alternative NER, separating by sentenses first, then by tokens

In [9]:
entities = []
labels = []

for sent in nltk.sent_tokenize(text):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = False):
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
            labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

In [10]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,Europe,GPE
1,International Monetary Fund,ORGANIZATION
2,William Blair,PERSON
3,Larry De Maria,PERSON
4,U.S.,GPE
5,Caterpillar,GPE
6,Latin America,GPE
7,Chinese,GPE


## Compare NER Results between Sentense vs. Word Tokenization

In [11]:
#http://www.chicagotribune.com/business/ct-caterpillar-earnings-20180125-story.html
directory = 'C://Users//Nick//Documents//Teaching//Data Projects//Text//News Articles//'
article = 'News_1.txt'

In [12]:
f = open(directory+article, encoding="utf8")
text = f.read()

### Tagging word tokens
Shallow parsing (also chunking or light parsing) is an analysis of a sentence which first identifies constituent parts of sentences (nouns, verbs, adjectives, etc.) and then links them to higher order units that have discrete grammatical meanings (noun groups or phrases, verb groups, etc.). Wikipedia 

In [13]:
entities = []
labels = []
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)), binary = False):
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
        labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

In [14]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df.head(20)

Unnamed: 0,Entities,Labels
0,Joe,PERSON
1,Europe,GPE
2,Caterpillar Caterpillar,ORGANIZATION
3,Gene,ORGANIZATION
4,China,GPE
5,U.S.,GPE
6,William Blair,PERSON
7,Dow Jones Industrial,ORGANIZATION
8,Caterpillar,PERSON
9,Jim Umpleby,PERSON


### Sentense split, then tagging word tokens

In [15]:
entities = []
labels = []

for sent in nltk.sent_tokenize(text):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = False):
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
            labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

In [16]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df.head(20)

Unnamed: 0,Entities,Labels
0,Joe,PERSON
1,Europe,GPE
2,Caterpillar Caterpillar,ORGANIZATION
3,Gene,ORGANIZATION
4,China,GPE
5,U.S.,GPE
6,William Blair,PERSON
7,Dow Jones Industrial,ORGANIZATION
8,Caterpillar,PERSON
9,Jim Umpleby,PERSON


## Leveraging more powerful NLP packages, such as Stanford NLP to improve NER

### Installing and configuring Stanford NLP
https://blog.manash.me/configuring-stanford-parser-and-stanford-ner-tagger-with-nltk-in-python-on-windows-f685483c374a

In [17]:
import os
java_path = "C:\ProgramData\Oracle\Java\javapath\java.exe"
os.environ['JAVAHOME'] = java_path

# Change the path according to your system
stanford_classifier = 'C:\StanfordNLP\stanford-ner-2017-06-09\classifiers\english.all.3class.distsim.crf.ser.gz'
stanford_ner_path = 'C:\StanfordNLP\stanford-ner-2017-06-09\stanford-ner.jar'

from nltk.tag.stanford import StanfordNERTagger

# Creating Tagger Object
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')

In [18]:
tokenized_text = nltk.word_tokenize(text)
classified_text = st.tag(tokenized_text)

classified_text_df = pd.DataFrame(classified_text)

classified_text_df.drop_duplicates(keep='first', inplace=True)
classified_text_df.reset_index(drop=True, inplace=True)
classified_text_df.columns = ["Entities", "Labels"]

In [19]:
classified_text_df.groupby("Labels").count()

Unnamed: 0_level_0,Entities
Labels,Unnamed: 1_level_1
LOCATION,8
O,235
ORGANIZATION,11
PERSON,10


In [20]:
entities_df = classified_text_df.loc[classified_text_df["Labels"].isin(['LOCATION','ORGANIZATION','PERSON'])]
entities_df.reset_index(drop=True, inplace=True)
entities_df.head(20)

Unnamed: 0,Entities,Labels
0,Caterpillar,ORGANIZATION
1,Murrysville,LOCATION
2,Pa.,LOCATION
3,",",ORGANIZATION
4,Inc.,ORGANIZATION
5,Gene,PERSON
6,J.,PERSON
7,Puskar,PERSON
8,U.S.,LOCATION
9,Latin,LOCATION


### StanfordNERTagger does not have native capabilities to support multi-word NER
Therefore we will have to build them by hand

In [21]:
tokenized_text = nltk.word_tokenize(text)
classified_text = st.tag(tokenized_text)

netagged_words = classified_text

entities = []
labels = []

from itertools import groupby
for tag, chunk in groupby(classified_text, lambda x:x[1]):
    if tag != "O":
        #print("%-12s"%tag, " ".join(w for w, t in chunk))
        entities.append(' '.join(w for w, t in chunk))
        labels.append(tag)
        
        
entities_all = list(zip(entities, labels))
entities_unique = list(set(zip(entities, labels))) #unique entities   

In [22]:
entities_df = pd.DataFrame(entities_unique)
entities_df.columns = ["Entities", "Labels"]
entities_df.groupby('Labels').count()

Unnamed: 0_level_0,Entities
Labels,Unnamed: 1_level_1
LOCATION,7
ORGANIZATION,6
PERSON,5


In [23]:
entities_df = pd.DataFrame(entities_all)
entities_df.columns = ["Entities", "Labels"]
persons_df = entities_df.loc[entities_df["Labels"].isin(['LOCATION','ORGANIZATION','PERSON'])]
counts_df = persons_df.groupby('Entities').count()
counts_df.rename(columns={"Labels": "Mentions"}, inplace=True)
counts_df.sort_values(by=['Mentions'], ascending=False).head(20)

Unnamed: 0_level_0,Mentions
Entities,Unnamed: 1_level_1
Caterpillar,11
U.S.,2
Latin America,2
Bloomberg,1
William Blair,1
Pa.,1
North America,1
Murrysville,1
Larry De Maria,1
Jim Umpleby,1
