# NER Using 3 Models and Rules-based

In [1]:
# others libraries
import pandas as pd
from pprint import pprint
import re
import json

In [2]:
# NLTK and Stanford libraries
import nltk, re, os
import nltk.corpus
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.probability import FreqDist
from nltk.tag.stanford import StanfordNERTagger
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk import RegexpParser
from nltk.chunk.api import ChunkParserI

In [3]:
# spaCy libraries
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

***

## Reading json file and storing in data frame

In [476]:
def read_load(path):
    # reading json file
    with open(path, 'r') as json_file:
        f = json.load(json_file)
    data = f
    
    # Collecting index of word, word, start time, and end time
    df = pd.DataFrame({'count': ([X for X in range(len(data['values']['word']))]),
                       'word': data['values']['word'], 'start_time': data['values']['start'],
                       'end_time': data['values']['end']})
    
    return data, df

***

## Stanford NER Tagger

It has 3 models

* 3 classes model for recognizing locations, person, and organizations
* 4 classes model for recognizing locations, person, organizations, and miscellaneous entities
* 7 classes model for recognizing locations, person, organizations, times, money, percents, and dates

In this project, we use 7 classes model

In [279]:
def stanford_tagger(document):
    lst_word = []
    lst_ne = []
    lst_count = []

    java_path = ("C:/Program Files/Java/jdk-15.0.1/bin/java.exe")
    os.environ['JAVAHOME'] = java_path
    jar = ('D:/Program/stanford-ner-4.0.0/stanford-ner.jar')
    model = ('D:/Program/stanford-ner-4.0.0/classifiers/english.muc.7class.distsim.crf.ser') # 7 classes
    st = StanfordNERTagger(model, jar, encoding = 'utf-8')
    
    # word_token = word_tokenize(document)
    classified_text = st.tag(document['word'])
    
    for i in range(len(classified_text)):
        if str(classified_text[i][1]) != 'O':
            lst_word.append(str(classified_text[i][0]))
            lst_ne.append(str(classified_text[i][1]))
            lst_count.append(i)
    st_df = pd.DataFrame({'count': lst_count,'word': lst_word, 'stanford_ne': lst_ne})

    return st_df

In [280]:
st_df = stanford_tagger(df)
st_df

Unnamed: 0,count,word,stanford_ne
0,8,Sarah,PERSON
1,9,speaking.,PERSON
2,49,December,DATE
3,50,2019,DATE
4,72,Amy,PERSON
5,73,Gough,PERSON
6,74,Gough,PERSON


***

## NLTK

**NLTK recognizes the following entities:**
* ORGANIZATION - Georgia-Pacific Corp., WHO
* PERSON - Eddy Bonte, President Obama
* LOCATION - Murray River, Mount Everest
* DATE - June, 2008-06-29
* TIME - two fifty a m, 1:30 p.m.
* MONEY - 175 million Canadian Dollars, GBP 10.40
* PERCENT - twenty pct, 18.75 %
* FACILITY - Washington Monument, Stonehenge
* GPE - South East Asia, Midlothian

In [313]:
def nltk_tagger(document):
    lst_word = []
    lst_ne = []
    lst_count = []
    
    # word_token = word_tokenize(document)
    tagged_words = pos_tag(document['word'])
    ne_tagged = ne_chunk(tagged_words, binary = False) # False for details NE type
    # ne_tagged.draw()
    
    for chunk in range(len(ne_tagged)):
        if hasattr(ne_tagged[chunk], 'label'):
            lst_word.append(str(ne_tagged[chunk][0][0]))
            lst_ne.append(str(ne_tagged[chunk].label()))
            lst_count.append(chunk)
        if not hasattr(ne_tagged[chunk], 'label'):
            if str(ne_tagged[chunk][0:][1:][0]) == 'CD':
                lst_word.append(str(ne_tagged[chunk][0]))
                lst_ne.append(str(ne_tagged[chunk][0:][1:][0]))
                lst_count.append(chunk)
            
    nltk_df = pd.DataFrame({'count': lst_count, 'word': lst_word, 'nltk_ne': lst_ne})
    
    return nltk_df

In [315]:
nltk_df = nltk_tagger(df)
nltk_df

Unnamed: 0,count,word,nltk_ne
0,8,Sarah,PERSON
1,15,Hey,PERSON
2,47,2nd,CD
3,50,2019,CD
4,66,Amy,PERSON
5,72,Amy,PERSON
6,89,Mega,ORGANIZATION
7,130,Yes,PERSON


***

## spaCy

**spaCy recognizes the following entities:**
* PERSON - People, including fictional.
* NORP - Nationalities or religious or political groups.
* FAC - Buildings, airports, highways, bridges, etc.
* ORG - Companies, agencies, institutions, etc.
* GPE - Countries, cities, states.
* LOC - Non-GPE locations, mountain ranges, bodies of water.
* PRODUCT - Objects, vehicles, foods, etc. (Not services.)
* EVENT - Named hurricanes, battles, wars, sports events, etc.
* WORK_OF_ART - Titles of books, songs, etc.
* LAW - Named documents made into laws.
* LANGUAGE - Any named language.
* DATE - Absolute or relative dates or periods.
* TIME - Times smaller than a day.
* PERCENT - Percentage, including ”%“.
* MONEY - Monetary values, including unit.
* QUANTITY - Measurements, as of weight or distance.
* ORDINAL - “first”, “second”, etc.
* CARDINAL - Numerals that do not fall under another type.

In [344]:
def clean_text(text):
    # replace . and a space with only a space
    text = text.replace(".", "").replace(",", "").replace("?", "").replace("$", "").replace("\'", "")
    # get rid of the . at the end of each line. 
    cleaned_text = re.sub("\.$", "", text)
    
    return cleaned_text

def spaCy_tagger(document):
    new_words = []
    new_tokens = []
    count = []
    
    nlp = en_core_web_sm.load()
    doc = nlp(clean_text(document))
    
    for token in range(len(doc)):
        if doc[token].ent_type_ != '':
            if doc[token].ent_type_ == 'DATE':
                if (doc[token].text != 'a') and (doc[token].text != 'good') and (doc[token].text != 'day'):
                    new_words.append(doc[token].text)
                    new_tokens.append(doc[token].ent_type_)
                    count.append(token)
            elif doc[token].ent_type_ == 'GPE' or doc[token].ent_type_ == 'LOC':
                new_words.append(doc[token].text)
                new_tokens.append('LOCATION')
                count.append(token)
            elif doc[token].ent_type_ == 'ORG':
                new_words.append(doc[token].text)
                new_tokens.append('ORGANIZATION')
                count.append(token)
            else:
                new_words.append(doc[token].text)
                new_tokens.append(doc[token].ent_type_)
                count.append(token)
     
    spc_df = pd.DataFrame({'count': count, 'word': new_words, 'spacy_ne': new_tokens})
    
    return spc_df

In [345]:
spc_df = spaCy_tagger(data['transcript'])
spc_df

Unnamed: 0,count,word,spacy_ne
0,8,Sarah,PERSON
1,46,the,DATE
2,47,2nd,DATE
3,48,of,DATE
4,49,December,DATE
5,50,2019,DATE
6,66,Amy,PERSON
7,72,Amy,PERSON
8,73,Gough,PERSON
9,74,Gough,PERSON


***

In [347]:
df.to_csv('D:/DSBA/Project/Final-Project-2/data/Text files/word-time.csv')

In [346]:
spc_df.to_csv('D:/DSBA/Project/Final-Project-2/data/Text files/spacy-ner-tagger.csv')

***

In [66]:
sentence = """Hello, you have called Virtual bank, this is Linda speaking. How may I help you?
Hi Linda. I was just at your Ville branch and I think I left my Debit card in the ATM machine.
Okay. Do you have your Debit card number?
I don’t have.
Okay, well do you have the checking account number associated with the Debit
card? 
That I do have. Are you ready? I will give you what I have got. 765456789. 
Okay. That’s 765456789.
Correct.
What is your identification number?
7745896589665.
Okay, I have 7745896589665 and what is your name sir? 
It is Robert Applebaum.
Okay. I have Robert Applebaum.
Yes.
And what is your date of birth Mr. Applebaum?
July 7th, 1974. 
Okay. July 7th, 1974.
Yes.
And your phone number?
It is 6102651715. 
Okay. I have 6102651715.
Yes.
Okay Mr. Applebaum. I have just suspended your card. If it is in the machine, we will contact you and lift the suspension. 
Oh, thank you.
Sure. Thank you."""

In [64]:
wlst = []
nelst = []

for i in range(0, len(tokenized_sent)):
    if re.search('phone number', tokenized_sent[i]):
        if re.search('([0-9]|zero|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+1]) or re.search(r'(\bone\b)+', tokenized_sent[i+1]):
            wlst.append(str(tokenized_sent[i+1]))
            nelst.append('PHONENUM')
        elif re.search('([0-9]|zero|one|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+2])  or re.search(r'(\bone\b)+', tokenized_sent[i+2]):
            wlst.append(str(tokenized_sent[i+2]))
            nelst.append('PHONENUM')
    if re.search('account number', tokenized_sent[i]):
        if re.search('([0-9]|zero|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+1]) or re.search(r'(\bone\b)+', tokenized_sent[i+1]):
            wlst.append(str(tokenized_sent[i+1]))
            nelst.append('ACCNUM')
        elif re.search('([0-9]|zero|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+2]) or re.search(r'(\bone\b)+', tokenized_sent[i+2]):
            wlst.append(str(tokenized_sent[i+2]))
            nelst.append('ACCNUM')
    if re.search('(identify number|identification number)', tokenized_sent[i]):
        if re.search('([0-9]|zero|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+1]) or re.search(r'(\bone\b)+', tokenized_sent[i+1]):
            wlst.append(str(tokenized_sent[i+1]))
            nelst.append('IDCARD')
        elif re.search('([0-9]|zero|one|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+2]) or re.search(r'(\bone\b)+', tokenized_sent[i+2]):
            wlst.append(str(tokenized_sent[i+2]))
            nelst.append('IDCARD')

pd.DataFrame({'sent': wlst, 'ne': nelst})

Unnamed: 0,sent,ne
0,"I didn't use my old phone number anymore, but ...",PHONENUM
1,That's one one one five four four two two two?,ACCNUM
2,My identification number is 1 1 0 2 5 6 9 8 5 ...,IDCARD
3,Five four two 9800 11-2.,IDCARD
4,My phone number is seven.,PHONENUM
5,Seven one nine eight five five five eight seven.,PHONENUM


***

## Models Evaluation

In [478]:
data, df = read_load('D:/DSBA/Project/Final-Project-2/Nancy-Sandra.json')

In [480]:
df.to_csv('D:/DSBA/Project/Final-Project-2/data/Text files/word-time1.csv')

In [540]:
ref_label = pd.read_csv('D:/DSBA/Project/Final-Project-2/data/Text files/ref-nancy-sandra.csv')
ref_label.head(10)

Unnamed: 0,word,label
0,"Hello,",O
1,you,O
2,have,O
3,called,O
4,virtual,ORGANIZATION
5,bank.,ORGANIZATION
6,This,O
7,is,O
8,Nancy,PERSON
9,speaking.,O


**Stanford prediction**

In [477]:
def stanford_pred(document):
    
    java_path = ("C:/Program Files/Java/jdk-15.0.1/bin/java.exe")
    os.environ['JAVAHOME'] = java_path
    jar = ('D:/Program/stanford-ner-4.0.0/stanford-ner.jar')
    model = ('D:/Program/stanford-ner-4.0.0/classifiers/english.muc.7class.distsim.crf.ser') # 7 classes
    st = StanfordNERTagger(model, jar, encoding = 'utf-8')

    classified_text = st.tag(df['word'])

    st_pred = []

    for i in range(len(classified_text)):
        st_pred.append(str(classified_text[i][1]))
    
    return st_pred

In [541]:
st_pred = stanford_pred(df)

**NLTK prediction**

In [471]:
def NLTK_pred(document):
    
    tagged_words = pos_tag(document['word'])
    ne_tagged = ne_chunk(tagged_words)
    # convert prediction to multiline string and then to list (includes pos tags)
    multiline_string = nltk.chunk.tree2conllstr(ne_tagged)
    multiline_string.split("\n")
    nltk_pred = [i.split(" ")[2] for i in multiline_string.split("\n")]

    # amend class annotations for consistency with reference_annotations
    for n,i in enumerate(nltk_pred):
        if i == "B-PERSON":
            nltk_pred[n] = "PERSON"
        if i == "I-PERSON":
            nltk_pred[n] = "PERSON"    
        if i == "B-ORGANIZATION":
            nltk_pred[n] = "ORGANIZATION"
        if i == "I-ORGANIZATION":
            nltk_pred[n] = "ORGANIZATION"
        if i == "B-LOCATION":
            nltk_pred[n] = "LOCATION"
        if i == "I-LOCATION":
            nltk_pred[n] = "LOCATION"
        if i == "B-GPE":
            nltk_pred[n] = "LOCATION"
        if i == "I-GPE":
            nltk_pred[n] = "LOCATION"
    
    return nltk_pred

In [542]:
nltk_pred = NLTK_pred(df)

**spaCy prediction**

In [490]:
def spaCy_pred(document):
    new_words = []
    new_tokens = []
    count = []
    
    # replace . and a space with only a space
    document = document.replace(".", "").replace(",", "").replace("?", "").replace("$", "").replace("\'", "")
    # get rid of the . at the end of each line.
    cleaned_text = re.sub("\.$", "", document)
    
    nlp = en_core_web_sm.load()
    doc = nlp(cleaned_text)
    
    for token in doc:
        if token.ent_type_ != '':
            if token.ent_type_ == 'DATE':
                if (token.text != 'a') and (token.text != 'good') and (token.text != 'day'):
                    new_tokens.append(token.ent_type_)
                else:
                    new_tokens.append('O')
            elif token.ent_type_ == 'CARDINAL':
                new_tokens.append('CD')
            elif token.ent_type_ == 'GPE' or token.ent_type_ == 'LOC':
                new_tokens.append('LOCATION')
            elif token.ent_type_ == 'ORG':
                new_tokens.append('ORGANIZATION')
            else:
                new_tokens.append(token.ent_type_)
        elif token.ent_type_ == '':
            new_tokens.append('O')
    
    return new_tokens

In [491]:
# spacy_pred = spaCy_pred(data['transcript'])

In [539]:
spacy_pred = pd.read_csv('D:/DSBA/Project/Final-Project-2/data/Text files/spacy-pred-nancy-sandra.csv')
spacy_pred = spacy_pred['spacy_label']
spacy_pred = [x for x in spacy_pred]

**Accuracy of the 3 models**

In [509]:
from nltk.metrics.scores import accuracy
st_acc = accuracy(ref_label['label'], st_pred)
nltk_acc = accuracy(ref_label['label'], nltk_pred)
spacy_acc = accuracy(ref_label['label'], spacy_pred)

print('Stanford Accuracy: %.2f' % (st_acc * 100) + '%')
print('NLTK Accuracy: %.2f' % (nltk_acc * 100) + '%')
print('spaCy Accuracy: %.2f' % (spacy_acc * 100) + '%')

Stanford Accuracy: 89.01%
NLTK Accuracy: 85.34%
spaCy Accuracy: 91.10%


### Specify named entity accuracy evaluation

In [551]:
def only_PERSON(person):
    for n,i in enumerate(person):
        if i != "PERSON":
            person[n] = "O"
    
    return person

def only_ORG(org):
    for n,i in enumerate(org):
        if i != "ORGANIZATION":
            org[n] = "O"
    
    return org

def only_LOC(loc):
    for n,i in enumerate(loc):
        if i != "LOCATION":
            loc[n] = "O"
    
    return loc

def only_CD(cd):
    for n,i in enumerate(cd):
        if i != "CD":
            cd[n] = "O"
    
    return cd

def only_DATE(date):
    for n,i in enumerate(date):
        if i != "DATE":
            date[n] = "O"
    
    return date

def only_MONEY(money):
    for n,i in enumerate(money):
        if i != "MONEY":
            money[n] = "O"
    
    return money

**DETECT ACCURACIES**

In [569]:
print('Stanford Accuracy: %.2f' % (accuracy(only_PERSON([x for x in ref_label['label']]), only_PERSON(st_pred)) * 100) + '%')
print('NLTK Accuracy: %.2f' % (accuracy(only_PERSON([x for x in ref_label['label']]), only_PERSON(nltk_pred)) * 100) + '%')
print('spaCy Accuracy: %.2f' % (accuracy(only_PERSON([x for x in ref_label['label']]), only_PERSON(spacy_pred)) * 100) + '%')

Stanford Accuracy: 97.91%
NLTK Accuracy: 97.91%
spaCy Accuracy: 97.91%


In [568]:
print('\n-------------------------------------------\n')
print('PERSON DETECT ACCURACY:')
print('Stanford Accuracy: %.2f' % (accuracy(only_PERSON([x for x in ref_label['label']]), only_PERSON(st_pred)) * 100) + '%')
print('NLTK Accuracy: %.2f' % (accuracy(only_PERSON([x for x in ref_label['label']]), only_PERSON(nltk_pred)) * 100) + '%')
print('spaCy Accuracy: %.2f' % (accuracy(only_PERSON([x for x in ref_label['label']]), only_PERSON(spacy_pred)) * 100) + '%')
print('\n-------------------------------------------\n')

print('ORGANIZATION DETECT ACCURACY:')
print('Stanford Accuracy: %.2f' % (accuracy(only_ORG([x for x in ref_label['label']]), only_ORG(st_pred)) * 100) + '%')
print('NLTK Accuracy: %.2f' % (accuracy(only_ORG([x for x in ref_label['label']]), only_ORG(nltk_pred)) * 100) + '%')
print('spaCy Accuracy: %.2f' % (accuracy(only_ORG([x for x in ref_label['label']]), only_ORG(spacy_pred)) * 100) + '%')
print('\n-------------------------------------------\n')

print('LOCATION DETECT ACCURACY:')
print('Stanford Accuracy: %.2f' % (accuracy(only_LOC([x for x in ref_label['label']]), only_LOC(st_pred)) * 100) + '%')
print('NLTK Accuracy: %.2f' % (accuracy(only_LOC([x for x in ref_label['label']]), only_LOC(nltk_pred)) * 100) + '%')
print('spaCy Accuracy: %.2f' % (accuracy(only_LOC([x for x in ref_label['label']]), only_LOC(spacy_pred)) * 100) + '%')
print('\n-------------------------------------------\n')


-------------------------------------------

PERSON DETECT ACCURACY:
Stanford Accuracy: 97.91%
NLTK Accuracy: 97.91%
spaCy Accuracy: 97.91%

-------------------------------------------

ORGANIZATION DETECT ACCURACY:
Stanford Accuracy: 98.95%
NLTK Accuracy: 98.95%
spaCy Accuracy: 98.95%

-------------------------------------------

LOCATION DETECT ACCURACY:
Stanford Accuracy: 95.81%
NLTK Accuracy: 95.81%
spaCy Accuracy: 95.81%

-------------------------------------------



In [510]:
print([(x.text, x.ent_type_) for x in nlp(data['transcript']) if x.ent_type_ != ''])

[('Nancy', 'PERSON'), ('ATM', 'ORG'), ('ATM', 'ORG'), ('111', 'CARDINAL'), ('July', 'DATE'), ('7th', 'DATE'), ('.', 'DATE'), ('1974', 'DATE'), ('132', 'CARDINAL'), ('New', 'GPE'), ('York', 'GPE'), ('ATM', 'ORG'), ('877', 'CARDINAL'), ('877', 'CARDINAL'), ('Nancy', 'PERSON'), ('a', 'DATE'), ('good', 'DATE'), ('day', 'DATE')]


In [495]:
print([(str(x), x.label_) for x in nlp(data['transcript']).ents])

[('Nancy', 'PERSON'), ('ATM', 'ORG'), ('ATM', 'ORG'), ('111', 'CARDINAL'), ('July 7th.', 'DATE'), ('1974', 'DATE'), ('132', 'CARDINAL'), ('New York', 'GPE'), ('ATM', 'ORG'), ('877', 'CARDINAL'), ('877', 'CARDINAL'), ('Nancy', 'PERSON'), ('a good day', 'DATE')]


In [502]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in nlp(test)])

[(Hello, 'O', ''),
 (you, 'O', ''),
 (have, 'O', ''),
 (called, 'O', ''),
 (virtual, 'O', ''),
 (bank, 'O', ''),
 (This, 'O', ''),
 (is, 'O', ''),
 (Nancy, 'O', ''),
 (speaking, 'O', ''),
 (How, 'O', ''),
 (may, 'O', ''),
 (I, 'O', ''),
 (help, 'O', ''),
 (you, 'O', ''),
 (I, 'O', ''),
 (just, 'O', ''),
 (had, 'O', ''),
 (withdrawn, 'O', ''),
 (some, 'O', ''),
 (cash, 'O', ''),
 (from, 'O', ''),
 (the, 'O', ''),
 (ATM, 'B', 'ORG'),
 (machine, 'O', ''),
 (and, 'O', ''),
 (ATM, 'B', 'ORG'),
 (transaction, 'O', ''),
 (failed, 'O', ''),
 (but, 'O', ''),
 (money, 'O', ''),
 (got, 'O', ''),
 (debited, 'O', ''),
 (Can, 'O', ''),
 (you, 'O', ''),
 (fix, 'O', ''),
 (this, 'O', ''),
 (problem, 'O', ''),
 (Sure, 'O', ''),
 (What, 'O', ''),
 (is, 'O', ''),
 (your, 'O', ''),
 (account, 'O', ''),
 (number, 'O', ''),
 (It, 'O', ''),
 (is, 'O', ''),
 (111, 'B', 'CARDINAL'),
 (to, 'O', ''),
 (36669, 'O', ''),
 (Just, 'O', ''),
 (a, 'O', ''),
 (moment, 'O', ''),
 (Okay, 'O', ''),
 (And, 'O', ''),
 (what