# NER Using 3 Models and a Rule-Based Approach

In [1]:
# other libraries
import pandas as pd
from pprint import pprint
import re
import json

In [159]:
# NLTK and Stanford libraries
import nltk, re, os
import nltk.corpus
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.probability import FreqDist
from nltk.tag.stanford import StanfordNERTagger
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk import RegexpParser
from nltk.chunk.api import ChunkParserI

In [3]:
# spaCy libraries
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

***

## Reading json file and storing in data frame

In [165]:
def read_load(path):
    """Load a word-level transcript JSON file and tabulate its words.

    Parameters
    ----------
    path : str
        Location of a JSON file whose ``values`` object holds the parallel
        lists ``word``, ``start`` and ``end``.

    Returns
    -------
    tuple
        ``(data, df)``: ``data`` is the decoded JSON object; ``df`` is a
        DataFrame indexed by word position (index name ``indx``) with the
        columns ``word``, ``start_time`` and ``end_time``.
    """
    # Read explicitly as UTF-8: the platform default encoding (for example
    # cp1252 on Windows) corrupts non-ASCII characters in the transcript.
    with open(path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    values = data['values']
    words = values['word']

    # One row per word; the running position becomes the index.
    df = pd.DataFrame({'indx': list(range(len(words))),
                       'word': words,
                       'start_time': values['start'],
                       'end_time': values['end']})
    df = df.set_index('indx')

    return data, df

In [399]:
# Load the Nancy-Sandra transcript: `data` is the raw JSON object, `df` the per-word table.
data, df = read_load('D:/DSBA/Project/Final-Project-2/Nancy-Sandra.json')

In [400]:
# Preview the first ten rows of the word table.
df.head(10)

Unnamed: 0_level_0,word,start_time,end_time
indx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Hello,",0.1,0.7
1,you,0.7,1.4
2,have,1.4,1.6
3,called,1.6,2.0
4,virtual,2.0,2.3
5,bank.,2.3,2.6
6,This,2.6,3.3
7,is,3.3,3.5
8,Nancy,3.5,3.9
9,speaking.,3.9,4.4


***

## Stanford NER Tagger

The Stanford NER tagger ships with 3 models:

* 3-class model, which recognizes locations, persons, and organizations
* 4-class model, which recognizes locations, persons, organizations, and miscellaneous entities
* 7-class model, which recognizes locations, persons, organizations, times, money, percents, and dates

In this project, we use the 7-class model.

In [168]:
def Stanford_pred(df,
                  java_path="C:/Program Files/Java/jdk-15.0.1/bin/java.exe",
                  jar='D:/Program/stanford-ner-4.0.0/stanford-ner.jar',
                  model='D:/Program/stanford-ner-4.0.0/classifiers/english.muc.7class.distsim.crf.ser'):
    """Tag every word of ``df['word']`` with the Stanford NER tagger.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``word`` column. It is modified in place: a
        ``stanford_pred`` column is added.
    java_path : str, optional
        Path to the Java executable; exported as the ``JAVAHOME``
        environment variable before the tagger is created.
    jar : str, optional
        Path to ``stanford-ner.jar``.
    model : str, optional
        Path to the serialized classifier (defaults to the 7-class model).

    Returns
    -------
    tuple
        ``(st_pred, df)``: ``st_pred`` is the list of per-word labels and
        ``df`` is the same DataFrame that was passed in.

    Notes
    -----
    Only PERSON, LOCATION, ORGANIZATION, MONEY and DATE are kept; every
    other tag returned by the tagger is collapsed to ``'O'``.
    """
    kept_labels = {'PERSON', 'LOCATION', 'ORGANIZATION', 'MONEY', 'DATE'}

    os.environ['JAVAHOME'] = java_path
    st = StanfordNERTagger(model, jar, encoding='utf-8')

    # Each item of classified_text is a (word, tag) pair.
    classified_text = st.tag(df['word'])

    st_pred = [str(tag) if str(tag) in kept_labels else 'O'
               for _word, tag in classified_text]

    df['stanford_pred'] = st_pred

    return st_pred, df

***

## NLTK

**NLTK recognizes the following entities:**
* ORGANIZATION - Georgia-Pacific Corp., WHO
* PERSON - Eddy Bonte, President Obama
* LOCATION - Murray River, Mount Everest
* DATE - June, 2008-06-29
* TIME - two fifty a m, 1:30 p.m.
* MONEY - 175 million Canadian Dollars, GBP 10.40
* PERCENT - twenty pct, 18.75 %
* FACILITY - Washington Monument, Stonehenge
* GPE - South East Asia, Midlothian

In [151]:
def NLTK_pred(df):
    """Tag every word of ``df['word']`` with NLTK's ``ne_chunk``.

    The words are POS-tagged, chunked into named entities, and the chunk
    type of each word is translated into the label set used by the other
    taggers in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``word`` column. It is modified in place: an
        ``nltk_pred`` column is added.

    Returns
    -------
    tuple
        ``(nltk_pred, df)``: ``nltk_pred`` is the list of per-word labels
        and ``df`` is the same DataFrame that was passed in.
    """
    # Chunk type (IOB prefix removed) -> label used for evaluation.
    # GPE is folded into LOCATION. DATE and MONEY are kept so that they can
    # match the reference labels instead of surviving as 'B-DATE'/'I-MONEY'.
    # Anything not listed (FACILITY, PERCENT, TIME, GSP, ...) becomes 'O'.
    label_map = {
        'PERSON': 'PERSON',
        'ORGANIZATION': 'ORGANIZATION',
        'LOCATION': 'LOCATION',
        'GPE': 'LOCATION',
        'DATE': 'DATE',
        'MONEY': 'MONEY',
    }

    tagged_words = pos_tag(df['word'])
    ne_tagged = ne_chunk(tagged_words)

    # tree2conlltags yields (word, pos, iob) triples directly, avoiding the
    # string round-trip that breaks on empty input.
    nltk_pred = []
    for _word, _pos, iob in nltk.chunk.tree2conlltags(ne_tagged):
        chunk_type = iob.split('-', 1)[-1]  # 'B-PERSON' -> 'PERSON', 'O' -> 'O'
        nltk_pred.append(label_map.get(chunk_type, 'O'))

    df['nltk_pred'] = nltk_pred

    return nltk_pred, df

***

## spaCy

**spaCy recognizes the following entities:**
* PERSON - People, including fictional.
* NORP - Nationalities or religious or political groups.
* FAC - Buildings, airports, highways, bridges, etc.
* ORG - Companies, agencies, institutions, etc.
* GPE - Countries, cities, states.
* LOC - Non-GPE locations, mountain ranges, bodies of water.
* PRODUCT - Objects, vehicles, foods, etc. (Not services.)
* EVENT - Named hurricanes, battles, wars, sports events, etc.
* WORK_OF_ART - Titles of books, songs, etc.
* LAW - Named documents made into laws.
* LANGUAGE - Any named language.
* DATE - Absolute or relative dates or periods.
* TIME - Times smaller than a day.
* PERCENT - Percentage, including “%”.
* MONEY - Monetary values, including unit.
* QUANTITY - Measurements, as of weight or distance.
* ORDINAL - “first”, “second”, etc.
* CARDINAL - Numerals that do not fall under another type.

In [398]:
def spaCy_pred(dictt, df):
    """Label every word of ``df['word']`` using spaCy's ``en_core_web_sm``.

    The full transcript is parsed once. Every token that carries an entity
    type (except the filler tokens listed below) is collected with its
    label, and each word of ``df`` receives the label of the first
    collected token that occurs as a substring of that word, or ``'O'``
    when none does.

    Parameters
    ----------
    dictt : str
        Transcript text handed to the spaCy pipeline.
    df : pandas.DataFrame
        Table with a ``word`` column. It is modified in place: a
        ``spacy_pred`` column is added.

    Returns
    -------
    tuple
        ``(sp_pred, df)``: ``sp_pred`` is the list of per-word labels and
        ``df`` is the same DataFrame that was passed in.

    Notes
    -----
    LOC and GPE become LOCATION, CARDINAL becomes CD and ORG becomes
    ORGANIZATION; every other spaCy label is passed through unchanged.
    Matching is by substring and ignores position, so a word can pick up
    the label of an entity token that appears elsewhere in the transcript.
    """
    # Tokens excluded from the entity list even when spaCy tags them.
    skipped_tokens = {'a', 'good', 'day', '.', ','}
    relabel = {'LOC': 'LOCATION', 'GPE': 'LOCATION',
               'CARDINAL': 'CD', 'ORG': 'ORGANIZATION'}

    nlp = en_core_web_sm.load()
    doc = nlp(dictt)  # parse once; texts and labels come from the same pass

    # (token text, evaluation label) pairs, in document order.
    entities = [(str(token), relabel.get(token.ent_type_, token.ent_type_))
                for token in doc
                if token.ent_type_ != '' and str(token) not in skipped_tokens]

    sp_pred = []
    for ww in df['word']:
        # The first entity token contained in the word wins.
        label = next((str(lab) for text, lab in entities if text in ww), 'O')
        sp_pred.append(label)

    df['spacy_pred'] = sp_pred

    return sp_pred, df

***

In [None]:
# Write the current contents of df to word-time.csv.
df.to_csv('D:/DSBA/Project/Final-Project-2/data/Text files/word-time.csv')

In [None]:
# NOTE(review): spc_df is not defined in any visible cell — confirm where it is created.
spc_df.to_csv('D:/DSBA/Project/Final-Project-2/data/Text files/spacy-ner-tagger.csv')

***

In [None]:
# Sample customer-service dialogue; account, ID and phone numbers appear as digit strings.
sentence = """Hello, you have called Virtual bank, this is Linda speaking. How may I help you?
Hi Linda. I was just at your Ville branch and I think I left my Debit card in the ATM machine.
Okay. Do you have your Debit card number?
I don’t have.
Okay, well do you have the checking account number associated with the Debit
card? 
That I do have. Are you ready? I will give you what I have got. 765456789. 
Okay. That’s 765456789.
Correct.
What is your identification number?
7745896589665.
Okay, I have 7745896589665 and what is your name sir? 
It is Robert Applebaum.
Okay. I have Robert Applebaum.
Yes.
And what is your date of birth Mr. Applebaum?
July 7th, 1974. 
Okay. July 7th, 1974.
Yes.
And your phone number?
It is 6102651715. 
Okay. I have 6102651715.
Yes.
Okay Mr. Applebaum. I have just suspended your card. If it is in the machine, we will contact you and lift the suspension. 
Oh, thank you.
Sure. Thank you."""

In [None]:
# Rule-based detection of spoken identifiers: when a sentence mentions a cue
# phrase ("phone number", "account number", "identification number"), the
# first of the next two sentences that contains a number is recorded with
# the matching label.
#
# NOTE(review): tokenized_sent is not defined in any visible cell; presumably
# it is sent_tokenize(sentence) — confirm before running.

# A digit or a spelled-out number word. 'one' is matched as a whole word only,
# because it also occurs inside ordinary words such as "phone" or "done".
NUMBER_RE = re.compile(
    r'[0-9]|zero|two|three|four|five|six|seven|eight|nine|\bone\b')

# (cue pattern, entity label), checked in this order for every sentence.
CUE_RULES = (
    (re.compile('phone number'), 'PHONENUM'),
    (re.compile('account number'), 'ACCNUM'),
    (re.compile('(identify number|identification number)'), 'IDCARD'),
)

wlst = []
nelst = []

for i, current_sent in enumerate(tokenized_sent):
    for cue_re, label in CUE_RULES:
        if not cue_re.search(current_sent):
            continue
        # Slicing never runs past the end, so a cue in the last sentences
        # no longer raises IndexError.
        for candidate in tokenized_sent[i + 1:i + 3]:
            if NUMBER_RE.search(candidate):
                wlst.append(str(candidate))
                nelst.append(label)
                break

pd.DataFrame({'sent': wlst, 'ne': nelst})

***

## Models Evaluation

In [401]:
# Reference (gold) labels for the Nancy-Sandra transcript, as a plain list.
ref_label = list(
    pd.read_csv('D:/DSBA/Project/Final-Project-2/data/Text files/ref-nancy-sandra.csv')['label']
)

In [402]:
# Run the three taggers on the same word table; each call also adds its own
# prediction column to df (stanford_pred, nltk_pred, spacy_pred).
st_pred, df = Stanford_pred(df)
nltk_pred, df = NLTK_pred(df)
sp_pred, df = spaCy_pred(data['transcript'], df)

In [404]:
# Write df (words, timings and the model prediction columns) to 3_MODELS.csv.
df.to_csv('D:/DSBA/Project/Final-Project-2/data/Text files/3_MODELS.csv')

**Accuracy of the 3 models**

In [405]:
from nltk.metrics.scores import accuracy

# Overall accuracy of each model's labels against the reference labels.
st_acc = accuracy(ref_label, st_pred)
nltk_acc = accuracy(ref_label, nltk_pred)
spacy_acc = accuracy(ref_label, sp_pred)

# One report line per model, reported as percentages.
print('\n'.join((
    'Stanford Accuracy: %.2f' % (st_acc * 100) + '%',
    'NLTK Accuracy: %.2f' % (nltk_acc * 100) + '%',
    'spaCy Accuracy: %.2f' % (spacy_acc * 100) + '%',
)))

Stanford Accuracy: 89.01%
NLTK Accuracy: 85.34%
spaCy Accuracy: 91.62%


### Specific named entity accuracy evaluation

In [27]:
def only_ENT(ent):
    """Split one label sequence into six one-vs-rest label sequences.

    For each of PERSON, ORGANIZATION, LOCATION, CD, DATE and MONEY a list of
    the same length as ``ent`` is built in which that label is kept and
    every other label is replaced by ``'O'``.

    Parameters
    ----------
    ent : iterable of str
        Per-word labels.

    Returns
    -------
    tuple of list
        ``(person, org, loc, cd, date, money)``.
    """
    def isolate(target):
        # Keep `target` where it occurs; blank every other label to 'O'.
        return ['O' if tag != target else target for tag in ent]

    return tuple(
        isolate(name)
        for name in ('PERSON', 'ORGANIZATION', 'LOCATION', 'CD', 'DATE', 'MONEY')
    )

In [186]:
def acc_each_ENT(r_ps, r_org, r_loc, r_cd, r_date, r_money,
                 st_ps, st_org, st_loc, st_cd, st_date, st_money,
                 nltk_ps, nltk_org, nltk_loc, nltk_cd, nltk_date, nltk_money,
                 sp_ps, sp_org, sp_loc, sp_cd, sp_date, sp_money):
    """Print each model's accuracy for every entity type.

    The ``r_*`` arguments are the reference label lists; ``st_*``,
    ``nltk_*`` and ``sp_*`` are the Stanford, NLTK and spaCy predictions for
    the same entity type. Scores come from ``accuracy`` (imported elsewhere
    in this notebook from ``nltk.metrics.scores``) and are printed as
    percentages with two decimals. Nothing is returned.
    """
    rule = '-------------------------------------------'

    # (heading, reference, Stanford, NLTK, spaCy) for each report section.
    sections = (
        ('PERSON DETECT ACCURACY:', r_ps, st_ps, nltk_ps, sp_ps),
        ('ORGANIZATION DETECT ACCURACY:', r_org, st_org, nltk_org, sp_org),
        ('LOCATION DETECT ACCURACY:', r_loc, st_loc, nltk_loc, sp_loc),
        ('CARDINAL NUMBER DETECT ACCURACY:', r_cd, st_cd, nltk_cd, sp_cd),
        ('DATE DETECT ACCURACY:', r_date, st_date, nltk_date, sp_date),
        ('MONEY DETECT ACCURACY:', r_money, st_money, nltk_money, sp_money),
    )
    final = len(sections) - 1

    print(rule + '\n')
    for position, (heading, reference, stanford, nltk_tags, spacy_tags) in enumerate(sections):
        print(heading)
        print('Stanford Accuracy: %.2f' % (accuracy(reference, stanford) * 100) + '%')
        print('NLTK Accuracy: %.2f' % (accuracy(reference, nltk_tags) * 100) + '%')
        print('spaCy Accuracy: %.2f' % (accuracy(reference, spacy_tags) * 100) + '%')
        # Sections are separated by a rule with blank lines on both sides;
        # the closing rule has no trailing blank line.
        print('\n' + rule + ('\n' if position < final else ''))

### Accuracies of specific named entities

In [406]:
# Build one-vs-rest label lists (PERSON, ORGANIZATION, LOCATION, CD, DATE, MONEY)
# for the reference labels and for each model's predictions.
r_ps, r_org, r_loc, r_cd, r_date, r_money = only_ENT(ref_label)
st_ps, st_org, st_loc, st_cd, st_date, st_money = only_ENT(st_pred)
nltk_ps, nltk_org, nltk_loc, nltk_cd, nltk_date, nltk_money = only_ENT(nltk_pred)
sp_ps, sp_org, sp_loc, sp_cd, sp_date, sp_money = only_ENT(sp_pred)

In [407]:
# Print the per-entity accuracy report for the Nancy-Sandra transcript.
acc_each_ENT(r_ps, r_org, r_loc, r_cd, r_date, r_money, st_ps, st_org, st_loc, st_cd, st_date, st_money, nltk_ps, nltk_org, nltk_loc, nltk_cd, nltk_date, nltk_money, sp_ps, sp_org, sp_loc, sp_cd, sp_date, sp_money)

-------------------------------------------

PERSON DETECT ACCURACY:
Stanford Accuracy: 97.91%
NLTK Accuracy: 96.86%
spaCy Accuracy: 98.95%

-------------------------------------------

ORGANIZATION DETECT ACCURACY:
Stanford Accuracy: 97.38%
NLTK Accuracy: 97.38%
spaCy Accuracy: 97.38%

-------------------------------------------

LOCATION DETECT ACCURACY:
Stanford Accuracy: 96.34%
NLTK Accuracy: 94.76%
spaCy Accuracy: 96.86%

-------------------------------------------

CARDINAL NUMBER DETECT ACCURACY:
Stanford Accuracy: 96.86%
NLTK Accuracy: 96.86%
spaCy Accuracy: 97.91%

-------------------------------------------

DATE DETECT ACCURACY:
Stanford Accuracy: 100.00%
NLTK Accuracy: 98.43%
spaCy Accuracy: 100.00%

-------------------------------------------

MONEY DETECT ACCURACY:
Stanford Accuracy: 100.00%
NLTK Accuracy: 100.00%
spaCy Accuracy: 100.00%

-------------------------------------------


***

## Conversation that contains a MONEY named entity

In [408]:
# Load the second transcript (dict.json) into its own raw object and word table.
data2, df2 = read_load('D:/DSBA/Project/Final-Project-2/dict.json')

In [409]:
# Run the three taggers on the second transcript; each call also adds its
# prediction column to df2.
st_pred2, df2 = Stanford_pred(df2)
nltk_pred2, df2 = NLTK_pred(df2)
sp_pred2, df2 = spaCy_pred(data2['transcript'], df2)

In [410]:
# Reference (gold) labels for the second transcript, as a plain list.
ref_label2 = list(
    pd.read_csv('D:/DSBA/Project/Final-Project-2/data/Text files/ref_label.csv')['label']
)

In [411]:
# Build one-vs-rest label lists for the second transcript's reference labels
# and for each model's predictions.
r_ps2, r_org2, r_loc2, r_cd2, r_date2, r_money2 = only_ENT(ref_label2)
st_ps2, st_org2, st_loc2, st_cd2, st_date2, st_money2 = only_ENT(st_pred2)
nltk_ps2, nltk_org2, nltk_loc2, nltk_cd2, nltk_date2, nltk_money2 = only_ENT(nltk_pred2)
sp_ps2, sp_org2, sp_loc2, sp_cd2, sp_date2, sp_money2 = only_ENT(sp_pred2)

In [412]:
# Overall accuracy of each model on the second transcript.
st_acc2 = accuracy(ref_label2, st_pred2)
nltk_acc2 = accuracy(ref_label2, nltk_pred2)
spacy_acc2 = accuracy(ref_label2, sp_pred2)

# One report line per model, reported as percentages.
print('\n'.join((
    'Stanford Accuracy: %.2f' % (st_acc2 * 100) + '%',
    'NLTK Accuracy: %.2f' % (nltk_acc2 * 100) + '%',
    'spaCy Accuracy: %.2f' % (spacy_acc2 * 100) + '%',
)))

Stanford Accuracy: 91.87%
NLTK Accuracy: 91.06%
spaCy Accuracy: 93.50%


In [413]:
# Print the per-entity accuracy report for the second transcript.
acc_each_ENT(r_ps2, r_org2, r_loc2, r_cd2, r_date2, r_money2, st_ps2, st_org2, st_loc2, st_cd2, st_date2, st_money2, nltk_ps2, nltk_org2, nltk_loc2, nltk_cd2, nltk_date2, nltk_money2, sp_ps2, sp_org2, sp_loc2, sp_cd2, sp_date2, sp_money2)

-------------------------------------------

PERSON DETECT ACCURACY:
Stanford Accuracy: 96.75%
NLTK Accuracy: 97.56%
spaCy Accuracy: 98.37%

-------------------------------------------

ORGANIZATION DETECT ACCURACY:
Stanford Accuracy: 100.00%
NLTK Accuracy: 99.19%
spaCy Accuracy: 100.00%

-------------------------------------------

LOCATION DETECT ACCURACY:
Stanford Accuracy: 98.37%
NLTK Accuracy: 98.37%
spaCy Accuracy: 98.37%

-------------------------------------------

CARDINAL NUMBER DETECT ACCURACY:
Stanford Accuracy: 100.00%
NLTK Accuracy: 100.00%
spaCy Accuracy: 100.00%

-------------------------------------------

DATE DETECT ACCURACY:
Stanford Accuracy: 97.56%
NLTK Accuracy: 95.93%
spaCy Accuracy: 96.75%

-------------------------------------------

MONEY DETECT ACCURACY:
Stanford Accuracy: 99.19%
NLTK Accuracy: 99.19%
spaCy Accuracy: 100.00%

-------------------------------------------
