# NER Using 3 Models and Rules-based

In [1]:
import pandas as pd
from pprint import pprint
import re
import json

In [2]:
# NLTK and Stanford libraries
import nltk, re, os
import nltk.corpus
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.probability import FreqDist
from nltk.tag.stanford import StanfordNERTagger
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk import RegexpParser
from nltk.chunk.api import ChunkParserI

In [3]:
# spaCy libraries
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

***

In [112]:
# reading json file
with open('D:/DSBA/Project/Final-Project-2/dict.json', 'r') as json_file:
    f = json.load(json_file)
data = f
print(data)

{'transcript': "Hello, you have called virtual bank. This is Sarah speaking. How may I help you? Hey sir, I would to refund my money back. Could you tell me why you want to refund your money? I bought something at the shop with my debit card on the 2nd of December 2019 and it debited my money twice. Okay. What is your name Madam? My name is Amy golf golf. Okay, I have Amy Gough Gough and where did you use the debit card to buy something? I bought a bag at Mega bangna it costs $800 were so sorry. In this case. We could not refund your money back. You have to talk with the store that you bought and they will manage. Judge this oh, really? I do not even know. Yes madam. We're so sorry, but we could not fix this problem the store handles about this. Okay, nevermind. Thanks. Would you like another service? That is all thanks. Thank you for using our service. Have a good day.", 'values': {'start': [0.0, 0.5, 0.6, 0.8, 1.1, 1.6, 2.0, 2.1, 2.3, 2.7, 3.2, 3.4, 3.6, 3.7, 4.0, 4.2, 4.4, 4.7, 4.9,

In [245]:
df = pd.DataFrame({'count': ([X for X in range(len(data['values']['word']))]), 'word': data['values']['word'], 'start_time': data['values']['start'], 'end_time': data['values']['end']})
df.head(10)

Unnamed: 0,count,word,start_time,end_time
0,0,"Hello,",0.0,0.5
1,1,you,0.5,0.6
2,2,have,0.6,0.8
3,3,called,0.8,1.1
4,4,virtual,1.1,1.6
5,5,bank.,1.6,2.0
6,6,This,2.0,2.1
7,7,is,2.1,2.3
8,8,Sarah,2.3,2.7
9,9,speaking.,2.7,3.2


***

## Stanford NER Tagger

It has 3 models

* 3 classes model for recognizing locations, person, and organizations
* 4 classes model for recognizing locations, person, organizations, and miscellaneous entities
* 7 classes model for recognizing locations, person, organizations, times, money, percents, and dates

In this project, we use 7 classes model

In [279]:
def stanford_tagger(document):
    lst_word = []
    lst_ne = []
    lst_count = []
    # lst_ps = []
    java_path = ("C:/Program Files/Java/jdk-15.0.1/bin/java.exe")
    os.environ['JAVAHOME'] = java_path

    jar = ('D:/Program/stanford-ner-4.0.0/stanford-ner.jar')
    model = ('D:/Program/stanford-ner-4.0.0/classifiers/english.muc.7class.distsim.crf.ser') # 7 classes
    st = StanfordNERTagger(model, jar, encoding = 'utf-8')
    
    # word_token = word_tokenize(document)
    classified_text = st.tag(document['word'])
    
    # token_df = pd.DataFrame({'count': ([X for X in range(len(word_token))]),'word': ([X for X in word_token])})
    
    for i in range(len(classified_text)):
        if str(classified_text[i][1]) != 'O':
            lst_word.append(str(classified_text[i][0]))
            lst_ne.append(str(classified_text[i][1]))
            lst_count.append(i)
    st_df = pd.DataFrame({'count': lst_count,'word': lst_word, 'stanford_ne': lst_ne})
    # ps_df = st_df[st_df['ne'] == 'PERSON']
    
    # for w in st_df['word']:
        # lst_ps.append(w)
            # print(str(classified_text[i][0]), '>>',  str(classified_text[i][1]))
    return st_df

In [280]:
st_df = stanford_tagger(df)
st_df

Unnamed: 0,count,word,stanford_ne
0,8,Sarah,PERSON
1,9,speaking.,PERSON
2,49,December,DATE
3,50,2019,DATE
4,72,Amy,PERSON
5,73,Gough,PERSON
6,74,Gough,PERSON


***

## NLTK

**NLTK recognizes the following entities:**
* ORGANIZATION - Georgia-Pacific Corp., WHO
* PERSON - Eddy Bonte, President Obama
* LOCATION - Murray River, Mount Everest
* DATE - June, 2008-06-29
* TIME - two fifty a m, 1:30 p.m.
* MONEY - 175 million Canadian Dollars, GBP 10.40
* PERCENT - twenty pct, 18.75 %
* FACILITY - Washington Monument, Stonehenge
* GPE - South East Asia, Midlothian

In [289]:
pos_tag(df['word'])[0][1]

'NNP'

In [None]:
lst_word = []
lst_ne = []
lst_count = []

tagged_words = pos_tag(df['word'])
ne_tagged = ne_chunk(tagged_words, binary = False)

for chunk in range(len(ne_tagged)):
    if hasattr(ne_tagged[chunk], 'label'):
        lst_word.append(str(ne_tagged[chunk][0][0]))
        lst_ne.append(str(ne_tagged[chunk].label()))
        lst_count.append(chunk)
    if not hasattr(ne_tagged[chunk], 'label'):
        if str(ne_tagged[chunk][0:][1:][0]) == 'CD':
            lst_word.append(str(ne_tagged[chunk][0]))
            lst_ne.append(str(ne_tagged[chunk][0:][1:][0]))
            lst_count.append(chunk)
            
nltk_df = pd.DataFrame({'count': lst_count, 'word': lst_word, 'nltk_ne': lst_ne})
nltk_df

In [313]:
def nltk_tagger(document):
    lst_word = []
    lst_ne = []
    lst_count = []
    
    # word_token = word_tokenize(document)
    tagged_words = pos_tag(document['word'])
    ne_tagged = ne_chunk(tagged_words, binary = False) # False for details NE type
    # ne_tagged.draw()
    
    for chunk in range(len(ne_tagged)):
        if hasattr(ne_tagged[chunk], 'label'):
            lst_word.append(str(ne_tagged[chunk][0][0]))
            lst_ne.append(str(ne_tagged[chunk].label()))
            lst_count.append(chunk)
        if not hasattr(ne_tagged[chunk], 'label'):
            if str(ne_tagged[chunk][0:][1:][0]) == 'CD':
                lst_word.append(str(ne_tagged[chunk][0]))
                lst_ne.append(str(ne_tagged[chunk][0:][1:][0]))
                lst_count.append(chunk)
            
    nltk_df = pd.DataFrame({'count': lst_count, 'word': lst_word, 'nltk_ne': lst_ne})
    
    return nltk_df

In [315]:
nltk_df = nltk_tagger(df)
nltk_df

Unnamed: 0,count,word,nltk_ne
0,8,Sarah,PERSON
1,15,Hey,PERSON
2,47,2nd,CD
3,50,2019,CD
4,66,Amy,PERSON
5,72,Amy,PERSON
6,89,Mega,ORGANIZATION
7,130,Yes,PERSON


***

## spaCy

**spaCy recognizes the following entities:**
* PERSON - People, including fictional.
* NORP - Nationalities or religious or political groups.
* FAC - Buildings, airports, highways, bridges, etc.
* ORG - Companies, agencies, institutions, etc.
* GPE - Countries, cities, states.
* LOC - Non-GPE locations, mountain ranges, bodies of water.
* PRODUCT - Objects, vehicles, foods, etc. (Not services.)
* EVENT - Named hurricanes, battles, wars, sports events, etc.
* WORK_OF_ART - Titles of books, songs, etc.
* LAW - Named documents made into laws.
* LANGUAGE - Any named language.
* DATE - Absolute or relative dates or periods.
* TIME - Times smaller than a day.
* PERCENT - Percentage, including ”%“.
* MONEY - Monetary values, including unit.
* QUANTITY - Measurements, as of weight or distance.
* ORDINAL - “first”, “second”, etc.
* CARDINAL - Numerals that do not fall under another type.

In [91]:
nlp = en_core_web_sm.load()

In [106]:
doc = nlp(ct)

**IOB Scheme**
* "I" : Token is inside an entity.
* "O" : Token is outside an entity.
* "B" : Token begins an entity.
* ""  : No entity tag is set (missing value).

In [111]:
# IOB Scheme
pprint([(X.text, X.ent_type_, X.ent_iob_) for X in doc])

[('hello', '', 'O'),
 ('you', '', 'O'),
 ('have', '', 'O'),
 ('called', '', 'O'),
 ('virtual', '', 'O'),
 ('bank', '', 'O'),
 ('this', '', 'O'),
 ('is', '', 'O'),
 ('linda', 'PERSON', 'B'),
 ('speaking', '', 'O'),
 ('how', '', 'O'),
 ('may', '', 'O'),
 ('i', '', 'O'),
 ('help', '', 'O'),
 ('you', '', 'O'),
 ('?', '', 'O'),
 ('\n', '', 'O'),
 ('hi', '', 'O'),
 ('linda', 'PERSON', 'B'),
 ('i', '', 'O'),
 ('was', '', 'O'),
 ('just', '', 'O'),
 ('at', '', 'O'),
 ('your', '', 'O'),
 ('ville', 'GPE', 'B'),
 ('branch', '', 'O'),
 ('and', '', 'O'),
 ('i', '', 'O'),
 ('think', '', 'O'),
 ('i', '', 'O'),
 ('left', '', 'O'),
 ('my', '', 'O'),
 ('debit', '', 'O'),
 ('card', '', 'O'),
 ('in', '', 'O'),
 ('the', '', 'O'),
 ('atm', 'ORG', 'B'),
 ('machine', '', 'O'),
 ('\n', '', 'O'),
 ('okay', '', 'O'),
 ('do', '', 'O'),
 ('you', '', 'O'),
 ('have', '', 'O'),
 ('your', '', 'O'),
 ('debit', '', 'O'),
 ('card', '', 'O'),
 ('number', '', 'O'),
 ('?', '', 'O'),
 ('\n', '', 'O'),
 ('i', '', 'O'),
 ('do',

In [344]:
def clean_text(text):
    # replace . and a space with only a space
    text = text.replace(".", "").replace(",", "").replace("?", "").replace("$", "").replace("\'", "")
    # get rid of the . at the end of each line. 
    cleaned_text = re.sub("\.$", "", text)
    
    return cleaned_text

def spaCy_tagger(document):
    new_words = []
    new_tokens = []
    count = []
    
    nlp = en_core_web_sm.load()
    doc = nlp(clean_text(document))
    
    for token in range(len(doc)):
        if doc[token].ent_type_ != '':
            if doc[token].ent_type_ == 'DATE':
                if (doc[token].text != 'a') and (doc[token].text != 'good') and (doc[token].text != 'day'):
                    new_words.append(doc[token].text)
                    new_tokens.append(doc[token].ent_type_)
                    count.append(token)
            elif doc[token].ent_type_ == 'GPE' or doc[token].ent_type_ == 'LOC':
                new_words.append(doc[token].text)
                new_tokens.append('LOCATION')
                count.append(token)
            elif doc[token].ent_type_ == 'ORG':
                new_words.append(doc[token].text)
                new_tokens.append('ORGANIZATION')
                count.append(token)
            else:
                new_words.append(doc[token].text)
                new_tokens.append(doc[token].ent_type_)
                count.append(token)
     
    spc_df = pd.DataFrame({'count': count, 'word': new_words, 'spacy_ne': new_tokens})
    
    return spc_df

In [345]:
spc_df = spaCy_tagger(data['transcript'])
spc_df

Unnamed: 0,count,word,spacy_ne
0,8,Sarah,PERSON
1,46,the,DATE
2,47,2nd,DATE
3,48,of,DATE
4,49,December,DATE
5,50,2019,DATE
6,66,Amy,PERSON
7,72,Amy,PERSON
8,73,Gough,PERSON
9,74,Gough,PERSON


In [346]:
spc_df.to_csv('D:/DSBA/Project/Final-Project-2/data/Text files/spacy-ner-tagger.csv')

In [66]:
sentence = """Hello, you have called Virtual bank, this is Linda speaking. How may I help you?
Hi Linda. I was just at your Ville branch and I think I left my Debit card in the ATM machine.
Okay. Do you have your Debit card number?
I don’t have.
Okay, well do you have the checking account number associated with the Debit
card? 
That I do have. Are you ready? I will give you what I have got. 765456789. 
Okay. That’s 765456789.
Correct.
What is your identification number?
7745896589665.
Okay, I have 7745896589665 and what is your name sir? 
It is Robert Applebaum.
Okay. I have Robert Applebaum.
Yes.
And what is your date of birth Mr. Applebaum?
July 7th, 1974. 
Okay. July 7th, 1974.
Yes.
And your phone number?
It is 6102651715. 
Okay. I have 6102651715.
Yes.
Okay Mr. Applebaum. I have just suspended your card. If it is in the machine, we will contact you and lift the suspension. 
Oh, thank you.
Sure. Thank you."""

In [67]:
tokenized_sent = sent_tokenize(sentence)
tokenized_sent

['Hello, you have called Virtual bank, this is Linda speaking.',
 'How may I help you?',
 'Hi Linda.',
 'I was just at your Ville branch and I think I left my Debit card in the ATM machine.',
 'Okay.',
 'Do you have your Debit card number?',
 'I don’t have.',
 'Okay, well do you have the checking account number associated with the Debit\ncard?',
 'That I do have.',
 'Are you ready?',
 'I will give you what I have got.',
 '765456789.',
 'Okay.',
 'That’s 765456789.',
 'Correct.',
 'What is your identification number?',
 '7745896589665.',
 'Okay, I have 7745896589665 and what is your name sir?',
 'It is Robert Applebaum.',
 'Okay.',
 'I have Robert Applebaum.',
 'Yes.',
 'And what is your date of birth Mr. Applebaum?',
 'July 7th, 1974.',
 'Okay.',
 'July 7th, 1974.',
 'Yes.',
 'And your phone number?',
 'It is 6102651715.',
 'Okay.',
 'I have 6102651715.',
 'Yes.',
 'Okay Mr. Applebaum.',
 'I have just suspended your card.',
 'If it is in the machine, we will contact you and lift the su

***

In [64]:
wlst = []
nelst = []

for i in range(0, len(tokenized_sent)):
    if re.search('phone number', tokenized_sent[i]):
        if re.search('([0-9]|zero|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+1]) or re.search(r'(\bone\b)+', tokenized_sent[i+1]):
            wlst.append(str(tokenized_sent[i+1]))
            nelst.append('PHONENUM')
        elif re.search('([0-9]|zero|one|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+2])  or re.search(r'(\bone\b)+', tokenized_sent[i+2]):
            wlst.append(str(tokenized_sent[i+2]))
            nelst.append('PHONENUM')
    if re.search('account number', tokenized_sent[i]):
        if re.search('([0-9]|zero|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+1]) or re.search(r'(\bone\b)+', tokenized_sent[i+1]):
            wlst.append(str(tokenized_sent[i+1]))
            nelst.append('ACCNUM')
        elif re.search('([0-9]|zero|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+2]) or re.search(r'(\bone\b)+', tokenized_sent[i+2]):
            wlst.append(str(tokenized_sent[i+2]))
            nelst.append('ACCNUM')
    if re.search('(identify number|identification number)', tokenized_sent[i]):
        if re.search('([0-9]|zero|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+1]) or re.search(r'(\bone\b)+', tokenized_sent[i+1]):
            wlst.append(str(tokenized_sent[i+1]))
            nelst.append('IDCARD')
        elif re.search('([0-9]|zero|one|two|three|four|five|six|seven|eight|nine)+', tokenized_sent[i+2]) or re.search(r'(\bone\b)+', tokenized_sent[i+2]):
            wlst.append(str(tokenized_sent[i+2]))
            nelst.append('IDCARD')

pd.DataFrame({'sent': wlst, 'ne': nelst})

Unnamed: 0,sent,ne
0,"I didn't use my old phone number anymore, but ...",PHONENUM
1,That's one one one five four four two two two?,ACCNUM
2,My identification number is 1 1 0 2 5 6 9 8 5 ...,IDCARD
3,Five four two 9800 11-2.,IDCARD
4,My phone number is seven.,PHONENUM
5,Seven one nine eight five five five eight seven.,PHONENUM
