# NER Using 3 Models and a Rule-Based Approach

In [2]:
import pandas as pd
from pprint import pprint
import re
import json

In [3]:
# NLTK and Stanford libraries
import nltk, re, os
import nltk.corpus
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.probability import FreqDist
from nltk.tag.stanford import StanfordNERTagger
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
from nltk import RegexpParser
from nltk.chunk.api import ChunkParserI

In [4]:
# spaCy libraries
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

***

In [5]:
# Load the transcript dictionary from disk. JSON text is UTF-8, so the encoding
# is stated explicitly instead of relying on the platform default code page.
with open('D:/DSBA/Project/Final-Project-2/dict.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
f = data  # original alias kept in case other cells still refer to `f`
data

{'transcript': "Hello, you have called virtual bank. This is Sarah speaking. How may I help you? Hey sir, I would to refund my money back. Could you tell me why you want to refund your money? I bought something at the shop with my debit card on the 2nd of December 2019 and it debited my money twice. Okay. What is your name Madam? My name is Amy golf golf. Okay, I have Amy Gough Gough and where did you use the debit card to buy something? I bought a bag at Mega bangna it costs $800 were so sorry. In this case. We could not refund your money back. You have to talk with the store that you bought and they will manage. Judge this oh, really? I do not even know. Yes madam. We're so sorry, but we could not fix this problem the store handles about this. Okay, nevermind. Thanks. Would you like another service? That is all thanks. Thank you for using our service. Have a good day.",
 'values': {'start': [0.0,
   0.5,
   0.6,
   0.8,
   1.1,
   1.6,
   2.0,
   2.1,
   2.3,
   2.7,
   3.2,
   3.4,


In [6]:
text = data['transcript']
text

"Hello, you have called virtual bank. This is Sarah speaking. How may I help you? Hey sir, I would to refund my money back. Could you tell me why you want to refund your money? I bought something at the shop with my debit card on the 2nd of December 2019 and it debited my money twice. Okay. What is your name Madam? My name is Amy golf golf. Okay, I have Amy Gough Gough and where did you use the debit card to buy something? I bought a bag at Mega bangna it costs $800 were so sorry. In this case. We could not refund your money back. You have to talk with the store that you bought and they will manage. Judge this oh, really? I do not even know. Yes madam. We're so sorry, but we could not fix this problem the store handles about this. Okay, nevermind. Thanks. Would you like another service? That is all thanks. Thank you for using our service. Have a good day."

***

## Stanford NER Tagger

The Stanford NER Tagger comes with 3 models:

* 3-class model for recognizing locations, persons, and organizations
* 4-class model for recognizing locations, persons, organizations, and miscellaneous entities
* 7-class model for recognizing locations, persons, organizations, times, money, percents, and dates

In this project, we use the 7-class model.

In [7]:
def stanford_tagger(document):
    """Tag *document* with the Stanford 7-class NER model.

    The text is split with ``word_tokenize`` and each token is classified by
    ``StanfordNERTagger``. Tokens labelled ``'O'`` are dropped.

    Args:
        document: Raw text to tag.

    Returns:
        A ``(lst_ps, st_df)`` tuple. ``lst_ps`` is the list of tokens labelled
        ``PERSON``; ``st_df`` is a DataFrame with columns ``word`` and ``ne``
        containing every token whose label is not ``'O'``.
    """
    # The tagger runs the Java jar, so it needs to know where Java lives.
    java_path = "C:/Program Files/Java/jdk-15.0.1/bin/java.exe"
    os.environ['JAVAHOME'] = java_path

    jar = 'D:/Program/stanford-ner-4.0.0/stanford-ner.jar'
    model = 'D:/Program/stanford-ner-4.0.0/classifiers/english.muc.7class.distsim.crf.ser'  # 7 classes
    st = StanfordNERTagger(model, jar, encoding='utf-8')

    word_token = word_tokenize(document)
    classified_text = st.tag(word_token)

    # Walk over *every* (token, label) pair; the previous index loop stopped
    # one element early and never looked at the final token.
    entities = [
        (str(word), str(label))
        for word, label in classified_text
        if str(label) != 'O'
    ]
    lst_word = [word for word, _ in entities]
    lst_ne = [label for _, label in entities]
    st_df = pd.DataFrame({'word': lst_word, 'ne': lst_ne})

    # Only PERSON tokens belong in the person list; previously every entity
    # token (dates, money, ...) was returned here.
    lst_ps = [word for word, label in entities if label == 'PERSON']
    return lst_ps, st_df

ps_st, st_df = stanford_tagger(text)

In [8]:
st_df

Unnamed: 0,word,ne
0,Sarah,PERSON
1,December,DATE
2,2019,DATE
3,Amy,PERSON
4,Gough,PERSON
5,Gough,PERSON
6,$,MONEY
7,800,MONEY


***

## NLTK

**NE Type and Examples**
* ORGANIZATION - Georgia-Pacific Corp., WHO
* PERSON - Eddy Bonte, President Obama
* LOCATION - Murray River, Mount Everest
* DATE - June, 2008-06-29
* TIME - two fifty a m, 1:30 p.m.
* MONEY - 175 million Canadian Dollars, GBP 10.40
* PERCENT - twenty pct, 18.75 %
* FACILITY - Washington Monument, Stonehenge
* GPE - South East Asia, Midlothian

In [9]:
def nltk_tagger(document):
    """Run NLTK's ``ne_chunk`` over *document* and collect the results.

    Args:
        document: Raw text to tag.

    Returns:
        A ``(lst_ps, nltk_df)`` tuple. ``lst_ps`` holds the first word of each
        chunk labelled ``PERSON``. ``nltk_df`` is a DataFrame with columns
        ``word`` and ``ne`` containing the first word and label of every other
        labelled chunk, plus every unchunked token whose POS tag is ``CD``
        (stored with ``CD`` in the ``ne`` column).
    """
    person_names = []
    words = []
    labels = []

    pos_tagged = pos_tag(word_tokenize(document))
    tree = ne_chunk(pos_tagged, binary=False)  # binary=False keeps the detailed NE type

    for node in tree:
        if hasattr(node, 'label'):
            # Labelled subtree: only its first (word, tag) leaf is recorded.
            first_word = node[0][0]
            label = node.label()
            if label == 'PERSON':
                person_names.append(first_word)
            else:
                words.append(first_word)
                labels.append(label)
        else:
            # Plain (word, tag) leaf: keep it only when the POS tag is CD.
            pos = node[1]
            if str(pos) == 'CD':
                words.append(node[0])
                labels.append(pos)

    nltk_df = pd.DataFrame({'word': words, 'ne': labels})
    return person_names, nltk_df

ps_nltk, nltk_df = nltk_tagger(text)

In [10]:
nltk_df

Unnamed: 0,word,ne
0,Hello,GPE
1,2nd,CD
2,2019,CD
3,Mega,ORGANIZATION
4,800,CD


***

## spaCy

In [11]:
nlp = en_core_web_sm.load()

In [12]:
doc = nlp(text)

**IOB Scheme**
* "I" : Token is inside an entity.
* "O" : Token is outside an entity.
* "B" : Token begins an entity.
* ""  : No entity tag is set (missing value).

In [13]:
# IOB Scheme
pprint([(X.text, X.ent_type_, X.ent_iob_) for X in doc])

[('Hello', '', 'O'),
 (',', '', 'O'),
 ('you', '', 'O'),
 ('have', '', 'O'),
 ('called', '', 'O'),
 ('virtual', '', 'O'),
 ('bank', '', 'O'),
 ('.', '', 'O'),
 ('This', '', 'O'),
 ('is', '', 'O'),
 ('Sarah', 'PERSON', 'B'),
 ('speaking', '', 'O'),
 ('.', '', 'O'),
 ('How', '', 'O'),
 ('may', '', 'O'),
 ('I', '', 'O'),
 ('help', '', 'O'),
 ('you', '', 'O'),
 ('?', '', 'O'),
 ('Hey', '', 'O'),
 ('sir', '', 'O'),
 (',', '', 'O'),
 ('I', '', 'O'),
 ('would', '', 'O'),
 ('to', '', 'O'),
 ('refund', '', 'O'),
 ('my', '', 'O'),
 ('money', '', 'O'),
 ('back', '', 'O'),
 ('.', '', 'O'),
 ('Could', '', 'O'),
 ('you', '', 'O'),
 ('tell', '', 'O'),
 ('me', '', 'O'),
 ('why', '', 'O'),
 ('you', '', 'O'),
 ('want', '', 'O'),
 ('to', '', 'O'),
 ('refund', '', 'O'),
 ('your', '', 'O'),
 ('money', '', 'O'),
 ('?', '', 'O'),
 ('I', '', 'O'),
 ('bought', '', 'O'),
 ('something', '', 'O'),
 ('at', '', 'O'),
 ('the', '', 'O'),
 ('shop', '', 'O'),
 ('with', '', 'O'),
 ('my', '', 'O'),
 ('debit', '', 'O'),
 

In [14]:
test = ("Hello, you've called virtual bank. This is Marina speaking. How may I help you? Hello, Marina. I forgot my banking application password. What should I do? You have to press on forgot password button on the application screen and we'll send you a security number to your phone number. Oh, that's bad. I didn't use my old phone number anymore, but I haven't changed my details in the bank information. That's fine. Could I have your account number? That's one one one five four four two two two? Okay. And what's your identification number? Okay. My identification number is 1 1 0 2 5 6 9 8 5 4 2 9800 11-2 Okay, I have one one zero two five six nine eight. Five four two 9800 11-2. Yes. That's right. And what's your name ma'am? My name is Amanda Nelson. Okay. Mrs. Nelson. What's your phone number ma'am? My phone number is seven. Seven one nine eight five five five eight seven. I just replaced your phone number, and you can reset your password in the application by press on the forgot password, and we'll send you a security number to your phone. Thank you. Thank you for using our service Miss Nelson. Have a good day.")

In [16]:
test = nlp(test)

In [18]:
# IOB Scheme
pprint([(X.text, X.ent_type_, X.ent_iob_) for X in test])

[('Hello', '', 'O'),
 (',', '', 'O'),
 ('you', '', 'O'),
 ("'ve", '', 'O'),
 ('called', '', 'O'),
 ('virtual', '', 'O'),
 ('bank', '', 'O'),
 ('.', '', 'O'),
 ('This', '', 'O'),
 ('is', '', 'O'),
 ('Marina', 'LOC', 'B'),
 ('speaking', '', 'O'),
 ('.', '', 'O'),
 ('How', '', 'O'),
 ('may', '', 'O'),
 ('I', '', 'O'),
 ('help', '', 'O'),
 ('you', '', 'O'),
 ('?', '', 'O'),
 ('Hello', '', 'O'),
 (',', '', 'O'),
 ('Marina', 'LOC', 'B'),
 ('.', '', 'O'),
 ('I', '', 'O'),
 ('forgot', '', 'O'),
 ('my', '', 'O'),
 ('banking', '', 'O'),
 ('application', '', 'O'),
 ('password', '', 'O'),
 ('.', '', 'O'),
 ('What', '', 'O'),
 ('should', '', 'O'),
 ('I', '', 'O'),
 ('do', '', 'O'),
 ('?', '', 'O'),
 ('You', '', 'O'),
 ('have', '', 'O'),
 ('to', '', 'O'),
 ('press', '', 'O'),
 ('on', '', 'O'),
 ('forgot', '', 'O'),
 ('password', '', 'O'),
 ('button', '', 'O'),
 ('on', '', 'O'),
 ('the', '', 'O'),
 ('application', '', 'O'),
 ('screen', '', 'O'),
 ('and', '', 'O'),
 ('we', '', 'O'),
 ("'ll", '', 'O'),

In [47]:
# Build the spaCy entity table from `doc`: one row per token that carries an
# entity type, except DATE tokens whose text is 'a', 'good' or 'day'.
new_words, new_tokens = [], []

for token in doc:
    label = token.ent_type_
    if label == '':
        continue
    if label == 'DATE' and token.text in ('a', 'good', 'day'):
        continue
    new_words.append(token.text)
    new_tokens.append(label)

spc_df = pd.DataFrame({'word': new_words, 'ne': new_tokens})

In [48]:
spc_df

Unnamed: 0,word,ne
0,Sarah,PERSON
1,the,DATE
2,2nd,DATE
3,of,DATE
4,December,DATE
5,2019,DATE
6,Amy,PERSON
7,Amy,PERSON
8,Gough,PERSON
9,Gough,PERSON


In [45]:
# Write the spc_df table to a CSV file on disk.
spc_df.to_csv('D:/DSBA/Project/Final-Project-2/data/Text files/ner-tagger.csv')